diff --git a/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json b/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json
index 1b99a44..364bfee 100644
--- a/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json
+++ b/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7809527709426081
+ "macro_f1": 0.7769528027441275
},
"amazon_review_classification": {
- "macro_f1": 0.5155899232320224
+ "macro_f1": 0.5146406875677701
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8844781754440035
},
"massive_intent_classification": {
- "macro_f1": 0.7879373479249787
+ "macro_f1": 0.7872353730798753
},
"massive_scenario_classification": {
- "macro_f1": 0.8662625888023707
+ "macro_f1": 0.8639715373498098
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8350488266987821
+ },
+ "wrime_classification": {
+ "macro_f1": 0.3815230965003785
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9095168116460639
+ "ndcg@10": 0.909518320556229
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.5981293078380808
+ },
+ "jqara": {
+ "ndcg@10": 0.3719557553111225
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.6789908587925922
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8281088898171538
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.4085978545476503
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.42314124780036416
+ "ndcg@10": 0.43879890119990833
},
"jaqket": {
- "ndcg@10": 0.36199154051747723
+ "ndcg@10": 0.3555985699236658
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.1997740482697841
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.16521386136598404
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.12060735418211223
},
"mrtydi": {
- "ndcg@10": 0.07810683176415421
+ "ndcg@10": 0.07107405961190999
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.5430415601583998
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.6077212544951452
+ "ndcg@10": 0.5585881454407594
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.6433890489201118
+ "ndcg@10": 0.629620778788499
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.39317174536190913
+ "ndcg@10": 0.3517328767423871
}
},
"STS": {
"jsick": {
- "spearman": 0.754165277432144
+ "spearman": 0.7775668305928584
},
"jsts": {
- "spearman": 0.7558202366183716
+ "spearman": 0.7563460117163054
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.4966545453348478
+ "v_measure_score": 0.4601335671191492
},
"mewsc16": {
- "v_measure_score": 0.3877356318022785
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6237623762376237
+ "v_measure_score": 0.39000718680465274
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.3456006554316726
}
}
}
\ No newline at end of file
diff --git a/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json b/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json
index ea227c2..20150c2 100644
--- a/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json
+++ b/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.776174162517931
+ "macro_f1": 0.7779156199278396
},
"amazon_review_classification": {
- "macro_f1": 0.5085781180553806
+ "macro_f1": 0.5111451768867725
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8782111274457993
},
"massive_intent_classification": {
- "macro_f1": 0.7718541530739129
+ "macro_f1": 0.7796973463634825
},
"massive_scenario_classification": {
- "macro_f1": 0.8592571786794985
+ "macro_f1": 0.8634142669499835
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8506408877596591
+ },
+ "wrime_classification": {
+ "macro_f1": 0.3656175961601361
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9100551950168166
+ "ndcg@10": 0.9092446252246911
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.605113846464576
+ },
+ "jqara": {
+ "ndcg@10": 0.36840730960684165
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.693114284522583
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8530771666734125
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.42431895793525753
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.42368135774043536
+ "ndcg@10": 0.43601956332213093
},
"jaqket": {
- "ndcg@10": 0.37721850397542034
+ "ndcg@10": 0.37354035206874886
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.2518443007449429
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.14756204576714857
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.16862391555076126
},
"mrtydi": {
- "ndcg@10": 0.07878085186566607
+ "ndcg@10": 0.07770347901718931
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.5689006657309228
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.636999375405723
+ "ndcg@10": 0.5911474254499767
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.6413498649875696
+ "ndcg@10": 0.618101892252404
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.397250919496823
+ "ndcg@10": 0.3287673013916751
}
},
"STS": {
"jsick": {
- "spearman": 0.7756925231422259
+ "spearman": 0.7893346270810556
},
"jsts": {
- "spearman": 0.7652968548841591
+ "spearman": 0.7657111966582518
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5262387436934941
+ "v_measure_score": 0.4498663842342549
},
"mewsc16": {
- "v_measure_score": 0.37277574537292835
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.623321554770318
+ "v_measure_score": 0.4319848997472401
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.3860004176729398
}
}
}
\ No newline at end of file
diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json
index dbed068..ebc1037 100644
--- a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json
+++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7619809437515043
+ "macro_f1": 0.7430232193667698
},
"amazon_review_classification": {
- "macro_f1": 0.5205592432502059
+ "macro_f1": 0.5196833867285527
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8969457721352727
},
"massive_intent_classification": {
- "macro_f1": 0.7789367871593064
+ "macro_f1": 0.7782504182162112
},
"massive_scenario_classification": {
- "macro_f1": 0.8490320705866646
+ "macro_f1": 0.8459551634050977
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8382321236746973
+ },
+ "wrime_classification": {
+ "macro_f1": 0.3814631725334783
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9065584234991577
+ "ndcg@10": 0.906706098295787
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.581551030502223
+ },
+ "jqara": {
+ "ndcg@10": 0.3666097794082717
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.6908907697836885
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8615323536010276
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.39917758524262303
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.4411487123884245
+ "ndcg@10": 0.4460371569059824
},
"jaqket": {
- "ndcg@10": 0.39613283459361814
+ "ndcg@10": 0.3845053301501902
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.2239147895010841
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.13942471586306499
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.139069576010256
},
"mrtydi": {
- "ndcg@10": 0.08154879873415645
+ "ndcg@10": 0.07299085059942924
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.5835049460335981
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.6276035246534508
+ "ndcg@10": 0.5863133806218087
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.5838785018803183
+ "ndcg@10": 0.5743459511193183
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.3489329387182086
+ "ndcg@10": 0.32465205260710006
}
},
"STS": {
"jsick": {
- "spearman": 0.7463567093877269
+ "spearman": 0.7525289500265361
},
"jsts": {
- "spearman": 0.7468283806971927
+ "spearman": 0.7466329702466956
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.41041888940251137
+ "v_measure_score": 0.45840176801621957
},
"mewsc16": {
- "v_measure_score": 0.45175891401665724
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6236711552090717
+ "v_measure_score": 0.4407932537977668
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.38669286929581886
}
}
}
\ No newline at end of file
diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json
index 9528312..e1c3e9c 100644
--- a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json
+++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7619809437515043
+ "macro_f1": 0.7640029182013914
},
"amazon_review_classification": {
- "macro_f1": 0.5152108946679324
+ "macro_f1": 0.5165133824101508
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8785996540635361
},
"massive_intent_classification": {
- "macro_f1": 0.7895128475562229
+ "macro_f1": 0.7815141648175687
},
"massive_scenario_classification": {
- "macro_f1": 0.865430249169577
+ "macro_f1": 0.8643739735863134
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8179797886754027
+ },
+ "wrime_classification": {
+ "macro_f1": 0.37929751450328747
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9115815294581953
+ "ndcg@10": 0.9116742957456255
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.6540921936468603
+ },
+ "jqara": {
+ "ndcg@10": 0.3839109493881204
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.7018821974047713
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8442037101394532
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.4895140949755706
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.47387768939865055
+ "ndcg@10": 0.48413330907538854
},
"jaqket": {
- "ndcg@10": 0.3956683977353904
+ "ndcg@10": 0.3872950509227257
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.25723625707011927
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.2159968215066114
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.18105368261359917
},
"mrtydi": {
- "ndcg@10": 0.1144234568266308
+ "ndcg@10": 0.11016096912346693
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.5890880676571459
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.6416096544574569
+ "ndcg@10": 0.6005134171957127
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.7023477497744102
+ "ndcg@10": 0.691482229451667
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.4536720868647063
+ "ndcg@10": 0.377200379602747
}
},
"STS": {
"jsick": {
- "spearman": 0.781770693640686
+ "spearman": 0.7914302448138066
},
"jsts": {
- "spearman": 0.7680617109850311
+ "spearman": 0.7677275529386515
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5301620892693397
+ "v_measure_score": 0.4272210847614043
},
"mewsc16": {
- "v_measure_score": 0.4034776723308173
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6238078417520311
+ "v_measure_score": 0.39391604411456593
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.2641681900458691
}
}
}
\ No newline at end of file
diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json
index b36686c..dad1d0c 100644
--- a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json
+++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7725250131648236
+ "macro_f1": 0.7767065011282246
},
"amazon_review_classification": {
- "macro_f1": 0.5341627023771393
+ "macro_f1": 0.5348080733659045
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8928165629175933
},
"massive_intent_classification": {
- "macro_f1": 0.7682863192709365
+ "macro_f1": 0.7678594675802368
},
"massive_scenario_classification": {
- "macro_f1": 0.8639396658321546
+ "macro_f1": 0.8624414954250645
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8376983111767246
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4088843388537483
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9094717381883379
+ "ndcg@10": 0.9093431066849924
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.6144762455614383
+ },
+ "jqara": {
+ "ndcg@10": 0.42466871751866847
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.7065312090166875
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8742363417086798
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.4627911424268102
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.47038430326303626
+ "ndcg@10": 0.4824617060944974
},
"jaqket": {
- "ndcg@10": 0.44101304795602897
+ "ndcg@10": 0.4416882664197474
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.28888654887615833
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.1951539369285861
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.18656064853165188
},
"mrtydi": {
- "ndcg@10": 0.11429128335865787
+ "ndcg@10": 0.11438786651077741
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.439694854198857
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.43434267808785576
+ "ndcg@10": 0.40326645532241284
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.6240651697600803
+ "ndcg@10": 0.6048895627840009
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.3651687833824759
+ "ndcg@10": 0.36508949429446635
}
},
"STS": {
"jsick": {
- "spearman": 0.787528927058734
+ "spearman": 0.7876474308902304
},
"jsts": {
- "spearman": 0.7781413957931619
+ "spearman": 0.7782114794698556
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.48448646364489634
+ "v_measure_score": 0.5129910499369752
},
"mewsc16": {
- "v_measure_score": 0.43168522818790694
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6235418875927891
+ "v_measure_score": 0.46267377071476495
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.3603960521680572
}
}
}
\ No newline at end of file
diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json
index f620d50..cad831e 100644
--- a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json
+++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7635642561809131
+ "macro_f1": 0.7655145272700131
},
"amazon_review_classification": {
- "macro_f1": 0.5275222511867922
+ "macro_f1": 0.5273281594091623
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8821782850442395
},
"massive_intent_classification": {
- "macro_f1": 0.7688060073049678
+ "macro_f1": 0.772169445045981
},
"massive_scenario_classification": {
- "macro_f1": 0.8651446837233107
+ "macro_f1": 0.8625146467158739
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8145447793317748
+ },
+ "wrime_classification": {
+ "macro_f1": 0.40382215327142257
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9129851570116734
+ "ndcg@10": 0.9130235242422614
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.6513884390883999
+ },
+ "jqara": {
+ "ndcg@10": 0.44959095699445484
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.7121442551193732
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8679395106334268
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.5316167737103407
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.5014367709991477
+ "ndcg@10": 0.5120263378587457
},
"jaqket": {
- "ndcg@10": 0.4583812630740073
+ "ndcg@10": 0.45810454318653493
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.30420713299186014
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.260782337674165
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.23652695166828322
},
"mrtydi": {
- "ndcg@10": 0.13003320802922363
+ "ndcg@10": 0.1306190778426387
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.5464834936384055
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.5508587506679636
+ "ndcg@10": 0.5213267121181618
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.7497069192695408
+ "ndcg@10": 0.7412764112062588
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.4524300499843447
+ "ndcg@10": 0.4220927003134505
}
},
"STS": {
"jsick": {
- "spearman": 0.7984403024596518
+ "spearman": 0.7985649981589037
},
"jsts": {
- "spearman": 0.7813685476201204
+ "spearman": 0.7813825399856615
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5319881995988209
+ "v_measure_score": 0.5491083580906443
},
"mewsc16": {
- "v_measure_score": 0.4330807170988368
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6226614895870103
+ "v_measure_score": 0.4267958807672512
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.3178045302473092
}
}
}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-base-v2/summary.json b/docs/results/cl-nagoya/ruri-base-v2/summary.json
new file mode 100644
index 0000000..c090ce8
--- /dev/null
+++ b/docs/results/cl-nagoya/ruri-base-v2/summary.json
@@ -0,0 +1,96 @@
+{
+ "Classification": {
+ "amazon_counterfactual_classification": {
+ "macro_f1": 0.7597182825660609
+ },
+ "amazon_review_classification": {
+ "macro_f1": 0.5554544939941979
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9235657959062215
+ },
+ "massive_intent_classification": {
+ "macro_f1": 0.8092593406289539
+ },
+ "massive_scenario_classification": {
+ "macro_f1": 0.8886710878440421
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8926416828413609
+ },
+ "wrime_classification": {
+ "macro_f1": 0.461674192977988
+ }
+ },
+ "Reranking": {
+ "esci": {
+ "ndcg@10": 0.9317155624145913
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8576025511447865
+ },
+ "jqara": {
+ "ndcg@10": 0.6066458919871698
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.842561072326263
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8846847676615118
+ }
+ },
+ "Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8101096413526069
+ },
+ "jagovfaqs_22k": {
+ "ndcg@10": 0.7590325308586044
+ },
+ "jaqket": {
+ "ndcg@10": 0.5700921243106366
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.4417665675636218
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.6821942595823656
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.3773323411085737
+ },
+ "mrtydi": {
+ "ndcg@10": 0.4088554217076187
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.8805294567802572
+ },
+ "nlp_journal_abs_intro": {
+ "ndcg@10": 0.8973083823806287
+ },
+ "nlp_journal_title_abs": {
+ "ndcg@10": 0.9696059096853805
+ },
+ "nlp_journal_title_intro": {
+ "ndcg@10": 0.789314612552914
+ }
+ },
+ "STS": {
+ "jsick": {
+ "spearman": 0.8262585834114126
+ },
+ "jsts": {
+ "spearman": 0.8343314248100878
+ }
+ },
+ "Clustering": {
+ "livedoor_news": {
+ "v_measure_score": 0.5437561090974637
+ },
+ "mewsc16": {
+ "v_measure_score": 0.5060934807171409
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.3553392136864812
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-base/summary.json b/docs/results/cl-nagoya/ruri-base/summary.json
index a7c7b05..591ccd2 100644
--- a/docs/results/cl-nagoya/ruri-base/summary.json
+++ b/docs/results/cl-nagoya/ruri-base/summary.json
@@ -4,59 +4,93 @@
"macro_f1": 0.7665550732749669
},
"amazon_review_classification": {
- "macro_f1": 0.5575876111411316
+ "macro_f1": 0.5602315794414631
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.916854859845768
},
"massive_intent_classification": {
- "macro_f1": 0.8141210121425055
+ "macro_f1": 0.8122217429688374
},
"massive_scenario_classification": {
- "macro_f1": 0.8848812917656395
+ "macro_f1": 0.8861454528496383
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8773434580133629
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4546702469392619
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9290942178703699
+ "ndcg@10": 0.9291919623555276
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8723926273423869
+ },
+ "jqara": {
+ "ndcg@10": 0.5415330056104515
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.7921821114257664
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8801076117078023
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8247892121220626
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.7455660589538348
+ "ndcg@10": 0.7550451217031677
},
"jaqket": {
- "ndcg@10": 0.5012253145754781
+ "ndcg@10": 0.5023277717264268
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.45371270319906437
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.5488453168704391
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.35421737773497164
},
"mrtydi": {
- "ndcg@10": 0.3545113073009125
+ "ndcg@10": 0.3558845666232437
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.8664858820958761
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.8689204088388403
+ "ndcg@10": 0.8723253192804757
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.9656989703684407
+ "ndcg@10": 0.952690372948545
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.7531306059721564
+ "ndcg@10": 0.7624967518065642
}
},
"STS": {
"jsick": {
- "spearman": 0.8231772134744029
+ "spearman": 0.8232158602892652
},
"jsts": {
- "spearman": 0.8342848039994751
+ "spearman": 0.8343499347567392
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5427223607801758
+ "v_measure_score": 0.5669485444435229
},
"mewsc16": {
- "v_measure_score": 0.5404099864321413
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6237623762376238
+ "v_measure_score": 0.5205022529269108
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.3854934527391879
}
}
}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-large-v2/summary.json b/docs/results/cl-nagoya/ruri-large-v2/summary.json
new file mode 100644
index 0000000..e4a22b7
--- /dev/null
+++ b/docs/results/cl-nagoya/ruri-large-v2/summary.json
@@ -0,0 +1,96 @@
+{
+ "Classification": {
+ "amazon_counterfactual_classification": {
+ "macro_f1": 0.7950890220234579
+ },
+ "amazon_review_classification": {
+ "macro_f1": 0.5708906806011181
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.935661827685557
+ },
+ "massive_intent_classification": {
+ "macro_f1": 0.8087242075730218
+ },
+ "massive_scenario_classification": {
+ "macro_f1": 0.8970775785938794
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8471804883814585
+ },
+ "wrime_classification": {
+ "macro_f1": 0.47233151152826275
+ }
+ },
+ "Reranking": {
+ "esci": {
+ "ndcg@10": 0.9321133927024134
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8529056816630052
+ },
+ "jqara": {
+ "ndcg@10": 0.644692559122629
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.857799148388121
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.9068464851749977
+ }
+ },
+ "Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8048616669652183
+ },
+ "jagovfaqs_22k": {
+ "ndcg@10": 0.7822527313926262
+ },
+ "jaqket": {
+ "ndcg@10": 0.6561070613824674
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.5040548535978852
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.7046000072363299
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.36969618230893564
+ },
+ "mrtydi": {
+ "ndcg@10": 0.4636780745156557
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.9085158509835447
+ },
+ "nlp_journal_abs_intro": {
+ "ndcg@10": 0.9114732359476821
+ },
+ "nlp_journal_title_abs": {
+ "ndcg@10": 0.977434890774318
+ },
+ "nlp_journal_title_intro": {
+ "ndcg@10": 0.8232131912662143
+ }
+ },
+ "STS": {
+ "jsick": {
+ "spearman": 0.8212250726981067
+ },
+ "jsts": {
+ "spearman": 0.8424300570470996
+ }
+ },
+ "Clustering": {
+ "livedoor_news": {
+ "v_measure_score": 0.5562089376369613
+ },
+ "mewsc16": {
+ "v_measure_score": 0.509675337301281
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.4605817648504685
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-large/summary.json b/docs/results/cl-nagoya/ruri-large/summary.json
index e86c46b..2e2cead 100644
--- a/docs/results/cl-nagoya/ruri-large/summary.json
+++ b/docs/results/cl-nagoya/ruri-large/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.8080806321853091
+ "macro_f1": 0.7950391460082398
},
"amazon_review_classification": {
- "macro_f1": 0.5680171450057119
+ "macro_f1": 0.5685184036314727
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9356380708493385
},
"massive_intent_classification": {
- "macro_f1": 0.8255898596881264
+ "macro_f1": 0.8209962603450597
},
"massive_scenario_classification": {
- "macro_f1": 0.8956410349938264
+ "macro_f1": 0.9002551808707712
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.852564312646895
+ },
+ "wrime_classification": {
+ "macro_f1": 0.46447181564392015
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9298524733536755
+ "ndcg@10": 0.9298778327436324
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8661076138203823
+ },
+ "jqara": {
+ "ndcg@10": 0.5958950681984889
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.8022791978749706
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8690504682983363
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8169123630823522
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.7667506664925435
+ "ndcg@10": 0.7763829985024149
},
"jaqket": {
- "ndcg@10": 0.6173871224245404
+ "ndcg@10": 0.617343261611166
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.5106450721691843
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.5547009159538185
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.3476835812045506
},
"mrtydi": {
- "ndcg@10": 0.3803302462897418
+ "ndcg@10": 0.38120908812619875
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.8652992529882778
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.8712459719069233
+ "ndcg@10": 0.8891161860918603
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.9657898747088243
+ "ndcg@10": 0.9617411892426375
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.779665053945222
+ "ndcg@10": 0.7922108957487803
}
},
"STS": {
"jsick": {
- "spearman": 0.8199959693684533
+ "spearman": 0.8199569498182433
},
"jsts": {
- "spearman": 0.8426164139167538
+ "spearman": 0.8426241685487486
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5139491572866559
+ "v_measure_score": 0.5443732953428371
},
"mewsc16": {
- "v_measure_score": 0.5225025331595674
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6228813559322034
+ "v_measure_score": 0.5058998835740889
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.44757212682292163
}
}
}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-small-v2/summary.json b/docs/results/cl-nagoya/ruri-small-v2/summary.json
new file mode 100644
index 0000000..eec64ee
--- /dev/null
+++ b/docs/results/cl-nagoya/ruri-small-v2/summary.json
@@ -0,0 +1,96 @@
+{
+ "Classification": {
+ "amazon_counterfactual_classification": {
+ "macro_f1": 0.7767065011282246
+ },
+ "amazon_review_classification": {
+ "macro_f1": 0.5559888936165459
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8863640825159859
+ },
+ "massive_intent_classification": {
+ "macro_f1": 0.8199647165894474
+ },
+ "massive_scenario_classification": {
+ "macro_f1": 0.8816435555944846
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8156946375922746
+ },
+ "wrime_classification": {
+ "macro_f1": 0.452255956789983
+ }
+ },
+ "Reranking": {
+ "esci": {
+ "ndcg@10": 0.9320364061675573
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8818198634914105
+ },
+ "jqara": {
+ "ndcg@10": 0.5670420631375501
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.8332825788093644
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.9009377977029078
+ }
+ },
+ "Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8303842720270221
+ },
+ "jagovfaqs_22k": {
+ "ndcg@10": 0.7401670430071696
+ },
+ "jaqket": {
+ "ndcg@10": 0.6225429070303006
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.3530718504041533
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.6689773236918534
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.32577528652704146
+ },
+ "mrtydi": {
+ "ndcg@10": 0.42400768916861914
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.9064650891678154
+ },
+ "nlp_journal_abs_intro": {
+ "ndcg@10": 0.9041671364705328
+ },
+ "nlp_journal_title_abs": {
+ "ndcg@10": 0.9729556994161748
+ },
+ "nlp_journal_title_intro": {
+ "ndcg@10": 0.7821156819492701
+ }
+ },
+ "STS": {
+ "jsick": {
+ "spearman": 0.8387675357095226
+ },
+ "jsts": {
+ "spearman": 0.8193470885317312
+ }
+ },
+ "Clustering": {
+ "livedoor_news": {
+ "v_measure_score": 0.5260577746749562
+ },
+ "mewsc16": {
+ "v_measure_score": 0.4947076915300828
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.47820319421479446
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-small/summary.json b/docs/results/cl-nagoya/ruri-small/summary.json
index cb591ea..079db3e 100644
--- a/docs/results/cl-nagoya/ruri-small/summary.json
+++ b/docs/results/cl-nagoya/ruri-small/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7991935990685706
+ "macro_f1": 0.8055421233612723
},
"amazon_review_classification": {
- "macro_f1": 0.556129066893332
+ "macro_f1": 0.5541385299441624
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8885932202820669
},
"massive_intent_classification": {
- "macro_f1": 0.8148895285345188
+ "macro_f1": 0.8108237159349728
},
"massive_scenario_classification": {
- "macro_f1": 0.8787774569382543
+ "macro_f1": 0.8800077744996155
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.839667353042202
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4595261443020403
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9300177985352138
+ "ndcg@10": 0.9301438020851305
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8766726074179287
+ },
+ "jqara": {
+ "ndcg@10": 0.5325863556709908
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.7783787989685144
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8813650067339368
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.825837748200516
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.736494039429321
+ "ndcg@10": 0.740126693753929
},
"jaqket": {
- "ndcg@10": 0.484437639428696
+ "ndcg@10": 0.4844203596195783
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.3723496207549938
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.5222032466588368
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.2898890422890513
},
"mrtydi": {
- "ndcg@10": 0.3342716158897666
+ "ndcg@10": 0.3351374258570715
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.8689213841203763
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.8768878489670099
+ "ndcg@10": 0.8723259697162892
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.9716879343439146
+ "ndcg@10": 0.9619567235021281
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.7608660955794895
+ "ndcg@10": 0.7608782792491423
}
},
"STS": {
"jsick": {
- "spearman": 0.8343927017558587
+ "spearman": 0.8344934497771457
},
"jsts": {
- "spearman": 0.8213297790184827
+ "spearman": 0.8213145808052514
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5096442244018489
+ "v_measure_score": 0.5289736036070719
},
"mewsc16": {
- "v_measure_score": 0.5141045788711239
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6211267605633802
+ "v_measure_score": 0.4936801242208388
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.46507426407220503
}
}
}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-v3-130m/summary.json b/docs/results/cl-nagoya/ruri-v3-130m/summary.json
new file mode 100644
index 0000000..5700f32
--- /dev/null
+++ b/docs/results/cl-nagoya/ruri-v3-130m/summary.json
@@ -0,0 +1,96 @@
+{
+ "Classification": {
+ "amazon_counterfactual_classification": {
+ "macro_f1": 0.7674793827265536
+ },
+ "amazon_review_classification": {
+ "macro_f1": 0.5955994619477079
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9500285886600925
+ },
+ "massive_intent_classification": {
+ "macro_f1": 0.807938642045445
+ },
+ "massive_scenario_classification": {
+ "macro_f1": 0.8790346026671575
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8287806075978352
+ },
+ "wrime_classification": {
+ "macro_f1": 0.46634901067800855
+ }
+ },
+ "Reranking": {
+ "esci": {
+ "ndcg@10": 0.9336981049156847
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8864670177419038
+ },
+ "jqara": {
+ "ndcg@10": 0.663018840039673
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.865876689917921
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.9362058245511219
+ }
+ },
+ "Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8421113535976967
+ },
+ "jagovfaqs_22k": {
+ "ndcg@10": 0.7532393338902414
+ },
+ "jaqket": {
+ "ndcg@10": 0.730979460582779
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.5177034569356731
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.7100959869376436
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.45158335316076936
+ },
+ "mrtydi": {
+ "ndcg@10": 0.4780012151028164
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.995144547086835
+ },
+ "nlp_journal_abs_intro": {
+ "ndcg@10": 0.9887952520028016
+ },
+ "nlp_journal_title_abs": {
+ "ndcg@10": 0.9795152116360624
+ },
+ "nlp_journal_title_intro": {
+ "ndcg@10": 0.9628103840588119
+ }
+ },
+ "STS": {
+ "jsick": {
+ "spearman": 0.7885956280300046
+ },
+ "jsts": {
+ "spearman": 0.8323603869543141
+ }
+ },
+ "Clustering": {
+ "livedoor_news": {
+ "v_measure_score": 0.5436288048604071
+ },
+ "mewsc16": {
+ "v_measure_score": 0.4883532965483729
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.5019988844015973
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-v3-30m/summary.json b/docs/results/cl-nagoya/ruri-v3-30m/summary.json
new file mode 100644
index 0000000..c4e768a
--- /dev/null
+++ b/docs/results/cl-nagoya/ruri-v3-30m/summary.json
@@ -0,0 +1,96 @@
+{
+ "Classification": {
+ "amazon_counterfactual_classification": {
+ "macro_f1": 0.7559571782387728
+ },
+ "amazon_review_classification": {
+ "macro_f1": 0.5570789457429248
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9262839486939813
+ },
+ "massive_intent_classification": {
+ "macro_f1": 0.783074979041957
+ },
+ "massive_scenario_classification": {
+ "macro_f1": 0.8672396605716526
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8140481078951145
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4311261750368354
+ }
+ },
+ "Reranking": {
+ "esci": {
+ "ndcg@10": 0.9305651903486406
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8761294751423317
+ },
+ "jqara": {
+ "ndcg@10": 0.5747490185208084
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.8352458113588647
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.9297421530365237
+ }
+ },
+ "Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.827028266156452
+ },
+ "jagovfaqs_22k": {
+ "ndcg@10": 0.7020872105862214
+ },
+ "jaqket": {
+ "ndcg@10": 0.6244733500896729
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.4304756847175998
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.6498916988979277
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.4577076048703079
+ },
+ "mrtydi": {
+ "ndcg@10": 0.41775750844113785
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.9876046427100846
+ },
+ "nlp_journal_abs_intro": {
+ "ndcg@10": 0.9916030162169887
+ },
+ "nlp_journal_title_abs": {
+ "ndcg@10": 0.9699245797579602
+ },
+ "nlp_journal_title_intro": {
+ "ndcg@10": 0.9534027111106339
+ }
+ },
+ "STS": {
+ "jsick": {
+ "spearman": 0.8161946935797372
+ },
+ "jsts": {
+ "spearman": 0.819463211043541
+ }
+ },
+ "Clustering": {
+ "livedoor_news": {
+ "v_measure_score": 0.5369067977199252
+ },
+ "mewsc16": {
+ "v_measure_score": 0.47961175798341066
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.4804316290090649
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-v3-310m/summary.json b/docs/results/cl-nagoya/ruri-v3-310m/summary.json
new file mode 100644
index 0000000..c27fed8
--- /dev/null
+++ b/docs/results/cl-nagoya/ruri-v3-310m/summary.json
@@ -0,0 +1,96 @@
+{
+ "Classification": {
+ "amazon_counterfactual_classification": {
+ "macro_f1": 0.8009270010529765
+ },
+ "amazon_review_classification": {
+ "macro_f1": 0.6071898527482484
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9530657500380437
+ },
+ "massive_intent_classification": {
+ "macro_f1": 0.8176293812793415
+ },
+ "massive_scenario_classification": {
+ "macro_f1": 0.890051922198645
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8812655271153628
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4852854023445756
+ }
+ },
+ "Reranking": {
+ "esci": {
+ "ndcg@10": 0.9342725351989479
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8845859005757672
+ },
+ "jqara": {
+ "ndcg@10": 0.6893206802955604
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.8500853284469898
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.9335769070370818
+ }
+ },
+ "Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8406411130636801
+ },
+ "jagovfaqs_22k": {
+ "ndcg@10": 0.7648595155366429
+ },
+ "jaqket": {
+ "ndcg@10": 0.7186721885111346
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.5225348075920366
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.677145342243983
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.43425275955863796
+ },
+ "mrtydi": {
+ "ndcg@10": 0.47064490316120666
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.9958682142366949
+ },
+ "nlp_journal_abs_intro": {
+ "ndcg@10": 0.9935172926595653
+ },
+ "nlp_journal_title_abs": {
+ "ndcg@10": 0.9790717306095701
+ },
+ "nlp_journal_title_intro": {
+ "ndcg@10": 0.9658294271714906
+ }
+ },
+ "STS": {
+ "jsick": {
+ "spearman": 0.7886332339318622
+ },
+ "jsts": {
+ "spearman": 0.8430847366018317
+ }
+ },
+ "Clustering": {
+ "livedoor_news": {
+ "v_measure_score": 0.5855988614657296
+ },
+ "mewsc16": {
+ "v_measure_score": 0.4860478393120035
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.4440626045366051
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-v3-70m/summary.json b/docs/results/cl-nagoya/ruri-v3-70m/summary.json
new file mode 100644
index 0000000..3a2c52d
--- /dev/null
+++ b/docs/results/cl-nagoya/ruri-v3-70m/summary.json
@@ -0,0 +1,96 @@
+{
+ "Classification": {
+ "amazon_counterfactual_classification": {
+ "macro_f1": 0.8180877928218353
+ },
+ "amazon_review_classification": {
+ "macro_f1": 0.5798379850008339
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9339140455312027
+ },
+ "massive_intent_classification": {
+ "macro_f1": 0.7891754112354649
+ },
+ "massive_scenario_classification": {
+ "macro_f1": 0.8782518076402043
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.7686616284901401
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4437562280187194
+ }
+ },
+ "Reranking": {
+ "esci": {
+ "ndcg@10": 0.9320237969329785
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8748197118530385
+ },
+ "jqara": {
+ "ndcg@10": 0.6309432249818713
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.8503057292439823
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.9225778620264797
+ }
+ },
+ "Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8275893500639571
+ },
+ "jagovfaqs_22k": {
+ "ndcg@10": 0.7327144021448485
+ },
+ "jaqket": {
+ "ndcg@10": 0.6768047159335538
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.4626106409683068
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.6797764462851262
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.43554376517918675
+ },
+ "mrtydi": {
+ "ndcg@10": 0.4499999994407917
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.984966699117648
+ },
+ "nlp_journal_abs_intro": {
+ "ndcg@10": 0.9868218521221748
+ },
+ "nlp_journal_title_abs": {
+ "ndcg@10": 0.9706955197203543
+ },
+ "nlp_journal_title_intro": {
+ "ndcg@10": 0.9573354583951488
+ }
+ },
+ "STS": {
+ "jsick": {
+ "spearman": 0.7909930894957667
+ },
+ "jsts": {
+ "spearman": 0.828242284804404
+ }
+ },
+ "Clustering": {
+ "livedoor_news": {
+ "v_measure_score": 0.5492094636693866
+ },
+ "mewsc16": {
+ "v_measure_score": 0.47739615416643866
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.4719940146272088
+ }
+ }
+}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json b/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json
index 42cc5ff..45ec65b 100644
--- a/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json
+++ b/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7234436301724776
+ "macro_f1": 0.7192545517004465
},
"amazon_review_classification": {
- "macro_f1": 0.5441445333270086
+ "macro_f1": 0.5454422812215437
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9100588500656168
},
"massive_intent_classification": {
- "macro_f1": 0.7951973953020242
+ "macro_f1": 0.8011172170046241
},
"massive_scenario_classification": {
- "macro_f1": 0.8760200177186923
+ "macro_f1": 0.8762609424720998
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8191722798191963
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4188203301151871
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9183455876236017
+ "ndcg@10": 0.9184207070049463
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.6426611140199804
+ },
+ "jqara": {
+ "ndcg@10": 0.3748362133870952
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.7087840971938433
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8734013475096433
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.5331630522529377
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.5161990612242935
+ "ndcg@10": 0.5202480516932524
},
"jaqket": {
- "ndcg@10": 0.5024513438428565
+ "ndcg@10": 0.5013089667314551
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.3288294149496304
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.20681341934572967
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.24700329716018354
},
"mrtydi": {
- "ndcg@10": 0.13976323269046823
+ "ndcg@10": 0.141360680613414
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.6909104560170936
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.6807886421530585
+ "ndcg@10": 0.6619434888289687
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.6570889175649209
+ "ndcg@10": 0.6484407439307039
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.48219159577174137
+ "ndcg@10": 0.4696725603511326
}
},
"STS": {
"jsick": {
- "spearman": 0.8282816229512862
+ "spearman": 0.8283659349049672
},
"jsts": {
- "spearman": 0.8127259236647225
+ "spearman": 0.8126484380435667
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5266774168531417
+ "v_measure_score": 0.5248555489302708
},
"mewsc16": {
- "v_measure_score": 0.5091016872016825
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6256665481692143
+ "v_measure_score": 0.5339141639252604
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.49207894013578146
}
}
}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json b/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json
index a2d8924..3d0bb71 100644
--- a/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json
+++ b/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7321444865928852
+ "macro_f1": 0.7260568612881779
},
"amazon_review_classification": {
- "macro_f1": 0.5475800661400465
+ "macro_f1": 0.5455832826466495
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8942024454984163
},
"massive_intent_classification": {
- "macro_f1": 0.7922802742146243
+ "macro_f1": 0.792273118014186
},
"massive_scenario_classification": {
- "macro_f1": 0.8772172454209797
+ "macro_f1": 0.8770657195206764
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8042709569831964
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4525777476393026
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9148471751378899
+ "ndcg@10": 0.9149640515619839
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.5614550878114778
+ },
+ "jqara": {
+ "ndcg@10": 0.38302855218604437
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.7126433285790728
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8659821811381412
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.4370774500135088
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.4683673504170269
+ "ndcg@10": 0.47421467281855384
},
"jaqket": {
- "ndcg@10": 0.39878189118804513
+ "ndcg@10": 0.4004385277719307
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.376774984849213
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.18125969161337505
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.23480755788261093
},
"mrtydi": {
- "ndcg@10": 0.11834919561027905
+ "ndcg@10": 0.1188048690188868
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.6407825080386719
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.634254459552888
+ "ndcg@10": 0.6295135121177772
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.37927566884615427
+ "ndcg@10": 0.36949537039923136
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.25787534957423713
+ "ndcg@10": 0.2490316613470849
}
},
"STS": {
"jsick": {
- "spearman": 0.837959537101532
+ "spearman": 0.8377753687267541
},
"jsts": {
- "spearman": 0.825691902117111
+ "spearman": 0.8256006176068381
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5074967876488787
+ "v_measure_score": 0.5337915256082275
},
"mewsc16": {
- "v_measure_score": 0.503782014677764
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6250885896527285
+ "v_measure_score": 0.5111565926265328
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.45736658859438273
}
}
}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json b/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json
index 3863c9e..bae07a1 100644
--- a/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json
+++ b/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7330185800774036
+ "macro_f1": 0.7364790582283407
},
"amazon_review_classification": {
- "macro_f1": 0.5392887528271114
+ "macro_f1": 0.5413541626836352
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8986588956343088
},
"massive_intent_classification": {
- "macro_f1": 0.7907120296283751
+ "macro_f1": 0.7767897385750657
},
"massive_scenario_classification": {
- "macro_f1": 0.8597097942715117
+ "macro_f1": 0.8610390686035142
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8413013579577491
+ },
+ "wrime_classification": {
+ "macro_f1": 0.41309966752995253
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9115668272308735
+ "ndcg@10": 0.9117818311636607
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.5154239181007129
+ },
+ "jqara": {
+ "ndcg@10": 0.3218696921394324
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.6995597032253587
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8612256071032377
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.35106925427500363
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.46003459081522513
+ "ndcg@10": 0.4673719618749888
},
"jaqket": {
- "ndcg@10": 0.3945725593125862
+ "ndcg@10": 0.3951670829019162
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.299231152726057
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.10934136213023636
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.15981611825721914
},
"mrtydi": {
- "ndcg@10": 0.055507775092798486
+ "ndcg@10": 0.055133639963568334
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.582165240647806
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.6025847751308843
+ "ndcg@10": 0.5841104498413489
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.5562839869857912
+ "ndcg@10": 0.55577879846708
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.3449181162324482
+ "ndcg@10": 0.3284050897756761
}
},
"STS": {
"jsick": {
- "spearman": 0.7849379492955117
+ "spearman": 0.7852600594448598
},
"jsts": {
- "spearman": 0.7894946592483818
+ "spearman": 0.7894496424482047
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5223347838445698
+ "v_measure_score": 0.4936084943071576
},
"mewsc16": {
- "v_measure_score": 0.37310458219601117
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.624424778761062
+ "v_measure_score": 0.3743327976467685
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.3592578922316612
}
}
}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json b/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json
index d37618a..09525c9 100644
--- a/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json
+++ b/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.767905114979583
+ "macro_f1": 0.7640316468319925
},
"amazon_review_classification": {
- "macro_f1": 0.5537089641846143
+ "macro_f1": 0.5504736753163985
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9057099704855596
},
"massive_intent_classification": {
- "macro_f1": 0.7912698845073401
+ "macro_f1": 0.792495956569193
},
"massive_scenario_classification": {
- "macro_f1": 0.8736185210672394
+ "macro_f1": 0.8749858164207054
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8288719236604842
+ },
+ "wrime_classification": {
+ "macro_f1": 0.44326523397693174
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9095494729022622
+ "ndcg@10": 0.9094836571513687
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.5417192948613557
+ },
+ "jqara": {
+ "ndcg@10": 0.3877939946491903
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.7001887861606321
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8303617273610736
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.37613574135010835
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.4509073581555124
+ "ndcg@10": 0.46564010373437337
},
"jaqket": {
- "ndcg@10": 0.34595043675331943
+ "ndcg@10": 0.3452888488420233
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.3058130510308383
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.10326154138228141
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.12550430031143336
},
"mrtydi": {
- "ndcg@10": 0.05750859876901772
+ "ndcg@10": 0.057502989435967655
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.504469050615059
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.550742021417855
+ "ndcg@10": 0.5069650402920987
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.6307172007359215
+ "ndcg@10": 0.6043158227609278
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.39612451822677164
+ "ndcg@10": 0.34323430832579677
}
},
"STS": {
"jsick": {
- "spearman": 0.8014979086154339
+ "spearman": 0.8013849170804103
},
"jsts": {
- "spearman": 0.8097685749017456
+ "spearman": 0.809789575264219
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5090447587797094
+ "v_measure_score": 0.5147732775967515
},
"mewsc16": {
- "v_measure_score": 0.4591920015613856
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6248671625929861
+ "v_measure_score": 0.44443267597570074
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.34646662604886447
}
}
}
\ No newline at end of file
diff --git a/docs/results/colorfulscoop/sbert-base-ja/summary.json b/docs/results/colorfulscoop/sbert-base-ja/summary.json
index 2a08044..0f2bf84 100644
--- a/docs/results/colorfulscoop/sbert-base-ja/summary.json
+++ b/docs/results/colorfulscoop/sbert-base-ja/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7221023294352484
+ "macro_f1": 0.7080315613053877
},
"amazon_review_classification": {
- "macro_f1": 0.47952384496155054
+ "macro_f1": 0.4779713813897666
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8350239953633378
},
"massive_intent_classification": {
- "macro_f1": 0.725195343788811
+ "macro_f1": 0.7288673932703351
},
"massive_scenario_classification": {
- "macro_f1": 0.836177960542408
+ "macro_f1": 0.8370655127879382
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8262660922438109
+ },
+ "wrime_classification": {
+ "macro_f1": 0.35057897749310646
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.8997301146575819
+ "ndcg@10": 0.8996866702578056
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.37147215136686634
+ },
+ "jqara": {
+ "ndcg@10": 0.2220517076242275
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.6502702968219343
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8255483571039144
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.192984468642645
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.21501915127957166
+ "ndcg@10": 0.21704292684612675
},
"jaqket": {
- "ndcg@10": 0.13161989528541293
+ "ndcg@10": 0.13139887002144995
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.19067862146114167
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.018598782450328283
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.06972936265190934
},
"mrtydi": {
- "ndcg@10": 0.00436010196904899
+ "ndcg@10": 0.004126228941345733
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.29023294982669573
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.2878020264605714
+ "ndcg@10": 0.2580237968832312
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.22397059858982324
+ "ndcg@10": 0.21071404885072903
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.12815871897103842
+ "ndcg@10": 0.11573741610386916
}
},
"STS": {
"jsick": {
- "spearman": 0.6659298300713198
+ "spearman": 0.6656074999372202
},
"jsts": {
- "spearman": 0.7423952309826243
+ "spearman": 0.7425444938991701
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.4298579019834722
+ "v_measure_score": 0.4059869097583984
},
"mewsc16": {
- "v_measure_score": 0.46641671645082333
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6231013776050865
+ "v_measure_score": 0.46242491131769853
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.3035702180528845
}
}
}
\ No newline at end of file
diff --git a/docs/results/intfloat/multilingual-e5-base/summary.json b/docs/results/intfloat/multilingual-e5-base/summary.json
index 96f9640..4d84be2 100644
--- a/docs/results/intfloat/multilingual-e5-base/summary.json
+++ b/docs/results/intfloat/multilingual-e5-base/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.6367079139150691
+ "macro_f1": 0.6428957534047911
},
"amazon_review_classification": {
- "macro_f1": 0.5424265794470897
+ "macro_f1": 0.5417258327796466
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9231910434886872
},
"massive_intent_classification": {
- "macro_f1": 0.7277503514873049
+ "macro_f1": 0.7318717264077053
},
"massive_scenario_classification": {
- "macro_f1": 0.8652828949015864
+ "macro_f1": 0.8677940980663801
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.785022714268383
+ },
+ "wrime_classification": {
+ "macro_f1": 0.3865061394465788
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9285060467194839
+ "ndcg@10": 0.9290148108090969
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8865491934939191
+ },
+ "jqara": {
+ "ndcg@10": 0.4761308479065645
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.8196779545649944
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8614612823139557
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8431602298737804
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.6534478396845428
+ "ndcg@10": 0.687214041967885
},
"jaqket": {
- "ndcg@10": 0.5067444792013236
+ "ndcg@10": 0.5169392915456349
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.34676383987252357
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.6449511893902589
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.2573147838464383
},
"mrtydi": {
- "ndcg@10": 0.3837652120001251
+ "ndcg@10": 0.42298287793585587
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.8355946539433561
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.8709767034225332
+ "ndcg@10": 0.8447862631398672
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.9473129303429082
+ "ndcg@10": 0.9461907998491789
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.7304538728893641
+ "ndcg@10": 0.7469571396756213
}
},
"STS": {
"jsick": {
- "spearman": 0.8128058660848744
+ "spearman": 0.8125544166626103
},
"jsts": {
- "spearman": 0.7839196475937381
+ "spearman": 0.7965480195299134
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5502694126615243
+ "v_measure_score": 0.5379041349111564
},
"mewsc16": {
- "v_measure_score": 0.41494514000218946
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6226482073127441
+ "v_measure_score": 0.4943772106331262
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.4713134178805946
}
}
}
\ No newline at end of file
diff --git a/docs/results/intfloat/multilingual-e5-large/summary.json b/docs/results/intfloat/multilingual-e5-large/summary.json
index a28c470..40752a5 100644
--- a/docs/results/intfloat/multilingual-e5-large/summary.json
+++ b/docs/results/intfloat/multilingual-e5-large/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.706580687830688
+ "macro_f1": 0.6969861236021963
},
"amazon_review_classification": {
- "macro_f1": 0.5653992303516462
+ "macro_f1": 0.5763612743026115
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9554866923455646
},
"massive_intent_classification": {
- "macro_f1": 0.7577710251429624
+ "macro_f1": 0.7401244088033258
},
"massive_scenario_classification": {
- "macro_f1": 0.8859090262583831
+ "macro_f1": 0.887053685338159
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.7811476853348774
+ },
+ "wrime_classification": {
+ "macro_f1": 0.42377599926222737
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9296254722183955
+ "ndcg@10": 0.9330712866652149
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.9036816685131848
+ },
+ "jqara": {
+ "ndcg@10": 0.561374764136422
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.8631195198401651
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8891328806594833
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8641271530674604
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.7030214336558751
+ "ndcg@10": 0.7297746711291291
},
"jaqket": {
- "ndcg@10": 0.5878065301444064
+ "ndcg@10": 0.5967326588135612
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.3958992445664435
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.7095604570396511
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.2984972238105224
},
"mrtydi": {
- "ndcg@10": 0.4363167873386172
+ "ndcg@10": 0.4781603349494696
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.8326468852967057
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.8600225120389309
+ "ndcg@10": 0.8571088737195884
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.9469712765040588
+ "ndcg@10": 0.952870249874937
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.7248023877969718
+ "ndcg@10": 0.7257268520360993
}
},
"STS": {
"jsick": {
- "spearman": 0.7840335060728089
+ "spearman": 0.7985423882395024
},
"jsts": {
- "spearman": 0.8098724997856234
+ "spearman": 0.8186303902222064
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5713023706914878
+ "v_measure_score": 0.5157643001398088
},
"mewsc16": {
- "v_measure_score": 0.4534484706354193
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.621496984746364
+ "v_measure_score": 0.46806674695304834
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.5334765362912619
}
}
}
\ No newline at end of file
diff --git a/docs/results/intfloat/multilingual-e5-small/summary.json b/docs/results/intfloat/multilingual-e5-small/summary.json
index 99a4423..af62c84 100644
--- a/docs/results/intfloat/multilingual-e5-small/summary.json
+++ b/docs/results/intfloat/multilingual-e5-small/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.6214130966524566
+ "macro_f1": 0.5866005078388893
},
"amazon_review_classification": {
- "macro_f1": 0.5127428912860463
+ "macro_f1": 0.5120598395740691
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8773239262941632
},
"massive_intent_classification": {
- "macro_f1": 0.7085230519111091
+ "macro_f1": 0.7134377059258787
},
"massive_scenario_classification": {
- "macro_f1": 0.8622036829599259
+ "macro_f1": 0.8676947906742417
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8177503141758454
+ },
+ "wrime_classification": {
+ "macro_f1": 0.36913347435432137
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9303349187158247
+ "ndcg@10": 0.9298402731760124
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8998812594907971
+ },
+ "jqara": {
+ "ndcg@10": 0.49280220404951935
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.8178461260193638
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.864145360860429
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8558160940470637
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.6411252958220891
+ "ndcg@10": 0.6568760244912849
},
"jaqket": {
- "ndcg@10": 0.49966509556428645
+ "ndcg@10": 0.5157123960708363
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.3153737960263929
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.6323300168472976
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.2590832302769219
},
"mrtydi": {
- "ndcg@10": 0.36054822913647616
+ "ndcg@10": 0.4236692119753354
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.8396508926780583
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.8520749151982298
+ "ndcg@10": 0.8409842458346825
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.9526123412781002
+ "ndcg@10": 0.9447219194706624
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.729906931983999
+ "ndcg@10": 0.7455737280382885
}
},
"STS": {
"jsick": {
- "spearman": 0.8150271836013705
+ "spearman": 0.8199946308873799
},
"jsts": {
- "spearman": 0.786450077409501
+ "spearman": 0.7892106647109823
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5470075389200084
+ "v_measure_score": 0.5904685845799247
},
"mewsc16": {
- "v_measure_score": 0.391226933590049
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6219382321618744
+ "v_measure_score": 0.5233814767010047
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.43592128019411325
}
}
}
\ No newline at end of file
diff --git a/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json b/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json
index 6b7309a..f9fbe6f 100644
--- a/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json
+++ b/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7994675369288904
+ "macro_f1": 0.7972419438068292
},
"amazon_review_classification": {
- "macro_f1": 0.5748206591211895
+ "macro_f1": 0.575066739799988
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9207590996896718
},
"massive_intent_classification": {
- "macro_f1": 0.8025949222725076
+ "macro_f1": 0.8015558847211773
},
"massive_scenario_classification": {
- "macro_f1": 0.8875250742566655
+ "macro_f1": 0.8878291337617034
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.7731122315942124
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4573111522822367
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9156331205981866
+ "ndcg@10": 0.9157948249893592
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.674104660572769
+ },
+ "jqara": {
+ "ndcg@10": 0.35765029945439447
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.68225515961945
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8538476294446257
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.59611064776643
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.519938655947725
+ "ndcg@10": 0.5403287346696719
},
"jaqket": {
- "ndcg@10": 0.4206746951743811
+ "ndcg@10": 0.42113936906002564
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.2482827887837841
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.1928427319999251
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.19084474235068657
},
"mrtydi": {
- "ndcg@10": 0.10116108109776817
+ "ndcg@10": 0.10090455185771262
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.44067635335327865
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.4930421996747514
+ "ndcg@10": 0.44837143094362086
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.719369187830078
+ "ndcg@10": 0.7368252250653567
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.3258568875005778
+ "ndcg@10": 0.3115238718909808
}
},
"STS": {
"jsick": {
- "spearman": 0.7211422898060521
+ "spearman": 0.7203759702575281
},
"jsts": {
- "spearman": 0.8109305772255819
+ "spearman": 0.8107670759374308
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.4677177349822789
+ "v_measure_score": 0.4816771908212549
},
"mewsc16": {
- "v_measure_score": 0.5389209739242912
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6237623762376237
+ "v_measure_score": 0.5336022487793333
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.43034104597999767
}
}
}
\ No newline at end of file
diff --git a/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json
index 7318aab..6d1041e 100644
--- a/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json
+++ b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7492232749031491
+ "macro_f1": 0.7528271196943096
},
"amazon_review_classification": {
- "macro_f1": 0.5530707609927811
+ "macro_f1": 0.5518771080100612
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.892368025976312
},
"massive_intent_classification": {
- "macro_f1": 0.7979144461303402
+ "macro_f1": 0.7872725195473699
},
"massive_scenario_classification": {
- "macro_f1": 0.8683641924034757
+ "macro_f1": 0.8713846348082936
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8583089323083904
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4323129039345514
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9301469431250418
+ "ndcg@10": 0.9301525338489429
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8827390816541736
+ },
+ "jqara": {
+ "ndcg@10": 0.6070225247152883
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.8243623644224994
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.887121388271364
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8385011452405416
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.6979374757372254
+ "ndcg@10": 0.6984652569482365
},
"jaqket": {
- "ndcg@10": 0.6729417850207029
+ "ndcg@10": 0.6751948574643762
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.3957491894384977
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.652881832622734
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.3374776122444277
},
"mrtydi": {
- "ndcg@10": 0.41858579533990486
+ "ndcg@10": 0.4167021902708705
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.899055473429718
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.9029337913460675
+ "ndcg@10": 0.9008045583912581
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.9511153967130517
+ "ndcg@10": 0.9566816164352073
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.7580448576047344
+ "ndcg@10": 0.757906107708436
}
},
"STS": {
"jsick": {
- "spearman": 0.849637366944316
+ "spearman": 0.8494858386977019
},
"jsts": {
- "spearman": 0.8095684318108997
+ "spearman": 0.8095670694135243
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5151536908540161
+ "v_measure_score": 0.5446091559116468
},
"mewsc16": {
- "v_measure_score": 0.45782610528001805
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.623716814159292
+ "v_measure_score": 0.4611859858929692
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.43979504978761347
}
}
-}
+}
\ No newline at end of file
diff --git a/docs/results/pkshatech/GLuCoSE-base-ja/summary.json b/docs/results/pkshatech/GLuCoSE-base-ja/summary.json
index 9048691..045be96 100644
--- a/docs/results/pkshatech/GLuCoSE-base-ja/summary.json
+++ b/docs/results/pkshatech/GLuCoSE-base-ja/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.8243606275521169
+ "macro_f1": 0.8203088346974938
},
"amazon_review_classification": {
- "macro_f1": 0.580654308041878
+ "macro_f1": 0.5793470941382456
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9289309593569228
},
"massive_intent_classification": {
- "macro_f1": 0.7885427536904928
+ "macro_f1": 0.7852003872158392
},
"massive_scenario_classification": {
- "macro_f1": 0.8794225134482166
+ "macro_f1": 0.8771105186592234
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.7723533533184818
+ },
+ "wrime_classification": {
+ "macro_f1": 0.48820317778534994
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9190289767663239
+ "ndcg@10": 0.9182072351783757
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.7453523153562407
+ },
+ "jqara": {
+ "ndcg@10": 0.30235678517238046
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.7782487998017047
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8742431547482784
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.6929937892822252
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.6387979415478197
+ "ndcg@10": 0.6414300605061649
},
"jaqket": {
- "ndcg@10": 0.3981609655991592
+ "ndcg@10": 0.39775627519142726
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.2981097485323552
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.4826861479972318
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.2507030467719784
},
"mrtydi": {
- "ndcg@10": 0.30281316435910444
+ "ndcg@10": 0.3013997193651328
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.7677861541704494
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.7825765249971093
+ "ndcg@10": 0.7720777474520221
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.8206371528870603
+ "ndcg@10": 0.8139955508348415
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.5982476164344701
+ "ndcg@10": 0.5843440022515908
}
},
"STS": {
"jsick": {
- "spearman": 0.7496711324072552
+ "spearman": 0.7489963692364312
},
"jsts": {
- "spearman": 0.824592262812859
+ "spearman": 0.8246470658338377
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.49890886040948096
+ "v_measure_score": 0.5022894818692664
},
"mewsc16": {
- "v_measure_score": 0.49676862904881375
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.663883089770355
+ "v_measure_score": 0.4952409837584659
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.41426282292221306
}
}
}
\ No newline at end of file
diff --git a/docs/results/pkshatech/RoSEtta-base-ja/summary.json b/docs/results/pkshatech/RoSEtta-base-ja/summary.json
index d82af4b..ed2a807 100644
--- a/docs/results/pkshatech/RoSEtta-base-ja/summary.json
+++ b/docs/results/pkshatech/RoSEtta-base-ja/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7005147244958231
+ "macro_f1": 0.7021400751808275
},
"amazon_review_classification": {
- "macro_f1": 0.5263680453119501
+ "macro_f1": 0.5222844241125081
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8728387064627037
},
"massive_intent_classification": {
- "macro_f1": 0.7983787583297884
+ "macro_f1": 0.7958661089844552
},
"massive_scenario_classification": {
- "macro_f1": 0.8709593192703351
+ "macro_f1": 0.869642477269303
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8400507949086808
+ },
+ "wrime_classification": {
+ "macro_f1": 0.41243251223612126
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9268625513429571
+ "ndcg@10": 0.9267798900027316
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8682926176464301
+ },
+ "jqara": {
+ "ndcg@10": 0.5792158527364997
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.8038167802919151
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8844542290758788
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8201713015308671
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.6595934642903105
+ "ndcg@10": 0.6624123008303046
},
"jaqket": {
- "ndcg@10": 0.6533452086105761
+ "ndcg@10": 0.6534322606981797
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.3404237377925581
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.6019862449112752
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.3236631225997826
},
"mrtydi": {
- "ndcg@10": 0.36731170141136216
+ "ndcg@10": 0.36773428568023436
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.9604317247356383
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.9553567926226499
+ "ndcg@10": 0.9541194598644321
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.940828991756893
+ "ndcg@10": 0.931681815900694
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.8163161967769845
+ "ndcg@10": 0.821937205258955
}
},
"STS": {
"jsick": {
- "spearman": 0.8383455453168481
+ "spearman": 0.8383423614590403
},
"jsts": {
- "spearman": 0.7895388048564987
+ "spearman": 0.7894639448529204
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5861760622672214
+ "v_measure_score": 0.5617913119479273
},
"mewsc16": {
- "v_measure_score": 0.4784844036038961
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6173974540311173
+ "v_measure_score": 0.4515710456360326
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.4060764834036522
}
}
-}
+}
\ No newline at end of file
diff --git a/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json b/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json
index cc9f179..08fb2cf 100644
--- a/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json
+++ b/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.6748573563374541
+ "macro_f1": 0.6623457108919073
},
"amazon_review_classification": {
- "macro_f1": 0.5084883283463678
+ "macro_f1": 0.5085668578780138
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8727921560142167
},
"massive_intent_classification": {
- "macro_f1": 0.7967050091211104
+ "macro_f1": 0.7964832948145142
},
"massive_scenario_classification": {
- "macro_f1": 0.871999260591497
+ "macro_f1": 0.8722583552883876
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8118131918956941
+ },
+ "wrime_classification": {
+ "macro_f1": 0.38393198133793865
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.914930352019688
+ "ndcg@10": 0.9159934732688085
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.57455569033817
+ },
+ "jqara": {
+ "ndcg@10": 0.32394940899755914
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.7155007519708649
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.8749859006713937
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.4504173219483955
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.41496851385134836
+ "ndcg@10": 0.42184475447157466
},
"jaqket": {
- "ndcg@10": 0.46003031782136106
+ "ndcg@10": 0.45790883763166734
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.3129516236109114
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.16941478465313356
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.20077263817507693
},
"mrtydi": {
- "ndcg@10": 0.1019130492122431
+ "ndcg@10": 0.10152904724472846
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.3813451499418741
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.4014036990267884
+ "ndcg@10": 0.3760245554186644
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.5962532652358485
+ "ndcg@10": 0.5918422105100428
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.2452584471710635
+ "ndcg@10": 0.25260061985270044
}
},
"STS": {
"jsick": {
- "spearman": 0.7307715649457595
+ "spearman": 0.7310527928257868
},
"jsts": {
- "spearman": 0.8052279921326252
+ "spearman": 0.8050903530724467
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.4476707933600858
+ "v_measure_score": 0.4270804414565236
},
"mewsc16": {
- "v_measure_score": 0.5029508725037098
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6239830208701805
+ "v_measure_score": 0.5188641339887974
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.4361034033018593
}
}
}
\ No newline at end of file
diff --git a/docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json b/docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json
index 30385ec..20e3090 100644
--- a/docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json
+++ b/docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7910202863961814
+ "macro_f1": 0.7966249319542733
},
"amazon_review_classification": {
- "macro_f1": 0.614759364446128
+ "macro_f1": 0.6204802870356217
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.9487448938934042
},
"massive_intent_classification": {
- "macro_f1": 0.8225880728874561
+ "macro_f1": 0.8121127783146885
},
"massive_scenario_classification": {
- "macro_f1": 0.9065030576701741
+ "macro_f1": 0.9015618520645106
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8262549610016919
+ },
+ "wrime_classification": {
+ "macro_f1": 0.496952794347916
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9374394712541568
+ "ndcg@10": 0.9359176437229035
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.8684667204236405
+ },
+ "jqara": {
+ "ndcg@10": 0.6592446626934351
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.851895889010368
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.9024168764200886
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.8242898079860301
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.7168374490004555
+ "ndcg@10": 0.7175523394252279
},
"jaqket": {
- "ndcg@10": 0.7279485535689915
+ "ndcg@10": 0.6651645644179811
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.6260117718497401
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.6324868715639211
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.3458953565848906
},
"mrtydi": {
- "ndcg@10": 0.41952210141116814
+ "ndcg@10": 0.4075091710258615
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.9919931534803926
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.9394095717236127
+ "ndcg@10": 0.9916030162169888
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.9695624263086593
+ "ndcg@10": 0.968506421217649
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.8832876426024624
+ "ndcg@10": 0.9629377323425067
}
},
"STS": {
"jsick": {
- "spearman": 0.8022484725822061
+ "spearman": 0.7979403746663343
},
"jsts": {
- "spearman": 0.851980317221987
+ "spearman": 0.8362521198880197
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.5641831341687762
+ "v_measure_score": 0.5580857807899353
},
"mewsc16": {
- "v_measure_score": 0.5129216698739159
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.62
+ "v_measure_score": 0.5068875864473731
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.4418928761777483
}
}
}
\ No newline at end of file
diff --git a/docs/results/sentence-transformers/LaBSE/summary.json b/docs/results/sentence-transformers/LaBSE/summary.json
index de8fd21..be25868 100644
--- a/docs/results/sentence-transformers/LaBSE/summary.json
+++ b/docs/results/sentence-transformers/LaBSE/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7361214773958769
+ "macro_f1": 0.7473900578785092
},
"amazon_review_classification": {
- "macro_f1": 0.516957890685124
+ "macro_f1": 0.5163267880432743
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8943804239578887
},
"massive_intent_classification": {
- "macro_f1": 0.7698802987251081
+ "macro_f1": 0.7708783013419095
},
"massive_scenario_classification": {
- "macro_f1": 0.8835366493433755
+ "macro_f1": 0.883882574111003
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8147469939175009
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4010561963802254
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.9162507647227857
+ "ndcg@10": 0.9160790861014678
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.6785244283016075
+ },
+ "jqara": {
+ "ndcg@10": 0.2488300870810785
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.6956277678029864
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.818396899799895
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.49122610922285737
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.4310160105414995
+ "ndcg@10": 0.4278654773482106
},
"jaqket": {
- "ndcg@10": 0.34245849139132745
+ "ndcg@10": 0.34162439290480445
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.20021150938693902
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.09640164259724278
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.07525879379433965
},
"mrtydi": {
- "ndcg@10": 0.04238747941951049
+ "ndcg@10": 0.04221321214455149
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.48063138821949475
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.48918127058907085
+ "ndcg@10": 0.48202233374429526
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.7513086500303519
+ "ndcg@10": 0.7559363652226313
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.35089108319096984
+ "ndcg@10": 0.3553481928114969
}
},
"STS": {
"jsick": {
- "spearman": 0.7698905918950973
+ "spearman": 0.770087314840748
},
"jsts": {
- "spearman": 0.7612337568248777
+ "spearman": 0.7611615118281959
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.4829337123233023
+ "v_measure_score": 0.48580223329334865
},
"mewsc16": {
- "v_measure_score": 0.41471299546625956
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.623321554770318
+ "v_measure_score": 0.41072280934417754
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.2859403214333406
}
}
}
\ No newline at end of file
diff --git a/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json b/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json
index 12f71a2..a856998 100644
--- a/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json
+++ b/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json
@@ -1,62 +1,96 @@
{
"Classification": {
"amazon_counterfactual_classification": {
- "macro_f1": 0.7565022696601644
+ "macro_f1": 0.7514299930187799
},
"amazon_review_classification": {
- "macro_f1": 0.5131771609073525
+ "macro_f1": 0.5113667375588362
+ },
+ "japanese_sentiment_classification": {
+ "macro_f1": 0.8653980041773033
},
"massive_intent_classification": {
- "macro_f1": 0.7427818411370812
+ "macro_f1": 0.7433839585058197
},
"massive_scenario_classification": {
- "macro_f1": 0.8609512679368835
+ "macro_f1": 0.8606582397219589
+ },
+ "sib200_japanese_classification": {
+ "macro_f1": 0.8372998969612304
+ },
+ "wrime_classification": {
+ "macro_f1": 0.4167776597670575
}
},
"Reranking": {
"esci": {
- "ndcg@10": 0.901984958764163
+ "ndcg@10": 0.901890341341842
+ },
+ "jacwir_reranking": {
+ "ndcg@10": 0.39624043125448744
+ },
+ "jqara": {
+ "ndcg@10": 0.2151865684492273
+ },
+ "miracl_reranking": {
+ "ndcg@10": 0.659024218324574
+ },
+ "mldr_reranking": {
+ "ndcg@10": 0.768787823495723
}
},
"Retrieval": {
+ "jacwir_retrieval": {
+ "ndcg@10": 0.21305201715296948
+ },
"jagovfaqs_22k": {
- "ndcg@10": 0.2511106863952595
+ "ndcg@10": 0.2556849980765764
},
"jaqket": {
- "ndcg@10": 0.21606007987072834
+ "ndcg@10": 0.216905594393324
+ },
+ "mintaka_retrieval": {
+ "ndcg@10": 0.22312923127278733
+ },
+ "miracl_retrieval": {
+ "ndcg@10": 0.025873352550354844
+ },
+ "mldr_retrieval": {
+ "ndcg@10": 0.06529330431356167
},
"mrtydi": {
- "ndcg@10": 0.027590779174942116
+ "ndcg@10": 0.027849411947159904
+ },
+ "nlp_journal_abs_article": {
+ "ndcg@10": 0.24914118502751986
},
"nlp_journal_abs_intro": {
- "ndcg@10": 0.2848558252647936
+ "ndcg@10": 0.2554860092306942
},
"nlp_journal_title_abs": {
- "ndcg@10": 0.3646520309406354
+ "ndcg@10": 0.35835508156998896
},
"nlp_journal_title_intro": {
- "ndcg@10": 0.11545016260271045
+ "ndcg@10": 0.12133118349638791
}
},
"STS": {
"jsick": {
- "spearman": 0.7236409557069434
+ "spearman": 0.7238085290735078
},
"jsts": {
- "spearman": 0.7843597058304203
+ "spearman": 0.784483411606707
}
},
"Clustering": {
"livedoor_news": {
- "v_measure_score": 0.24487129939212224
+ "v_measure_score": 0.26373496762588294
},
"mewsc16": {
- "v_measure_score": 0.304278393205056
- }
- },
- "PairClassification": {
- "paws_x_ja": {
- "binary_f1": 0.6219686162624821
+ "v_measure_score": 0.32418419167915596
+ },
+ "sib200_japanese_clustering": {
+ "v_measure_score": 0.2434250739162938
}
}
}
\ No newline at end of file
diff --git a/leaderboard.md b/leaderboard.md
index dd64309..34d5d62 100644
--- a/leaderboard.md
+++ b/leaderboard.md
@@ -5,233 +5,254 @@ This leaderboard shows the results stored under `docs/results`. The scores are a
The summary shows the average scores within each task. The average score is the average of scores by dataset.
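For reference, a minimal sketch (not a script from this repository; the example path and the 0-100 scaling are assumptions inferred from the summary.json hunks above) of how the per-task averages and the overall "Avg." column can be reproduced from one of these summary files:

```python
import json
from statistics import mean

def task_averages(path: str) -> dict[str, float]:
    """Average each task's dataset scores from a summary.json file."""
    with open(path, encoding="utf-8") as f:
        summary = json.load(f)
    averages = {}
    for task, datasets in summary.items():
        # Each dataset entry holds a single metric (macro_f1, ndcg@10, spearman, ...).
        scores = [next(iter(metrics.values())) for metrics in datasets.values()]
        averages[task] = 100 * mean(scores)  # leaderboard reports scores on a 0-100 scale
    return averages

if __name__ == "__main__":
    # Hypothetical usage with one of the files touched in this diff.
    per_task = task_averages("docs/results/intfloat/multilingual-e5-base/summary.json")
    print({task: round(score, 2) for task, score in per_task.items()})
    print("Avg.:", round(mean(per_task.values()), 2))
```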
-| Model | Avg. | Retrieval | STS | Classification | Reranking | Clustering | PairClassification |
-|:----------------------------------------------|:----------|:------------|:----------|:-----------------|:------------|:-------------|:---------------------|
-| sbintuitions/sarashina-embedding-v1-1b | **75.50** | **77.61** | 82.71 | **78.37** | **93.74** | 53.86 | 62.00 |
-| OpenAI/text-embedding-3-large | 74.05 | 74.48 | 82.52 | 77.58 | 93.58 | 53.32 | 62.35 |
-| jinaai/jina-embeddings-v3 | 73.44 | 75.22 | 80.05 | 76.39 | 92.71 | 51.46 | 62.37 |
-| cl-nagoya/ruri-large | 73.31 | 73.02 | 83.13 | 77.43 | 92.99 | 51.82 | 62.29 |
-| pkshatech/GLuCoSE-base-ja-v2 | 72.23 | 73.36 | 82.96 | 74.21 | 93.01 | 48.65 | 62.37 |
-| pkshatech/RoSEtta-base-ja | 72.04 | 73.21 | 81.39 | 72.41 | 92.69 | 53.23 | 61.74 |
-| cl-nagoya/ruri-base | 71.91 | 69.82 | 82.87 | 75.58 | 92.91 | **54.16** | 62.38 |
-| cl-nagoya/ruri-small | 71.53 | 69.41 | 82.79 | 76.22 | 93.00 | 51.19 | 62.11 |
-| intfloat/multilingual-e5-large | 70.90 | 70.98 | 79.70 | 72.89 | 92.96 | 51.24 | 62.15 |
-| OpenAI/text-embedding-3-small | 69.18 | 66.39 | 79.46 | 73.06 | 92.92 | 51.06 | 62.27 |
-| intfloat/multilingual-e5-base | 68.61 | 68.21 | 79.84 | 69.30 | 92.85 | 48.26 | 62.26 |
-| intfloat/multilingual-e5-small | 67.71 | 67.27 | 80.07 | 67.62 | 93.03 | 46.91 | 62.19 |
-| pkshatech/GLuCoSE-base-ja | 67.29 | 59.02 | 78.71 | 76.82 | 91.90 | 49.78 | **66.39** |
-| OpenAI/text-embedding-ada-002 | 67.21 | 64.38 | 79.02 | 69.75 | 93.04 | 48.30 | 62.40 |
-| cl-nagoya/sup-simcse-ja-base | 63.36 | 49.64 | 82.05 | 73.47 | 91.83 | 51.79 | 62.57 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 61.55 | 47.38 | 78.99 | 73.13 | 91.30 | 48.25 | 62.27 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 60.83 | 46.36 | 77.49 | 73.30 | 91.16 | 46.68 | 62.38 |
-| oshizo/sbert-jsnli-luke-japanese-base-lite | 60.77 | 43.00 | 76.60 | 76.61 | 91.56 | 50.33 | 62.38 |
-| cl-nagoya/unsup-simcse-ja-large | 59.58 | 40.53 | 80.56 | 74.66 | 90.95 | 48.41 | 62.49 |
-| MU-Kindai/Japanese-MixCSE-BERT-base | 59.03 | 42.59 | 77.05 | 72.90 | 91.01 | 44.95 | 62.33 |
-| cl-nagoya/sup-simcse-ja-large | 58.88 | 37.62 | **83.18** | 73.73 | 91.48 | 50.56 | 62.51 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 58.77 | 40.82 | 78.28 | 73.47 | 90.95 | 45.81 | 62.35 |
-| MU-Kindai/Japanese-DiffCSE-BERT-base | 58.66 | 41.79 | 75.50 | 73.77 | 90.95 | 44.22 | 62.38 |
-| cl-nagoya/unsup-simcse-ja-base | 58.39 | 40.23 | 78.72 | 73.07 | 91.16 | 44.77 | 62.44 |
-| sentence-transformers/LaBSE | 58.01 | 40.12 | 76.56 | 72.66 | 91.63 | 44.88 | 62.33 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 57.97 | 41.32 | 74.66 | 72.76 | 90.66 | 43.11 | 62.37 |
-| pkshatech/simcse-ja-bert-base-clcmlp | 56.86 | 37.00 | 76.80 | 71.30 | 91.49 | 47.53 | 62.40 |
-| sentence-transformers/stsb-xlm-r-multilingual | 48.21 | 21.00 | 75.40 | 71.84 | 90.20 | 27.46 | 62.20 |
-| colorfulscoop/sbert-base-ja | 47.38 | 16.52 | 70.42 | 69.07 | 89.97 | 44.81 | 62.31 |
+| Model | Avg. | Retrieval | STS | Classification | Reranking | Clustering |
+|:----------------------------------------------|:---------:|:-----------:|:---------:|:----------------:|:-----------:|:------------:|
+| sbintuitions/sarashina-embedding-v2-1b | **76.38** | **76.48** | **84.22** | 77.14 | **86.28** | 52.56 |
+| cl-nagoya/ruri-v3-310m | 75.85 | 76.03 | 81.59 | **77.65** | 85.84 | 50.52 |
+| cl-nagoya/ruri-v3-130m | 75.52 | 76.45 | 81.05 | 75.65 | 85.71 | 51.13 |
+| sbintuitions/sarashina-embedding-v1-1b | 74.87 | 74.53 | 81.71 | 77.20 | 84.36 | 50.30 |
+| cl-nagoya/ruri-v3-70m | 73.95 | 74.23 | 80.96 | 74.45 | 84.21 | 49.95 |
+| OpenAI/text-embedding-3-large | 73.86 | 71.95 | 82.52 | 77.27 | 83.06 | 51.82 |
+| cl-nagoya/ruri-large-v2 | 73.63 | 71.87 | 83.18 | 76.10 | 83.89 | 50.88 |
+| cl-nagoya/ruri-v3-30m | 72.95 | 72.84 | 81.78 | 73.35 | 82.93 | 49.90 |
+| cl-nagoya/ruri-large | 71.69 | 68.30 | 83.13 | 76.25 | 81.26 | 49.93 |
+| cl-nagoya/ruri-base-v2 | 71.66 | 68.96 | 83.03 | 75.59 | 82.46 | 46.84 |
+| cl-nagoya/ruri-small-v2 | 71.40 | 68.46 | 82.91 | 74.12 | 82.30 | 49.97 |
+| pkshatech/GLuCoSE-base-ja-v2 | 71.11 | 68.45 | 82.95 | 73.52 | 82.63 | 48.19 |
+| intfloat/multilingual-e5-large | 70.67 | 67.65 | 80.86 | 72.30 | 83.01 | 50.58 |
+| google/embeddinggemma-300m | 70.59 | 65.91 | 82.74 | 76.14 | 80.93 | 49.48 |
+| cl-nagoya/ruri-base | 70.25 | 65.90 | 82.88 | 75.34 | 80.31 | 49.10 |
+| pkshatech/RoSEtta-base-ja | 69.58 | 67.52 | 81.39 | 71.70 | 81.25 | 44.88 |
+| cl-nagoya/ruri-small | 69.34 | 63.95 | 82.79 | 74.83 | 79.98 | 49.59 |
+| intfloat/multilingual-e5-base | 68.06 | 64.48 | 80.46 | 69.70 | 79.46 | 50.12 |
+| intfloat/multilingual-e5-small | 67.38 | 63.91 | 80.46 | 67.77 | 80.09 | 49.29 |
+| OpenAI/text-embedding-3-small | 67.10 | 61.79 | 79.46 | 72.43 | 77.29 | 48.91 |
+| OpenAI/text-embedding-ada-002 | 65.13 | 59.58 | 79.02 | 69.39 | 75.63 | 48.78 |
+| hotchpotch/static-embedding-japanese | 63.80 | 60.51 | 80.16 | 66.73 | 77.09 | 35.91 |
+| pkshatech/GLuCoSE-base-ja | 63.79 | 54.58 | 78.68 | 75.02 | 72.37 | 47.12 |
+| cl-nagoya/sup-simcse-ja-base | 59.91 | 45.00 | 82.05 | 72.72 | 70.36 | **52.57** |
+| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 57.60 | 42.41 | 79.00 | 71.83 | 71.88 | 42.02 |
+| oshizo/sbert-jsnli-luke-japanese-base-lite | 56.75 | 38.08 | 76.56 | 74.53 | 69.81 | 48.75 |
+| cl-nagoya/sup-simcse-ja-large | 56.46 | 37.38 | 83.17 | 72.74 | 68.76 | 50.12 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 55.78 | 39.85 | 77.96 | 71.46 | 69.92 | 39.27 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 55.35 | 36.23 | 78.29 | 72.59 | 70.59 | 44.54 |
+| MU-Kindai/Japanese-MixCSE-BERT-base | 54.65 | 36.24 | 77.75 | 71.81 | 68.58 | 43.45 |
+| cl-nagoya/unsup-simcse-ja-large | 54.23 | 33.98 | 80.56 | 73.71 | 67.39 | 43.52 |
+| cl-nagoya/unsup-simcse-ja-base | 53.86 | 35.34 | 78.74 | 72.41 | 66.20 | 41.29 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 53.82 | 35.22 | 74.96 | 71.48 | 68.15 | 42.86 |
+| MU-Kindai/Japanese-DiffCSE-BERT-base | 53.59 | 34.93 | 76.70 | 72.06 | 67.73 | 39.93 |
+| pkshatech/simcse-ja-bert-base-clcmlp | 53.48 | 32.80 | 76.81 | 70.67 | 68.02 | 49.45 |
+| sentence-transformers/LaBSE | 52.70 | 33.18 | 76.56 | 71.85 | 67.01 | 39.82 |
+| sentence-transformers/stsb-xlm-r-multilingual | 43.06 | 16.58 | 75.41 | 71.40 | 57.93 | 27.67 |
+| colorfulscoop/sbert-base-ja | 42.90 | 15.45 | 70.41 | 68.05 | 59.38 | 39.04 |
## Retrieval
-| Model | Avg. | jagovfaqs_22k<br>(ndcg@10) | jaqket<br>(ndcg@10) | mrtydi<br>(ndcg@10) | nlp_journal_abs_intro<br>(ndcg@10) | nlp_journal_title_abs<br>(ndcg@10) | nlp_journal_title_intro<br>(ndcg@10) |
-|:----------------------------------------------|:----------|:-----------------------------|:----------------------|:----------------------|:-------------------------------------|:-------------------------------------|:---------------------------------------|
-| sbintuitions/sarashina-embedding-v1-1b | **77.61** | 71.68 | **72.79** | 41.95 | 93.94 | 96.96 | 88.33 |
-| jinaai/jina-embeddings-v3 | 75.22 | 71.50 | 46.48 | **45.45** | 98.43 | 95.62 | 93.85 |
-| OpenAI/text-embedding-3-large | 74.48 | 72.41 | 48.21 | 34.88 | **99.33** | 96.55 | **95.47** |
-| pkshatech/GLuCoSE-base-ja-v2 | 73.36 | 69.79 | 67.29 | 41.86 | 90.29 | 95.11 | 75.80 |
-| pkshatech/RoSEtta-base-ja | 73.21 | 65.96 | 65.33 | 36.73 | 95.54 | 94.08 | 81.63 |
-| cl-nagoya/ruri-large | 73.02 | **76.68** | 61.74 | 38.03 | 87.12 | 96.58 | 77.97 |
-| intfloat/multilingual-e5-large | 70.98 | 70.30 | 58.78 | 43.63 | 86.00 | 94.70 | 72.48 |
-| cl-nagoya/ruri-base | 69.82 | 74.56 | 50.12 | 35.45 | 86.89 | 96.57 | 75.31 |
-| cl-nagoya/ruri-small | 69.41 | 73.65 | 48.44 | 33.43 | 87.69 | **97.17** | 76.09 |
-| intfloat/multilingual-e5-base | 68.21 | 65.34 | 50.67 | 38.38 | 87.10 | 94.73 | 73.05 |
-| intfloat/multilingual-e5-small | 67.27 | 64.11 | 49.97 | 36.05 | 85.21 | 95.26 | 72.99 |
-| OpenAI/text-embedding-3-small | 66.39 | 64.02 | 33.94 | 20.03 | 98.47 | 91.70 | 90.17 |
-| OpenAI/text-embedding-ada-002 | 64.38 | 61.02 | 42.56 | 14.51 | 94.99 | 91.23 | 81.98 |
-| pkshatech/GLuCoSE-base-ja | 59.02 | 63.88 | 39.82 | 30.28 | 78.26 | 82.06 | 59.82 |
-| cl-nagoya/sup-simcse-ja-base | 49.64 | 51.62 | 50.25 | 13.98 | 68.08 | 65.71 | 48.22 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 47.38 | 50.14 | 45.84 | 13.00 | 55.09 | 74.97 | 45.24 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 46.36 | 47.39 | 39.57 | 11.44 | 64.16 | 70.23 | 45.37 |
-| oshizo/sbert-jsnli-luke-japanese-base-lite | 43.00 | 51.99 | 42.07 | 10.12 | 49.30 | 71.94 | 32.59 |
-| MU-Kindai/Japanese-MixCSE-BERT-base | 42.59 | 42.37 | 37.72 | 7.88 | 63.70 | 64.13 | 39.73 |
-| MU-Kindai/Japanese-DiffCSE-BERT-base | 41.79 | 42.31 | 36.20 | 7.81 | 60.77 | 64.34 | 39.32 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 41.32 | 44.11 | 39.61 | 8.15 | 62.76 | 58.39 | 34.89 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 40.82 | 47.04 | 44.10 | 11.43 | 43.43 | 62.41 | 36.52 |
-| cl-nagoya/unsup-simcse-ja-large | 40.53 | 45.09 | 34.60 | 5.75 | 55.07 | 63.07 | 39.61 |
-| cl-nagoya/unsup-simcse-ja-base | 40.23 | 46.00 | 39.46 | 5.55 | 60.26 | 55.63 | 34.49 |
-| sentence-transformers/LaBSE | 40.12 | 43.10 | 34.25 | 4.24 | 48.92 | 75.13 | 35.09 |
-| cl-nagoya/sup-simcse-ja-large | 37.62 | 46.84 | 39.88 | 11.83 | 63.43 | 37.93 | 25.79 |
-| pkshatech/simcse-ja-bert-base-clcmlp | 37.00 | 41.50 | 46.00 | 10.19 | 40.14 | 59.63 | 24.53 |
-| sentence-transformers/stsb-xlm-r-multilingual | 21.00 | 25.11 | 21.61 | 2.76 | 28.49 | 36.47 | 11.55 |
-| colorfulscoop/sbert-base-ja | 16.52 | 21.50 | 13.16 | 0.44 | 28.78 | 22.40 | 12.82 |
+| Model | Avg. | jacwir_retrieval<br>(ndcg@10) | jagovfaqs_22k<br>(ndcg@10) | jaqket<br>(ndcg@10) | mintaka_retrieval<br>(ndcg@10) | miracl_retrieval<br>(ndcg@10) | mldr_retrieval<br>(ndcg@10) | mrtydi<br>(ndcg@10) | nlp_abs_article<br>(ndcg@10) | nlp_abs_intro<br>(ndcg@10) | nlp_title_abs<br>(ndcg@10) | nlp_title_intro<br>(ndcg@10) |
+|:----------------------------------------------|:---------:|:-------------------------------:|:----------------------------:|:---------------------:|:--------------------------------:|:-------------------------------:|:-----------------------------:|:---------------------:|:------------------------------:|:----------------------------:|:----------------------------:|:------------------------------:|
+| sbintuitions/sarashina-embedding-v2-1b | **76.48** | 85.54 | 74.87 | **73.52** | **66.11** | 68.26 | 40.35 | **49.57** | 96.84 | 96.28 | **98.11** | 91.79 |
+| cl-nagoya/ruri-v3-130m | 76.45 | 84.21 | 75.32 | 73.10 | 51.77 | **71.01** | 45.16 | 47.80 | 99.51 | 98.88 | 97.95 | 96.28 |
+| cl-nagoya/ruri-v3-310m | 76.03 | 84.06 | 76.49 | 71.87 | 52.25 | 67.71 | 43.43 | 47.06 | **99.59** | **99.35** | 97.91 | **96.58** |
+| sbintuitions/sarashina-embedding-v1-1b | 74.53 | 82.43 | 71.76 | 72.92 | 62.60 | 63.23 | 34.59 | 40.75 | 99.20 | 99.16 | 96.85 | 96.29 |
+| cl-nagoya/ruri-v3-70m | 74.23 | 82.76 | 73.27 | 67.68 | 46.26 | 67.98 | 43.55 | 45.00 | 98.50 | 98.68 | 97.07 | 95.73 |
+| cl-nagoya/ruri-v3-30m | 72.84 | 82.70 | 70.21 | 62.45 | 43.05 | 64.99 | **45.77** | 41.78 | 98.76 | 99.16 | 96.99 | 95.34 |
+| OpenAI/text-embedding-3-large | 71.95 | 82.90 | 72.41 | 48.21 | 63.52 | 60.57 | 45.26 | 34.88 | 92.37 | 99.33 | 96.55 | 95.47 |
+| cl-nagoya/ruri-large-v2 | 71.87 | 80.49 | **78.23** | 65.61 | 50.41 | 70.46 | 36.97 | 46.37 | 90.85 | 91.15 | 97.74 | 82.32 |
+| cl-nagoya/ruri-base-v2 | 68.96 | 81.01 | 75.90 | 57.01 | 44.18 | 68.22 | 37.73 | 40.89 | 88.05 | 89.73 | 96.96 | 78.93 |
+| cl-nagoya/ruri-small-v2 | 68.46 | 83.04 | 74.02 | 62.25 | 35.31 | 66.90 | 32.58 | 42.40 | 90.65 | 90.42 | 97.30 | 78.21 |
+| pkshatech/GLuCoSE-base-ja-v2 | 68.45 | 83.85 | 69.85 | 67.52 | 39.57 | 65.29 | 33.75 | 41.67 | 89.91 | 90.08 | 95.67 | 75.79 |
+| cl-nagoya/ruri-large | 68.30 | 81.69 | 77.64 | 61.73 | 51.06 | 55.47 | 34.77 | 38.12 | 86.53 | 88.91 | 96.17 | 79.22 |
+| intfloat/multilingual-e5-large | 67.65 | **86.41** | 72.98 | 59.67 | 39.59 | 70.96 | 29.85 | 47.82 | 83.26 | 85.71 | 95.29 | 72.57 |
+| pkshatech/RoSEtta-base-ja | 67.52 | 82.02 | 66.28 | 64.28 | 34.04 | 60.16 | 32.37 | 36.77 | 96.04 | 95.41 | 93.17 | 82.19 |
+| google/embeddinggemma-300m | 65.91 | 81.07 | 69.43 | 63.27 | 38.63 | 35.28 | 34.66 | 13.86 | 99.34 | 99.02 | 96.12 | 94.35 |
+| cl-nagoya/ruri-base | 65.90 | 82.48 | 75.50 | 50.23 | 45.37 | 54.88 | 35.42 | 35.59 | 86.65 | 87.23 | 95.27 | 76.25 |
+| intfloat/multilingual-e5-base | 64.48 | 84.32 | 68.72 | 51.69 | 34.68 | 64.50 | 25.73 | 42.30 | 83.56 | 84.48 | 94.62 | 74.70 |
+| cl-nagoya/ruri-small | 63.95 | 82.58 | 74.01 | 48.44 | 37.23 | 52.22 | 28.99 | 33.51 | 86.89 | 87.23 | 96.20 | 76.09 |
+| intfloat/multilingual-e5-small | 63.91 | 85.58 | 65.69 | 51.57 | 31.54 | 63.23 | 25.91 | 42.37 | 83.97 | 84.10 | 94.47 | 74.56 |
+| OpenAI/text-embedding-3-small | 61.79 | 79.58 | 64.02 | 33.94 | 32.44 | 48.45 | 35.07 | 20.03 | 85.83 | 98.47 | 91.70 | 90.17 |
+| hotchpotch/static-embedding-japanese | 60.51 | 72.27 | 55.55 | 64.04 | 38.93 | 32.61 | 42.51 | 11.18 | 76.19 | 95.74 | 90.37 | 86.25 |
+| OpenAI/text-embedding-ada-002 | 59.58 | 78.08 | 61.02 | 42.56 | 27.09 | 34.54 | 31.90 | 14.51 | 97.51 | 94.99 | 91.23 | 81.98 |
+| pkshatech/GLuCoSE-base-ja | 54.58 | 69.30 | 64.14 | 39.78 | 29.81 | 48.27 | 25.07 | 30.14 | 76.78 | 77.21 | 81.40 | 58.43 |
+| cl-nagoya/sup-simcse-ja-base | 45.00 | 53.32 | 52.02 | 50.13 | 32.88 | 20.68 | 24.70 | 14.14 | 69.09 | 66.19 | 64.84 | 46.97 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 42.41 | 53.16 | 51.20 | 45.81 | 30.42 | 26.08 | 23.65 | 13.06 | 54.65 | 52.13 | 74.13 | 42.21 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 39.85 | 48.95 | 48.41 | 38.73 | 25.72 | 21.60 | 18.11 | 11.02 | 58.91 | 60.05 | 69.15 | 37.72 |
+| oshizo/sbert-jsnli-luke-japanese-base-lite | 38.08 | 59.65 | 54.07 | 40.22 | 24.83 | 17.19 | 19.08 | 10.09 | 44.07 | 44.84 | 73.68 | 31.15 |
+| cl-nagoya/sup-simcse-ja-large | 37.38 | 43.71 | 47.42 | 40.04 | 37.68 | 18.13 | 23.48 | 11.88 | 64.08 | 62.95 | 36.95 | 24.90 |
+| MU-Kindai/Japanese-MixCSE-BERT-base | 36.24 | 42.43 | 43.60 | 37.35 | 25.18 | 14.76 | 16.86 | 7.77 | 56.89 | 59.11 | 61.81 | 32.88 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 36.23 | 46.28 | 48.25 | 44.17 | 28.89 | 19.52 | 18.66 | 11.44 | 43.97 | 40.33 | 60.49 | 36.51 |
+| cl-nagoya/unsup-simcse-ja-base | 35.34 | 35.11 | 46.74 | 39.52 | 29.92 | 10.93 | 15.98 | 5.51 | 58.22 | 58.41 | 55.58 | 32.84 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 35.22 | 39.92 | 44.60 | 38.45 | 22.39 | 13.94 | 13.91 | 7.30 | 58.35 | 58.63 | 57.43 | 32.47 |
+| MU-Kindai/Japanese-DiffCSE-BERT-base | 34.93 | 40.86 | 43.88 | 35.56 | 19.98 | 16.52 | 12.06 | 7.11 | 54.30 | 55.86 | 62.96 | 35.17 |
+| cl-nagoya/unsup-simcse-ja-large | 33.98 | 37.61 | 46.56 | 34.53 | 30.58 | 10.33 | 12.55 | 5.75 | 50.45 | 50.70 | 60.43 | 34.32 |
+| sentence-transformers/LaBSE | 33.18 | 49.12 | 42.43 | 24.92 | 20.02 | 9.36 | 7.53 | 4.22 | 48.06 | 48.20 | 75.59 | 35.53 |
+| pkshatech/simcse-ja-bert-base-clcmlp | 32.80 | 45.03 | 41.00 | 37.01 | 31.30 | 16.07 | 20.08 | 10.15 | 38.13 | 37.60 | 59.18 | 25.26 |
+| sentence-transformers/stsb-xlm-r-multilingual | 16.58 | 21.08 | 22.49 | 6.49 | 22.31 | 2.28 | 6.53 | 2.78 | 24.91 | 25.55 | 35.84 | 12.13 |
+| colorfulscoop/sbert-base-ja | 15.45 | 19.30 | 21.70 | 13.14 | 19.07 | 1.86 | 6.97 | 0.41 | 29.02 | 25.80 | 21.07 | 11.57 |
## STS
-| Model | Avg. | jsick<br>(spearman) | jsts<br>(spearman) |
-|:----------------------------------------------|:----------|:----------------------|:---------------------|
-| cl-nagoya/sup-simcse-ja-large | **83.18** | 83.80 | 82.57 |
-| cl-nagoya/ruri-large | 83.13 | 82.00 | 84.26 |
-| pkshatech/GLuCoSE-base-ja-v2 | 82.96 | **84.96** | 80.96 |
-| cl-nagoya/ruri-base | 82.87 | 82.32 | 83.43 |
-| cl-nagoya/ruri-small | 82.79 | 83.44 | 82.13 |
-| sbintuitions/sarashina-embedding-v1-1b | 82.71 | 80.22 | **85.20** |
-| OpenAI/text-embedding-3-large | 82.52 | 81.27 | 83.77 |
-| cl-nagoya/sup-simcse-ja-base | 82.05 | 82.83 | 81.27 |
-| pkshatech/RoSEtta-base-ja | 81.39 | 83.83 | 78.95 |
-| cl-nagoya/unsup-simcse-ja-large | 80.56 | 80.15 | 80.98 |
-| intfloat/multilingual-e5-small | 80.07 | 81.50 | 78.65 |
-| jinaai/jina-embeddings-v3 | 80.05 | 78.16 | 81.93 |
-| intfloat/multilingual-e5-base | 79.84 | 81.28 | 78.39 |
-| intfloat/multilingual-e5-large | 79.70 | 78.40 | 80.99 |
-| OpenAI/text-embedding-3-small | 79.46 | 80.83 | 78.08 |
-| OpenAI/text-embedding-ada-002 | 79.02 | 79.09 | 78.94 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 78.99 | 79.84 | 78.14 |
-| cl-nagoya/unsup-simcse-ja-base | 78.72 | 78.49 | 78.95 |
-| pkshatech/GLuCoSE-base-ja | 78.71 | 74.97 | 82.46 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 78.28 | 78.75 | 77.81 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 77.49 | 78.18 | 76.81 |
-| MU-Kindai/Japanese-MixCSE-BERT-base | 77.05 | 77.57 | 76.53 |
-| pkshatech/simcse-ja-bert-base-clcmlp | 76.80 | 73.08 | 80.52 |
-| oshizo/sbert-jsnli-luke-japanese-base-lite | 76.60 | 72.11 | 81.09 |
-| sentence-transformers/LaBSE | 76.56 | 76.99 | 76.12 |
-| MU-Kindai/Japanese-DiffCSE-BERT-base | 75.50 | 75.42 | 75.58 |
-| sentence-transformers/stsb-xlm-r-multilingual | 75.40 | 72.36 | 78.44 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 74.66 | 74.64 | 74.68 |
-| colorfulscoop/sbert-base-ja | 70.42 | 66.59 | 74.24 |
+| Model | Avg. | jsick<br>(spearman) | jsts<br>(spearman) |
+|:----------------------------------------------|:---------:|:---------------------:|:--------------------:|
+| sbintuitions/sarashina-embedding-v2-1b | **84.22** | 82.58 | **85.87** |
+| cl-nagoya/ruri-large-v2 | 83.18 | 82.12 | 84.24 |
+| cl-nagoya/sup-simcse-ja-large | 83.17 | 83.78 | 82.56 |
+| cl-nagoya/ruri-large | 83.13 | 82.00 | 84.26 |
+| cl-nagoya/ruri-base-v2 | 83.03 | 82.63 | 83.43 |
+| pkshatech/GLuCoSE-base-ja-v2 | 82.95 | **84.95** | 80.96 |
+| cl-nagoya/ruri-small-v2 | 82.91 | 83.88 | 81.93 |
+| cl-nagoya/ruri-base | 82.88 | 82.32 | 83.43 |
+| cl-nagoya/ruri-small | 82.79 | 83.45 | 82.13 |
+| google/embeddinggemma-300m | 82.74 | 81.67 | 83.81 |
+| OpenAI/text-embedding-3-large | 82.52 | 81.27 | 83.77 |
+| cl-nagoya/sup-simcse-ja-base | 82.05 | 82.84 | 81.26 |
+| cl-nagoya/ruri-v3-30m | 81.78 | 81.62 | 81.95 |
+| sbintuitions/sarashina-embedding-v1-1b | 81.71 | 79.79 | 83.63 |
+| cl-nagoya/ruri-v3-310m | 81.59 | 78.86 | 84.31 |
+| pkshatech/RoSEtta-base-ja | 81.39 | 83.83 | 78.95 |
+| cl-nagoya/ruri-v3-130m | 81.05 | 78.86 | 83.24 |
+| cl-nagoya/ruri-v3-70m | 80.96 | 79.10 | 82.82 |
+| intfloat/multilingual-e5-large | 80.86 | 79.85 | 81.86 |
+| cl-nagoya/unsup-simcse-ja-large | 80.56 | 80.14 | 80.98 |
+| intfloat/multilingual-e5-small | 80.46 | 82.00 | 78.92 |
+| intfloat/multilingual-e5-base | 80.46 | 81.26 | 79.65 |
+| hotchpotch/static-embedding-japanese | 80.16 | 82.51 | 77.81 |
+| OpenAI/text-embedding-3-small | 79.46 | 80.83 | 78.08 |
+| OpenAI/text-embedding-ada-002 | 79.02 | 79.09 | 78.94 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 79.00 | 79.86 | 78.14 |
+| cl-nagoya/unsup-simcse-ja-base | 78.74 | 78.53 | 78.94 |
+| pkshatech/GLuCoSE-base-ja | 78.68 | 74.90 | 82.46 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 78.29 | 78.76 | 77.82 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 77.96 | 79.14 | 76.77 |
+| MU-Kindai/Japanese-MixCSE-BERT-base | 77.75 | 78.93 | 76.57 |
+| pkshatech/simcse-ja-bert-base-clcmlp | 76.81 | 73.11 | 80.51 |
+| MU-Kindai/Japanese-DiffCSE-BERT-base | 76.70 | 77.76 | 75.63 |
+| sentence-transformers/LaBSE | 76.56 | 77.01 | 76.12 |
+| oshizo/sbert-jsnli-luke-japanese-base-lite | 76.56 | 72.04 | 81.08 |
+| sentence-transformers/stsb-xlm-r-multilingual | 75.41 | 72.38 | 78.45 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 74.96 | 75.25 | 74.66 |
+| colorfulscoop/sbert-base-ja | 70.41 | 66.56 | 74.25 |
## Classification
-| Model | Avg. | amazon_counterfactual<br>(macro_f1) | amazon_review<br>(macro_f1) | massive_intent<br>(macro_f1) | massive_scenario<br>(macro_f1) |
-|:----------------------------------------------|:----------|:--------------------------------------|:------------------------------|:-------------------------------|:---------------------------------|
-| sbintuitions/sarashina-embedding-v1-1b | **78.37** | 79.10 | **61.48** | 82.26 | 90.65 |
-| OpenAI/text-embedding-3-large | 77.58 | 77.90 | 60.44 | 80.91 | **91.08** |
-| cl-nagoya/ruri-large | 77.43 | 80.81 | 56.80 | **82.56** | 89.56 |
-| pkshatech/GLuCoSE-base-ja | 76.82 | **82.44** | 58.07 | 78.85 | 87.94 |
-| oshizo/sbert-jsnli-luke-japanese-base-lite | 76.61 | 79.95 | 57.48 | 80.26 | 88.75 |
-| jinaai/jina-embeddings-v3 | 76.39 | 78.83 | 59.33 | 77.65 | 89.74 |
-| cl-nagoya/ruri-small | 76.22 | 79.92 | 55.61 | 81.49 | 87.88 |
-| cl-nagoya/ruri-base | 75.58 | 76.66 | 55.76 | 81.41 | 88.49 |
-| cl-nagoya/unsup-simcse-ja-large | 74.66 | 76.79 | 55.37 | 79.13 | 87.36 |
-| pkshatech/GLuCoSE-base-ja-v2 | 74.21 | 74.92 | 55.31 | 79.79 | 86.84 |
-| MU-Kindai/Japanese-DiffCSE-BERT-base | 73.77 | 78.10 | 51.56 | 78.79 | 86.63 |
-| cl-nagoya/sup-simcse-ja-large | 73.73 | 73.21 | 54.76 | 79.23 | 87.72 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 73.47 | 77.25 | 53.42 | 76.83 | 86.39 |
-| cl-nagoya/sup-simcse-ja-base | 73.47 | 72.34 | 54.41 | 79.52 | 87.60 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 73.30 | 76.20 | 51.52 | 78.95 | 86.54 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 73.13 | 76.36 | 52.75 | 76.88 | 86.51 |
-| cl-nagoya/unsup-simcse-ja-base | 73.07 | 73.30 | 53.93 | 79.07 | 85.97 |
-| OpenAI/text-embedding-3-small | 73.06 | 70.01 | 55.92 | 77.66 | 88.67 |
-| MU-Kindai/Japanese-MixCSE-BERT-base | 72.90 | 77.62 | 50.86 | 77.19 | 85.93 |
-| intfloat/multilingual-e5-large | 72.89 | 70.66 | 56.54 | 75.78 | 88.59 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 72.76 | 76.20 | 52.06 | 77.89 | 84.90 |
-| sentence-transformers/LaBSE | 72.66 | 73.61 | 51.70 | 76.99 | 88.35 |
-| pkshatech/RoSEtta-base-ja | 72.41 | 70.05 | 52.64 | 79.84 | 87.10 |
-| sentence-transformers/stsb-xlm-r-multilingual | 71.84 | 75.65 | 51.32 | 74.28 | 86.10 |
-| pkshatech/simcse-ja-bert-base-clcmlp | 71.30 | 67.49 | 50.85 | 79.67 | 87.20 |
-| OpenAI/text-embedding-ada-002 | 69.75 | 64.42 | 53.13 | 74.57 | 86.89 |
-| intfloat/multilingual-e5-base | 69.30 | 63.67 | 54.24 | 72.78 | 86.53 |
-| colorfulscoop/sbert-base-ja | 69.07 | 72.21 | 47.95 | 72.52 | 83.62 |
-| intfloat/multilingual-e5-small | 67.62 | 62.14 | 51.27 | 70.85 | 86.22 |
+| Model | Avg. | amazon_counterfactual<br>(macro_f1) | amazon_review<br>(macro_f1) | jpn_sentiment<br>(macro_f1) | massive_intent<br>(macro_f1) | massive_scenario<br>(macro_f1) | sib200_jpn_cls<br>(macro_f1) | wrime_classification<br>(macro_f1) |
+|:----------------------------------------------|:---------:|:-------------------------------------:|:-----------------------------:|:-----------------------------:|:------------------------------:|:--------------------------------:|:------------------------------:|:------------------------------------:|
+| cl-nagoya/ruri-v3-310m | **77.65** | 80.09 | 60.72 | 95.31 | 81.76 | 89.01 | 88.13 | 48.53 |
+| OpenAI/text-embedding-3-large | 77.27 | 77.90 | 60.44 | **96.89** | 80.91 | **91.08** | 87.85 | 45.84 |
+| sbintuitions/sarashina-embedding-v1-1b | 77.20 | 79.66 | **62.02** | 95.03 | 81.21 | 90.16 | 82.63 | 49.70 |
+| sbintuitions/sarashina-embedding-v2-1b | 77.14 | 79.81 | 61.39 | 93.51 | **83.69** | 90.23 | 81.48 | **49.87** |
+| cl-nagoya/ruri-large | 76.25 | 79.50 | 56.85 | 93.56 | 82.10 | 90.03 | 85.26 | 46.45 |
+| google/embeddinggemma-300m | 76.14 | 74.74 | 58.04 | 95.99 | 80.07 | 90.58 | 86.92 | 46.62 |
+| cl-nagoya/ruri-large-v2 | 76.10 | 79.51 | 57.09 | 93.57 | 80.87 | 89.71 | 84.72 | 47.23 |
+| cl-nagoya/ruri-v3-130m | 75.65 | 76.75 | 59.56 | 95.00 | 80.79 | 87.90 | 82.88 | 46.63 |
+| cl-nagoya/ruri-base-v2 | 75.59 | 75.97 | 55.55 | 92.36 | 80.93 | 88.87 | **89.26** | 46.17 |
+| cl-nagoya/ruri-base | 75.34 | 76.66 | 56.02 | 91.69 | 81.22 | 88.61 | 87.73 | 45.47 |
+| pkshatech/GLuCoSE-base-ja | 75.02 | **82.03** | 57.93 | 92.89 | 78.52 | 87.71 | 77.24 | 48.82 |
+| cl-nagoya/ruri-small | 74.83 | 80.55 | 55.41 | 88.86 | 81.08 | 88.00 | 83.97 | 45.95 |
+| oshizo/sbert-jsnli-luke-japanese-base-lite | 74.53 | 79.72 | 58.02 | 91.99 | 80.16 | 88.78 | 77.31 | 45.73 |
+| cl-nagoya/ruri-v3-70m | 74.45 | 81.81 | 57.98 | 93.39 | 78.92 | 87.83 | 76.87 | 44.38 |
+| cl-nagoya/ruri-small-v2 | 74.12 | 77.67 | 55.60 | 88.64 | 82.00 | 88.16 | 81.57 | 45.23 |
+| cl-nagoya/unsup-simcse-ja-large | 73.71 | 76.40 | 55.05 | 90.57 | 79.25 | 87.50 | 82.89 | 44.33 |
+| pkshatech/GLuCoSE-base-ja-v2 | 73.52 | 75.28 | 55.19 | 89.24 | 78.73 | 87.14 | 85.83 | 43.23 |
+| cl-nagoya/ruri-v3-30m | 73.35 | 75.60 | 55.71 | 92.63 | 78.31 | 86.72 | 81.40 | 43.11 |
+| cl-nagoya/sup-simcse-ja-large | 72.74 | 72.61 | 54.56 | 89.42 | 79.23 | 87.71 | 80.43 | 45.26 |
+| cl-nagoya/sup-simcse-ja-base | 72.72 | 71.93 | 54.54 | 91.01 | 80.11 | 87.63 | 81.92 | 41.88 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 72.59 | 77.67 | 53.48 | 89.28 | 76.79 | 86.24 | 83.77 | 40.89 |
+| OpenAI/text-embedding-3-small | 72.43 | 70.01 | 55.92 | 89.97 | 77.66 | 88.67 | 84.72 | 40.05 |
+| cl-nagoya/unsup-simcse-ja-base | 72.41 | 73.65 | 54.14 | 89.87 | 77.68 | 86.10 | 84.13 | 41.31 |
+| intfloat/multilingual-e5-large | 72.30 | 69.70 | 57.64 | 95.55 | 74.01 | 88.71 | 78.11 | 42.38 |
+| MU-Kindai/Japanese-DiffCSE-BERT-base | 72.06 | 77.70 | 51.46 | 88.45 | 78.72 | 86.40 | 83.50 | 38.15 |
+| sentence-transformers/LaBSE | 71.85 | 74.74 | 51.63 | 89.52 | 77.09 | 88.39 | 81.47 | 40.11 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 71.83 | 76.55 | 52.73 | 88.22 | 77.22 | 86.25 | 81.45 | 40.38 |
+| MU-Kindai/Japanese-MixCSE-BERT-base | 71.81 | 77.79 | 51.11 | 87.82 | 77.97 | 86.34 | 85.06 | 36.56 |
+| pkshatech/RoSEtta-base-ja | 71.70 | 70.21 | 52.62 | 87.28 | 79.59 | 86.96 | 84.01 | 41.24 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 71.48 | 74.30 | 51.97 | 89.69 | 77.83 | 84.60 | 83.82 | 38.15 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 71.46 | 76.40 | 51.65 | 87.86 | 78.15 | 86.44 | 81.80 | 37.93 |
+| sentence-transformers/stsb-xlm-r-multilingual | 71.40 | 75.14 | 51.67 | 87.15 | 74.34 | 86.07 | 83.73 | 41.68 |
+| pkshatech/simcse-ja-bert-base-clcmlp | 70.67 | 68.28 | 51.75 | 88.21 | 79.65 | 87.23 | 81.18 | 38.39 |
+| intfloat/multilingual-e5-base | 69.70 | 64.29 | 54.17 | 92.32 | 73.19 | 86.78 | 78.50 | 38.65 |
+| OpenAI/text-embedding-ada-002 | 69.39 | 64.42 | 53.13 | 88.76 | 74.57 | 86.89 | 80.39 | 37.57 |
+| colorfulscoop/sbert-base-ja | 68.05 | 70.80 | 47.80 | 83.50 | 72.89 | 83.71 | 82.63 | 35.06 |
+| intfloat/multilingual-e5-small | 67.77 | 58.66 | 51.21 | 87.73 | 71.34 | 86.77 | 81.78 | 36.91 |
+| hotchpotch/static-embedding-japanese | 66.73 | 68.06 | 46.81 | 79.82 | 74.79 | 82.18 | 83.33 | 32.12 |
## Reranking
-| Model | Avg. | esci<br>(ndcg@10) |
-|:----------------------------------------------|:----------|:--------------------|
-| sbintuitions/sarashina-embedding-v1-1b | **93.74** | **93.74** |
-| OpenAI/text-embedding-3-large | 93.58 | 93.58 |
-| OpenAI/text-embedding-ada-002 | 93.04 | 93.04 |
-| intfloat/multilingual-e5-small | 93.03 | 93.03 |
-| pkshatech/GLuCoSE-base-ja-v2 | 93.01 | 93.01 |
-| cl-nagoya/ruri-small | 93.00 | 93.00 |
-| cl-nagoya/ruri-large | 92.99 | 92.99 |
-| intfloat/multilingual-e5-large | 92.96 | 92.96 |
-| OpenAI/text-embedding-3-small | 92.92 | 92.92 |
-| cl-nagoya/ruri-base | 92.91 | 92.91 |
-| intfloat/multilingual-e5-base | 92.85 | 92.85 |
-| jinaai/jina-embeddings-v3 | 92.71 | 92.71 |
-| pkshatech/RoSEtta-base-ja | 92.69 | 92.69 |
-| pkshatech/GLuCoSE-base-ja | 91.90 | 91.90 |
-| cl-nagoya/sup-simcse-ja-base | 91.83 | 91.83 |
-| sentence-transformers/LaBSE | 91.63 | 91.63 |
-| oshizo/sbert-jsnli-luke-japanese-base-lite | 91.56 | 91.56 |
-| pkshatech/simcse-ja-bert-base-clcmlp | 91.49 | 91.49 |
-| cl-nagoya/sup-simcse-ja-large | 91.48 | 91.48 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 91.30 | 91.30 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 91.16 | 91.16 |
-| cl-nagoya/unsup-simcse-ja-base | 91.16 | 91.16 |
-| MU-Kindai/Japanese-MixCSE-BERT-base | 91.01 | 91.01 |
-| cl-nagoya/unsup-simcse-ja-large | 90.95 | 90.95 |
-| MU-Kindai/Japanese-DiffCSE-BERT-base | 90.95 | 90.95 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 90.95 | 90.95 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 90.66 | 90.66 |
-| sentence-transformers/stsb-xlm-r-multilingual | 90.20 | 90.20 |
-| colorfulscoop/sbert-base-ja | 89.97 | 89.97 |
+| Model | Avg. | esci<br>(ndcg@10) | jacwir_reranking<br>(ndcg@10) | jqara<br>(ndcg@10) | miracl_reranking<br>(ndcg@10) | mldr_reranking<br>(ndcg@10) |
+|:----------------------------------------------|:---------:|:-------------------:|:-------------------------------:|:--------------------:|:-------------------------------:|:-----------------------------:|
+| sbintuitions/sarashina-embedding-v2-1b | **86.28** | 93.58 | 88.79 | **70.55** | 85.93 | 92.53 |
+| cl-nagoya/ruri-v3-310m | 85.84 | 93.43 | 88.46 | 68.93 | 85.01 | 93.36 |
+| cl-nagoya/ruri-v3-130m | 85.71 | 93.37 | 88.65 | 66.30 | **86.59** | 93.62 |
+| sbintuitions/sarashina-embedding-v1-1b | 84.36 | **93.60** | 86.85 | 65.92 | 85.17 | 90.24 |
+| cl-nagoya/ruri-v3-70m | 84.21 | 93.20 | 87.48 | 63.09 | 85.03 | 92.26 |
+| cl-nagoya/ruri-large-v2 | 83.89 | 93.21 | 85.29 | 64.47 | 85.78 | 90.68 |
+| OpenAI/text-embedding-3-large | 83.06 | 93.58 | 86.78 | 56.89 | 83.80 | **94.24** |
+| intfloat/multilingual-e5-large | 83.01 | 93.31 | **90.37** | 56.14 | 86.31 | 88.91 |
+| cl-nagoya/ruri-v3-30m | 82.93 | 93.06 | 87.61 | 57.47 | 83.52 | 92.97 |
+| pkshatech/GLuCoSE-base-ja-v2 | 82.63 | 93.02 | 88.27 | 60.70 | 82.44 | 88.71 |
+| cl-nagoya/ruri-base-v2 | 82.46 | 93.17 | 85.76 | 60.66 | 84.26 | 88.47 |
+| cl-nagoya/ruri-small-v2 | 82.30 | 93.20 | 88.18 | 56.70 | 83.33 | 90.09 |
+| cl-nagoya/ruri-large | 81.26 | 92.99 | 86.61 | 59.59 | 80.23 | 86.91 |
+| pkshatech/RoSEtta-base-ja | 81.25 | 92.68 | 86.83 | 57.92 | 80.38 | 88.45 |
+| google/embeddinggemma-300m | 80.93 | 93.26 | 86.72 | 52.09 | 82.38 | 90.19 |
+| cl-nagoya/ruri-base | 80.31 | 92.92 | 87.24 | 54.15 | 79.22 | 88.01 |
+| intfloat/multilingual-e5-small | 80.09 | 92.98 | 89.99 | 49.28 | 81.78 | 86.41 |
+| cl-nagoya/ruri-small | 79.98 | 93.01 | 87.67 | 53.26 | 77.84 | 88.14 |
+| intfloat/multilingual-e5-base | 79.46 | 92.90 | 88.65 | 47.61 | 81.97 | 86.15 |
+| OpenAI/text-embedding-3-small | 77.29 | 92.92 | 84.72 | 38.58 | 77.61 | 92.61 |
+| hotchpotch/static-embedding-japanese | 77.09 | 91.87 | 80.96 | 47.06 | 72.01 | 93.55 |
+| OpenAI/text-embedding-ada-002 | 75.63 | 93.04 | 83.91 | 37.54 | 72.83 | 90.83 |
+| pkshatech/GLuCoSE-base-ja | 72.37 | 91.82 | 74.54 | 30.24 | 77.82 | 87.42 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 71.88 | 91.30 | 65.14 | 44.96 | 71.21 | 86.79 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 70.59 | 90.93 | 61.45 | 42.47 | 70.65 | 87.42 |
+| cl-nagoya/sup-simcse-ja-base | 70.36 | 91.84 | 64.27 | 37.48 | 70.88 | 87.34 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 69.92 | 91.17 | 65.41 | 38.39 | 70.19 | 84.42 |
+| oshizo/sbert-jsnli-luke-japanese-base-lite | 69.81 | 91.51 | 67.45 | 36.04 | 68.68 | 85.38 |
+| cl-nagoya/sup-simcse-ja-large | 68.76 | 91.50 | 56.15 | 38.30 | 71.26 | 86.60 |
+| MU-Kindai/Japanese-MixCSE-BERT-base | 68.58 | 90.92 | 60.51 | 36.84 | 69.31 | 85.31 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 68.15 | 90.67 | 58.16 | 36.66 | 69.09 | 86.15 |
+| pkshatech/simcse-ja-bert-base-clcmlp | 68.02 | 91.27 | 57.45 | 31.74 | 72.12 | 87.50 |
+| MU-Kindai/Japanese-DiffCSE-BERT-base | 67.73 | 90.95 | 59.81 | 37.20 | 67.90 | 82.81 |
+| cl-nagoya/unsup-simcse-ja-large | 67.39 | 90.95 | 54.17 | 38.78 | 70.02 | 83.04 |
+| sentence-transformers/LaBSE | 67.01 | 91.47 | 67.85 | 24.62 | 69.28 | 81.84 |
+| cl-nagoya/unsup-simcse-ja-base | 66.20 | 91.18 | 51.54 | 32.19 | 69.96 | 86.12 |
+| colorfulscoop/sbert-base-ja | 59.38 | 89.97 | 37.15 | 22.21 | 65.03 | 82.55 |
+| sentence-transformers/stsb-xlm-r-multilingual | 57.93 | 89.72 | 39.21 | 18.51 | 65.36 | 76.88 |
## Clustering
-| Model | Avg. | livedoor_news<br>(v_measure_score) | mewsc16<br>(v_measure_score) |
-|:----------------------------------------------|:----------|:-------------------------------------|:-------------------------------|
-| cl-nagoya/ruri-base | **54.16** | 54.27 | **54.04** |
-| sbintuitions/sarashina-embedding-v1-1b | 53.86 | 56.42 | 51.29 |
-| OpenAI/text-embedding-3-large | 53.32 | 57.09 | 49.55 |
-| pkshatech/RoSEtta-base-ja | 53.23 | **58.62** | 47.85 |
-| cl-nagoya/ruri-large | 51.82 | 51.39 | 52.25 |
-| cl-nagoya/sup-simcse-ja-base | 51.79 | 52.67 | 50.91 |
-| jinaai/jina-embeddings-v3 | 51.46 | 54.72 | 48.19 |
-| intfloat/multilingual-e5-large | 51.24 | 57.13 | 45.34 |
-| cl-nagoya/ruri-small | 51.19 | 50.96 | 51.41 |
-| OpenAI/text-embedding-3-small | 51.06 | 54.57 | 47.55 |
-| cl-nagoya/sup-simcse-ja-large | 50.56 | 50.75 | 50.38 |
-| oshizo/sbert-jsnli-luke-japanese-base-lite | 50.33 | 46.77 | 53.89 |
-| pkshatech/GLuCoSE-base-ja | 49.78 | 49.89 | 49.68 |
-| pkshatech/GLuCoSE-base-ja-v2 | 48.65 | 51.52 | 45.78 |
-| cl-nagoya/unsup-simcse-ja-large | 48.41 | 50.90 | 45.92 |
-| OpenAI/text-embedding-ada-002 | 48.30 | 49.67 | 46.92 |
-| intfloat/multilingual-e5-base | 48.26 | 55.03 | 41.49 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 48.25 | 53.20 | 43.31 |
-| pkshatech/simcse-ja-bert-base-clcmlp | 47.53 | 44.77 | 50.30 |
-| intfloat/multilingual-e5-small | 46.91 | 54.70 | 39.12 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 46.68 | 53.02 | 40.35 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 45.81 | 48.45 | 43.17 |
-| MU-Kindai/Japanese-MixCSE-BERT-base | 44.95 | 52.62 | 37.28 |
-| sentence-transformers/LaBSE | 44.88 | 48.29 | 41.47 |
-| colorfulscoop/sbert-base-ja | 44.81 | 42.99 | 46.64 |
-| cl-nagoya/unsup-simcse-ja-base | 44.77 | 52.23 | 37.31 |
-| MU-Kindai/Japanese-DiffCSE-BERT-base | 44.22 | 49.67 | 38.77 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 43.11 | 41.04 | 45.18 |
-| sentence-transformers/stsb-xlm-r-multilingual | 27.46 | 24.49 | 30.43 |
-
-## PairClassification
-| Model | Avg. | paws_x_ja<br>(binary_f1) |
-|:----------------------------------------------|:----------|:---------------------------|
-| pkshatech/GLuCoSE-base-ja | **66.39** | **66.39** |
-| cl-nagoya/sup-simcse-ja-base | 62.57 | 62.57 |
-| cl-nagoya/sup-simcse-ja-large | 62.51 | 62.51 |
-| cl-nagoya/unsup-simcse-ja-large | 62.49 | 62.49 |
-| cl-nagoya/unsup-simcse-ja-base | 62.44 | 62.44 |
-| pkshatech/simcse-ja-bert-base-clcmlp | 62.40 | 62.40 |
-| OpenAI/text-embedding-ada-002 | 62.40 | 62.40 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 62.38 | 62.38 |
-| cl-nagoya/ruri-base | 62.38 | 62.38 |
-| oshizo/sbert-jsnli-luke-japanese-base-lite | 62.38 | 62.38 |
-| MU-Kindai/Japanese-DiffCSE-BERT-base | 62.38 | 62.38 |
-| jinaai/jina-embeddings-v3 | 62.37 | 62.37 |
-| pkshatech/GLuCoSE-base-ja-v2 | 62.37 | 62.37 |
-| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 62.37 | 62.37 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 62.35 | 62.35 |
-| OpenAI/text-embedding-3-large | 62.35 | 62.35 |
-| MU-Kindai/Japanese-MixCSE-BERT-base | 62.33 | 62.33 |
-| sentence-transformers/LaBSE | 62.33 | 62.33 |
-| colorfulscoop/sbert-base-ja | 62.31 | 62.31 |
-| cl-nagoya/ruri-large | 62.29 | 62.29 |
-| OpenAI/text-embedding-3-small | 62.27 | 62.27 |
-| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 62.27 | 62.27 |
-| intfloat/multilingual-e5-base | 62.26 | 62.26 |
-| sentence-transformers/stsb-xlm-r-multilingual | 62.20 | 62.20 |
-| intfloat/multilingual-e5-small | 62.19 | 62.19 |
-| intfloat/multilingual-e5-large | 62.15 | 62.15 |
-| cl-nagoya/ruri-small | 62.11 | 62.11 |
-| sbintuitions/sarashina-embedding-v1-1b | 62.00 | 62.00 |
-| pkshatech/RoSEtta-base-ja | 61.74 | 61.74 |
+| Model | Avg. | livedoor_news<br>(v_measure_score) | mewsc16<br>(v_measure_score) | sib200_jpn_clust<br>(v_measure_score) |
+|:----------------------------------------------|:---------:|:------------------------------------:|:------------------------------:|:---------------------------------------:|
+| cl-nagoya/sup-simcse-ja-base | **52.57** | 55.11 | **53.39** | 49.21 |
+| sbintuitions/sarashina-embedding-v2-1b | 52.56 | 57.41 | 51.67 | 48.59 |
+| OpenAI/text-embedding-3-large | 51.82 | 57.09 | 49.55 | 48.83 |
+| cl-nagoya/ruri-v3-130m | 51.13 | 54.36 | 48.84 | 50.20 |
+| cl-nagoya/ruri-large-v2 | 50.88 | 55.62 | 50.97 | 46.06 |
+| intfloat/multilingual-e5-large | 50.58 | 51.58 | 46.81 | **53.35** |
+| cl-nagoya/ruri-v3-310m | 50.52 | **58.56** | 48.60 | 44.41 |
+| sbintuitions/sarashina-embedding-v1-1b | 50.30 | 56.03 | 50.69 | 44.19 |
+| cl-nagoya/sup-simcse-ja-large | 50.12 | 53.38 | 51.26 | 45.74 |
+| intfloat/multilingual-e5-base | 50.12 | 53.79 | 49.44 | 47.13 |
+| cl-nagoya/ruri-small-v2 | 49.97 | 52.61 | 49.47 | 47.82 |
+| cl-nagoya/ruri-v3-70m | 49.95 | 54.92 | 47.74 | 47.20 |
+| cl-nagoya/ruri-large | 49.93 | 54.44 | 50.59 | 44.76 |
+| cl-nagoya/ruri-v3-30m | 49.90 | 53.69 | 47.96 | 48.04 |
+| cl-nagoya/ruri-small | 49.59 | 52.90 | 49.37 | 46.51 |
+| google/embeddinggemma-300m | 49.48 | 55.33 | 50.55 | 42.55 |
+| pkshatech/simcse-ja-bert-base-clcmlp | 49.45 | 49.11 | 47.02 | 52.21 |
+| intfloat/multilingual-e5-small | 49.29 | 51.94 | 52.34 | 43.59 |
+| cl-nagoya/ruri-base | 49.10 | 56.69 | 52.05 | 38.55 |
+| OpenAI/text-embedding-3-small | 48.91 | 54.57 | 47.55 | 44.59 |
+| OpenAI/text-embedding-ada-002 | 48.78 | 49.67 | 46.92 | 49.74 |
+| oshizo/sbert-jsnli-luke-japanese-base-lite | 48.75 | 51.70 | 51.52 | 43.03 |
+| pkshatech/GLuCoSE-base-ja-v2 | 48.19 | 54.46 | 46.12 | 43.98 |
+| pkshatech/GLuCoSE-base-ja | 47.12 | 50.41 | 49.52 | 41.43 |
+| cl-nagoya/ruri-base-v2 | 46.84 | 54.38 | 50.61 | 35.53 |
+| pkshatech/RoSEtta-base-ja | 44.88 | 48.89 | 45.16 | 40.61 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 44.54 | 51.30 | 46.27 | 36.04 |
+| cl-nagoya/unsup-simcse-ja-large | 43.52 | 51.48 | 44.44 | 34.65 |
+| MU-Kindai/Japanese-MixCSE-BERT-base | 43.45 | 48.56 | 43.20 | 38.60 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 42.86 | 45.84 | 44.08 | 38.67 |
+| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 42.02 | 51.59 | 42.68 | 31.78 |
+| cl-nagoya/unsup-simcse-ja-base | 41.29 | 50.65 | 39.58 | 33.63 |
+| MU-Kindai/Japanese-DiffCSE-BERT-base | 39.93 | 46.01 | 39.22 | 34.56 |
+| sentence-transformers/LaBSE | 39.82 | 49.08 | 41.78 | 28.59 |
+| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 39.27 | 48.79 | 42.61 | 26.42 |
+| colorfulscoop/sbert-base-ja | 39.04 | 40.60 | 46.18 | 30.36 |
+| hotchpotch/static-embedding-japanese | 35.91 | 51.44 | 34.81 | 21.47 |
+| sentence-transformers/stsb-xlm-r-multilingual | 27.67 | 26.62 | 32.05 | 24.34 |
diff --git a/make_leaderboard.py b/make_leaderboard.py
index 0e43ccf..5d472eb 100644
--- a/make_leaderboard.py
+++ b/make_leaderboard.py
@@ -9,14 +9,44 @@
"amazon_review_classification": "amazon_review",
"massive_intent_classification": "massive_intent",
"massive_scenario_classification": "massive_scenario",
+ "japanese_sentiment_classification": "jpn_sentiment",
+ "sib200_japanese_classification": "sib200_jpn_cls",
+ "sib200_japanese_clustering": "sib200_jpn_clust",
+ "nlp_journal_abs_article": "nlp_abs_article",
+ "nlp_journal_abs_intro": "nlp_abs_intro",
+ "nlp_journal_title_abs": "nlp_title_abs",
+ "nlp_journal_title_intro": "nlp_title_intro",
}
-TASK_ORDER = ["Retrieval", "STS", "Classification", "Reranking", "Clustering", "PairClassification"]
+TASK_ORDER = ["Retrieval", "STS", "Classification", "Reranking", "Clustering"]
SUMMARY_KEY = "Summary"
"""
Collects the results from the results folder.
"""
+# Load reference structure from sbintuitions/sarashina-embedding-v1-1b/summary.json
+reference_file = Path("docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json")
+with open(reference_file) as f:
+ reference_structure = json.load(f)
+
+# Extract the expected structure
+expected_structure = {}
+for task_name, task_results in reference_structure.items():
+ expected_structure[task_name] = set(task_results.keys())
+
+
+def has_same_structure(summary: dict, expected: dict) -> bool:
+ """Check if summary has exactly the same structure as expected."""
+ if set(summary.keys()) != set(expected.keys()):
+ return False
+
+ for task_name, datasets in expected.items():
+ if set(summary[task_name].keys()) != datasets:
+ return False
+
+ return True
+
+
# {task_name: {model_signature: {(dataset_name, metric_name): score}}}
all_results: dict[str, dict[str, dict[str, float]]] = defaultdict(lambda: defaultdict(dict))
for summary_file in Path("docs/results").rglob("summary.json"):
@@ -26,6 +56,13 @@
with open(summary_file) as f:
summary = json.load(f)
+ # Skip models that don't have the same structure as reference
+ if not has_same_structure(summary, expected_structure):
+ org_name = summary_file.parent.parent.name
+ model_name = summary_file.parent.name
+ print(f"Skipping {org_name}/{model_name}: different structure")
+ continue
+
org_name = summary_file.parent.parent.name
model_name = summary_file.parent.name
model_signature = f"{org_name}/{model_name}"
@@ -56,17 +93,24 @@ def format_score(score: float) -> str:
# format to markdown table
dataset_keys = list(task_results[next(iter(task_results))].keys())
if task_name == SUMMARY_KEY:
- dataset_keys = TASK_ORDER
+ # Only include existing tasks in the summary
+ dataset_keys = [task for task in TASK_ORDER if task in all_results]
header = ["Model", AVG_COLUMN_NAME, *dataset_keys]
table_list: list[list[str | float]] = []
for model_signature, dataset_scores in task_results.items():
+ # Skip models that don't have all required datasets
+ if not all(k in dataset_scores for k in dataset_keys):
+ continue
+
model_scores = [dataset_scores[k] for k in dataset_keys]
if task_name == SUMMARY_KEY:
scores_by_dataset = []
for _task_name, _task_results in all_results.items():
- if _task_name != SUMMARY_KEY:
+ if _task_name != SUMMARY_KEY and model_signature in _task_results:
scores_by_dataset.extend(list(_task_results[model_signature].values()))
+ if not scores_by_dataset: # Skip if no scores available
+ continue
average_score = sum(scores_by_dataset) / len(scores_by_dataset)
else:
average_score = sum(model_scores) / len(model_scores)
@@ -88,7 +132,9 @@ def format_score(score: float) -> str:
# add header
table_list.insert(0, ["Model", AVG_COLUMN_NAME, *dataset_keys])
- markdown_table = tabulate(table_list, headers="firstrow", tablefmt="pipe")
+ # Set alignment: left for model names, center for all numeric columns
+ col_alignment = ["left"] + ["center"] * (len(dataset_keys) + 1)
+ markdown_table = tabulate(table_list, headers="firstrow", tablefmt="pipe", colalign=col_alignment)
markdown_tables[task_name] = markdown_table
"""
@@ -100,6 +146,8 @@ def format_score(score: float) -> str:
"This leaderboard shows the results stored under `docs/results`. The scores are all multiplied by 100.\n\n"
)
for task_name in [SUMMARY_KEY, *TASK_ORDER]:
+ if task_name not in markdown_tables:
+ continue
markdown_table = markdown_tables[task_name]
f.write(f"## {task_name}\n")
diff --git a/src/jmteb/__main__.py b/src/jmteb/__main__.py
index ff10884..2dc9478 100644
--- a/src/jmteb/__main__.py
+++ b/src/jmteb/__main__.py
@@ -119,6 +119,7 @@ def main(
)
if args.log_predictions:
+ logger.info("Prediction logging activated.")
for k, v in args.evaluators.items():
if hasattr(v, "log_predictions"):
args.evaluators[k].log_predictions = True
diff --git a/src/jmteb/configs/jmteb.jsonnet b/src/jmteb/configs/jmteb.jsonnet
index 66fd2dc..b27d021 100644
--- a/src/jmteb/configs/jmteb.jsonnet
+++ b/src/jmteb/configs/jmteb.jsonnet
@@ -3,14 +3,16 @@
(import './tasks/amazon_counterfactual_classification.jsonnet') +
(import './tasks/massive_intent_classification.jsonnet') +
(import './tasks/massive_scenario_classification.jsonnet') +
+(import './tasks/japanese_sentiment_classification.jsonnet') +
+(import './tasks/sib200_japanese_classification.jsonnet') +
+(import './tasks/wrime_classification.jsonnet') +
// Clustering
(import './tasks/livedoor_news.jsonnet') +
(import './tasks/mewsc16.jsonnet') +
+(import './tasks/sib200_japanese_clustering.jsonnet') +
// STS
(import './tasks/jsts.jsonnet') +
(import './tasks/jsick.jsonnet') +
-// Pair Classification
-(import './tasks/paws_x_ja.jsonnet') +
// Retrieval
(import './tasks/jagovfaqs_22k.jsonnet') +
(import './tasks/mrtydi.jsonnet') +
@@ -18,5 +20,14 @@
(import './tasks/nlp_journal_title_abs.jsonnet') +
(import './tasks/nlp_journal_title_intro.jsonnet') +
(import './tasks/nlp_journal_abs_intro.jsonnet') +
+(import './tasks/nlp_journal_abs_article.jsonnet') +
+(import './tasks/jacwir_retrieval.jsonnet') +
+(import './tasks/miracl_retrieval.jsonnet') +
+(import './tasks/mldr_retrieval.jsonnet') +
+(import './tasks/mintaka_retrieval.jsonnet') +
// Reranking
-(import './tasks/esci.jsonnet')
\ No newline at end of file
+(import './tasks/esci.jsonnet') +
+(import './tasks/jqara.jsonnet') +
+(import './tasks/jacwir_reranking.jsonnet') +
+(import './tasks/miracl_reranking.jsonnet') +
+(import './tasks/mldr_reranking.jsonnet')
\ No newline at end of file
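To check that the task list composes as intended, the merged config above can be evaluated with the jsonnet Python bindings. A small sketch, assuming the `jsonnet` PyPI package (imported as `_jsonnet`) is installed:

```python
import json

import _jsonnet  # provided by the `jsonnet` PyPI package

# Object-level `+` in jmteb.jsonnet merges every per-task file into one dict:
# each top-level key is a task name mapped to its evaluator spec.
config = json.loads(_jsonnet.evaluate_file("src/jmteb/configs/jmteb.jsonnet"))
for task_name, spec in config.items():
    print(task_name, spec["class_path"])
```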
diff --git a/src/jmteb/configs/tasks/jacwir_reranking.jsonnet b/src/jmteb/configs/tasks/jacwir_reranking.jsonnet
new file mode 100644
index 0000000..eb41d67
--- /dev/null
+++ b/src/jmteb/configs/tasks/jacwir_reranking.jsonnet
@@ -0,0 +1,31 @@
+{
+ jacwir_reranking: {
+ class_path: 'RerankingEvaluator',
+ init_args: {
+ val_query_dataset: {
+ class_path: 'HfRerankingQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'jacwir-reranking-query',
+ },
+ },
+ test_query_dataset: {
+ class_path: 'HfRerankingQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'jacwir-reranking-query',
+ },
+ },
+ doc_dataset: {
+ class_path: 'HfRerankingDocDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'corpus',
+ name: 'jacwir-reranking-corpus',
+ },
+ },
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/jacwir_retrieval.jsonnet b/src/jmteb/configs/tasks/jacwir_retrieval.jsonnet
new file mode 100644
index 0000000..8cdb416
--- /dev/null
+++ b/src/jmteb/configs/tasks/jacwir_retrieval.jsonnet
@@ -0,0 +1,32 @@
+{
+ jacwir_retrieval: {
+ class_path: 'RetrievalEvaluator',
+ init_args: {
+ val_query_dataset: {
+ class_path: 'HfRetrievalQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'jacwir-retrieval-query',
+ },
+ },
+ test_query_dataset: {
+ class_path: 'HfRetrievalQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'jacwir-retrieval-query',
+ },
+ },
+ doc_dataset: {
+ class_path: 'HfRetrievalDocDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'corpus',
+ name: 'jacwir-retrieval-corpus',
+ },
+ },
+ "doc_chunk_size":10000
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/japanese_sentiment_classification.jsonnet b/src/jmteb/configs/tasks/japanese_sentiment_classification.jsonnet
new file mode 100644
index 0000000..f5a847c
--- /dev/null
+++ b/src/jmteb/configs/tasks/japanese_sentiment_classification.jsonnet
@@ -0,0 +1,31 @@
+{
+ japanese_sentiment_classification: {
+ class_path: 'ClassificationEvaluator',
+ init_args: {
+ train_dataset: {
+ class_path: 'HfClassificationDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'train',
+ name: 'japanese_sentiment_classification',
+ },
+ },
+ val_dataset: {
+ class_path: 'HfClassificationDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'japanese_sentiment_classification',
+ },
+ },
+ test_dataset: {
+ class_path: 'HfClassificationDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'japanese_sentiment_classification',
+ },
+ },
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/jqara.jsonnet b/src/jmteb/configs/tasks/jqara.jsonnet
new file mode 100644
index 0000000..1c0ba64
--- /dev/null
+++ b/src/jmteb/configs/tasks/jqara.jsonnet
@@ -0,0 +1,31 @@
+{
+ jqara: {
+ class_path: 'RerankingEvaluator',
+ init_args: {
+ val_query_dataset: {
+ class_path: 'HfRerankingQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'jqara-query',
+ },
+ },
+ test_query_dataset: {
+ class_path: 'HfRerankingQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'jqara-query',
+ },
+ },
+ doc_dataset: {
+ class_path: 'HfRerankingDocDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'corpus',
+ name: 'jqara-corpus',
+ },
+ },
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/mintaka_retrieval.jsonnet b/src/jmteb/configs/tasks/mintaka_retrieval.jsonnet
new file mode 100644
index 0000000..6b17949
--- /dev/null
+++ b/src/jmteb/configs/tasks/mintaka_retrieval.jsonnet
@@ -0,0 +1,32 @@
+{
+ mintaka_retrieval: {
+ class_path: 'RetrievalEvaluator',
+ init_args: {
+ val_query_dataset: {
+ class_path: 'HfRetrievalQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'mintaka-retrieval-query',
+ },
+ },
+ test_query_dataset: {
+ class_path: 'HfRetrievalQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'mintaka-retrieval-query',
+ },
+ },
+ doc_dataset: {
+ class_path: 'HfRetrievalDocDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'corpus',
+ name: 'mintaka-retrieval-corpus',
+ },
+ },
+ "doc_chunk_size":10000
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/miracl_reranking.jsonnet b/src/jmteb/configs/tasks/miracl_reranking.jsonnet
new file mode 100644
index 0000000..b91a341
--- /dev/null
+++ b/src/jmteb/configs/tasks/miracl_reranking.jsonnet
@@ -0,0 +1,31 @@
+{
+ miracl_reranking: {
+ class_path: 'RerankingEvaluator',
+ init_args: {
+ val_query_dataset: {
+ class_path: 'HfRerankingQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'miracl-reranking-query',
+ },
+ },
+ test_query_dataset: {
+ class_path: 'HfRerankingQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'miracl-reranking-query',
+ },
+ },
+ doc_dataset: {
+ class_path: 'HfRerankingDocDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'corpus',
+ name: 'miracl-reranking-corpus',
+ },
+ },
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/miracl_retrieval.jsonnet b/src/jmteb/configs/tasks/miracl_retrieval.jsonnet
new file mode 100644
index 0000000..9b73f4f
--- /dev/null
+++ b/src/jmteb/configs/tasks/miracl_retrieval.jsonnet
@@ -0,0 +1,32 @@
+{
+ miracl_retrieval: {
+ class_path: 'RetrievalEvaluator',
+ init_args: {
+ val_query_dataset: {
+ class_path: 'HfRetrievalQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'miracl-retrieval-query',
+ },
+ },
+ test_query_dataset: {
+ class_path: 'HfRetrievalQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'miracl-retrieval-query',
+ },
+ },
+ doc_dataset: {
+ class_path: 'HfRetrievalDocDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'corpus',
+ name: 'miracl-retrieval-corpus',
+ },
+ },
+ "doc_chunk_size":10000
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/mldr_reranking.jsonnet b/src/jmteb/configs/tasks/mldr_reranking.jsonnet
new file mode 100644
index 0000000..1cbc025
--- /dev/null
+++ b/src/jmteb/configs/tasks/mldr_reranking.jsonnet
@@ -0,0 +1,31 @@
+{
+ mldr_reranking: {
+ class_path: 'RerankingEvaluator',
+ init_args: {
+ val_query_dataset: {
+ class_path: 'HfRerankingQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'mldr-reranking-query',
+ },
+ },
+ test_query_dataset: {
+ class_path: 'HfRerankingQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'mldr-reranking-query',
+ },
+ },
+ doc_dataset: {
+ class_path: 'HfRerankingDocDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'corpus',
+ name: 'mldr-reranking-corpus',
+ },
+ },
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/mldr_retrieval.jsonnet b/src/jmteb/configs/tasks/mldr_retrieval.jsonnet
new file mode 100644
index 0000000..71c0bee
--- /dev/null
+++ b/src/jmteb/configs/tasks/mldr_retrieval.jsonnet
@@ -0,0 +1,32 @@
+{
+ mldr_retrieval: {
+ class_path: 'RetrievalEvaluator',
+ init_args: {
+ val_query_dataset: {
+ class_path: 'HfRetrievalQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'mldr-retrieval-query',
+ },
+ },
+ test_query_dataset: {
+ class_path: 'HfRetrievalQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'mldr-retrieval-query',
+ },
+ },
+ doc_dataset: {
+ class_path: 'HfRetrievalDocDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'corpus',
+ name: 'mldr-retrieval-corpus',
+ },
+ },
+ "doc_chunk_size":10000
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/nlp_journal_abs_article.jsonnet b/src/jmteb/configs/tasks/nlp_journal_abs_article.jsonnet
new file mode 100644
index 0000000..f2c175f
--- /dev/null
+++ b/src/jmteb/configs/tasks/nlp_journal_abs_article.jsonnet
@@ -0,0 +1,31 @@
+{
+ nlp_journal_abs_article: {
+ class_path: 'RetrievalEvaluator',
+ init_args: {
+ val_query_dataset: {
+ class_path: 'HfRetrievalQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'nlp_journal_abs_article-query',
+ },
+ },
+ test_query_dataset: {
+ class_path: 'HfRetrievalQueryDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'nlp_journal_abs_article-query',
+ },
+ },
+ doc_dataset: {
+ class_path: 'HfRetrievalDocDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'corpus',
+ name: 'nlp_journal_abs_article-corpus',
+ },
+ },
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/sib200_japanese_classification.jsonnet b/src/jmteb/configs/tasks/sib200_japanese_classification.jsonnet
new file mode 100644
index 0000000..852505f
--- /dev/null
+++ b/src/jmteb/configs/tasks/sib200_japanese_classification.jsonnet
@@ -0,0 +1,31 @@
+{
+ sib200_japanese_classification: {
+ class_path: 'ClassificationEvaluator',
+ init_args: {
+ train_dataset: {
+ class_path: 'HfClassificationDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'train',
+ name: 'sib200_japanese_classification',
+ },
+ },
+ val_dataset: {
+ class_path: 'HfClassificationDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'sib200_japanese_classification',
+ },
+ },
+ test_dataset: {
+ class_path: 'HfClassificationDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'sib200_japanese_classification',
+ },
+ },
+ },
+ },
+}
diff --git a/src/jmteb/configs/tasks/paws_x_ja.jsonnet b/src/jmteb/configs/tasks/sib200_japanese_clustering.jsonnet
similarity index 53%
rename from src/jmteb/configs/tasks/paws_x_ja.jsonnet
rename to src/jmteb/configs/tasks/sib200_japanese_clustering.jsonnet
index ee57b72..762d34a 100644
--- a/src/jmteb/configs/tasks/paws_x_ja.jsonnet
+++ b/src/jmteb/configs/tasks/sib200_japanese_clustering.jsonnet
@@ -1,21 +1,21 @@
{
- paws_x_ja: {
- class_path: 'PairClassificationEvaluator',
+ sib200_japanese_clustering: {
+ class_path: 'ClusteringEvaluator',
init_args: {
val_dataset: {
- class_path: 'HfPairClassificationDataset',
+ class_path: 'HfClusteringDataset',
init_args: {
path: 'sbintuitions/JMTEB',
split: 'validation',
- name: 'paws_x_ja',
+ name: 'sib200_japanese_clustering',
},
},
test_dataset: {
- class_path: 'HfPairClassificationDataset',
+ class_path: 'HfClusteringDataset',
init_args: {
path: 'sbintuitions/JMTEB',
split: 'test',
- name: 'paws_x_ja',
+ name: 'sib200_japanese_clustering',
},
},
},
diff --git a/src/jmteb/configs/tasks/wrime_classification.jsonnet b/src/jmteb/configs/tasks/wrime_classification.jsonnet
new file mode 100644
index 0000000..7fb68b7
--- /dev/null
+++ b/src/jmteb/configs/tasks/wrime_classification.jsonnet
@@ -0,0 +1,31 @@
+{
+ wrime_classification: {
+ class_path: 'ClassificationEvaluator',
+ init_args: {
+ train_dataset: {
+ class_path: 'HfClassificationDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'train',
+ name: 'wrime_classification',
+ },
+ },
+ val_dataset: {
+ class_path: 'HfClassificationDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'validation',
+ name: 'wrime_classification',
+ },
+ },
+ test_dataset: {
+ class_path: 'HfClassificationDataset',
+ init_args: {
+ path: 'sbintuitions/JMTEB',
+ split: 'test',
+ name: 'wrime_classification',
+ },
+ },
+ },
+ },
+}
diff --git a/src/jmteb/embedders/__init__.py b/src/jmteb/embedders/__init__.py
index f28f038..5a3e19c 100644
--- a/src/jmteb/embedders/__init__.py
+++ b/src/jmteb/embedders/__init__.py
@@ -2,6 +2,8 @@
from jmteb.embedders.data_parallel_sbert_embedder import (
DataParallelSentenceBertEmbedder,
)
+from jmteb.embedders.gemma_embedder import GemmaEmbedder
from jmteb.embedders.openai_embedder import OpenAIEmbedder
+from jmteb.embedders.plamo_embedder import PlamoEmbedder
from jmteb.embedders.sbert_embedder import SentenceBertEmbedder
from jmteb.embedders.transformers_embedder import TransformersEmbedder
diff --git a/src/jmteb/embedders/base.py b/src/jmteb/embedders/base.py
index ea078f1..42a5d54 100644
--- a/src/jmteb/embedders/base.py
+++ b/src/jmteb/embedders/base.py
@@ -144,3 +144,17 @@ def set_output_tensor(self):
def set_output_numpy(self):
self.convert_to_numpy = True
self.convert_to_tensor = False
+
+ def set_max_seq_length(self, max_seq_length: int | None = None) -> None:
+ if hasattr(self, "max_seq_length"):
+ self.max_seq_length = max_seq_length
+ else:
+ logger.warning("Embedder doesn't have a `max_seq_length` attribute!")
+
+ def reset_max_seq_length(self):
+ orig_max_seq_length = getattr(self, "_orig_max_length", None)
+ if not orig_max_seq_length:
+ logger.warning("Failed to reset `max_seq_length`!")
+ else:
+ logger.info(f"Set `max_seq_length` to model default: {orig_max_seq_length}")
+ self.max_seq_length = orig_max_seq_length
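To illustrate how these two hooks are meant to pair up, here is a toy sketch (not part of the patch). It assumes that overriding encode and get_output_dim is enough to instantiate a TextEmbedder subclass, and that concrete embedders record the model default in _orig_max_length, as the other files in this diff do.

```python
from jmteb.embedders.base import TextEmbedder


class ToyEmbedder(TextEmbedder):
    """Illustration only: mimics how the concrete embedders track sequence length."""

    def __init__(self) -> None:
        self._orig_max_length = 512  # model default, recorded at construction time
        self.max_seq_length = 512

    def encode(self, text, prefix=None):
        raise NotImplementedError  # not needed for this illustration

    def get_output_dim(self) -> int:
        return 768


emb = ToyEmbedder()
emb.set_max_seq_length(8192)   # widen the window, e.g. for a long-document task
assert emb.max_seq_length == 8192
emb.reset_max_seq_length()     # falls back to the recorded _orig_max_length
assert emb.max_seq_length == 512
```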
diff --git a/src/jmteb/embedders/data_parallel_sbert_embedder.py b/src/jmteb/embedders/data_parallel_sbert_embedder.py
index 7416fe4..5d932e2 100644
--- a/src/jmteb/embedders/data_parallel_sbert_embedder.py
+++ b/src/jmteb/embedders/data_parallel_sbert_embedder.py
@@ -201,6 +201,7 @@ def __init__(
)
self.dp_model = DPSentenceTransformer(sbert_model=model)
self.model = self.dp_model.sbert
+ self._orig_max_length = self.model.max_seq_length
if max_seq_length:
self.model.max_seq_length = max_seq_length
self.initital_batch_size = batch_size
@@ -258,3 +259,7 @@ def _add_eos_func(self, text: str | list[str]) -> str | list[str]:
def get_output_dim(self) -> int:
return self.model.get_sentence_embedding_dimension()
+
+ def reset_max_seq_length(self):
+ logger.info(f"Reset `max_seq_length` to {self._orig_max_length}")
+ self.model.max_seq_length = self._orig_max_length
diff --git a/src/jmteb/embedders/gemma_embedder.py b/src/jmteb/embedders/gemma_embedder.py
new file mode 100644
index 0000000..5949845
--- /dev/null
+++ b/src/jmteb/embedders/gemma_embedder.py
@@ -0,0 +1,219 @@
+from __future__ import annotations
+
+import numpy as np
+import torch
+from loguru import logger
+from sentence_transformers import SentenceTransformer
+
+from jmteb.embedders.base import TextEmbedder
+
+
+class GemmaEmbedder(TextEmbedder):
+ """
+ Google EmbeddingGemma model embedder using SentenceTransformers.
+
+ This class supports the EmbeddingGemma models from Google (e.g., embeddinggemma-300m).
+    It loads the model via SentenceTransformers and uses its specialized encode_query and
+    encode_document methods for asymmetric query/document encoding when they are available.
+ """
+
+ def __init__(
+ self,
+ model_name_or_path: str = "google/embeddinggemma-300m",
+ batch_size: int = 32,
+ device: str | None = None,
+ normalize_embeddings: bool = True,
+ max_seq_length: int | None = None,
+ query_mode: bool = False,
+ add_eos: bool = False,
+ truncate_dim: int | None = None,
+ model_kwargs: dict | None = None,
+ tokenizer_kwargs: dict | None = None,
+ ) -> None:
+ """
+ Initialize the EmbeddingGemma embedder using SentenceTransformers.
+
+ Args:
+ model_name_or_path: Path or name of the EmbeddingGemma model
+ batch_size: Batch size for encoding
+ device: Device to use ('cuda', 'cpu', or None for auto)
+ normalize_embeddings: Whether to normalize embeddings (recommended for EmbeddingGemma)
+ max_seq_length: Maximum sequence length (default: model's max, typically 2048)
+ query_mode: Whether to use query encoding mode by default
+ add_eos: Whether to add EOS token to inputs
+ truncate_dim: Truncate embeddings to this dimension (supports 768, 512, 256, 128)
+ model_kwargs: Additional kwargs for model loading
+ tokenizer_kwargs: Additional kwargs for tokenizer loading
+ """
+ model_kwargs = self._model_kwargs_parser(model_kwargs or {})
+
+ # Initialize SentenceTransformer
+ self.model = SentenceTransformer(
+ model_name_or_path,
+ trust_remote_code=True,
+ truncate_dim=truncate_dim,
+ model_kwargs=model_kwargs,
+ tokenizer_kwargs=tokenizer_kwargs or {},
+ )
+
+ # Store original max length and set new one if provided
+ self._orig_max_length = self.model.max_seq_length
+ if max_seq_length:
+ self.model.max_seq_length = max_seq_length
+
+ self.batch_size = batch_size
+ self.device = device
+ self.normalize_embeddings = normalize_embeddings
+ self.max_seq_length = getattr(self.model, "max_seq_length", None)
+ self.add_eos = add_eos
+ self.query_mode = query_mode
+
+ # Set output format based on model kwargs
+ if model_kwargs and "torch_dtype" in model_kwargs:
+ self.set_output_tensor()
+ else:
+ self.set_output_numpy()
+
+ logger.info(f"Loaded EmbeddingGemma model: {model_name_or_path}")
+ logger.info(f"Model device: {self.model.device}, Max seq length: {self.max_seq_length}")
+
+ def encode(self, text: str | list[str], prefix: str | None = None, **kwargs) -> np.ndarray | torch.Tensor:
+ """
+ Encode text into embeddings using EmbeddingGemma's specialized methods.
+
+ This method is compatible with the base TextEmbedder interface and works
+ seamlessly with batch_encode_with_cache.
+
+ Args:
+ text: Input text(s) to encode
+ prefix: Prefix to add to texts
+ **kwargs: Additional arguments (supports query_mode for specialized encoding)
+
+ Returns:
+ Embeddings as numpy array or torch tensor
+ """
+ if isinstance(text, str):
+ text = [text]
+ text_was_str = True
+ else:
+ text_was_str = False
+
+ # Check for query_mode in kwargs, otherwise use instance default
+ use_query_mode = kwargs.get("query_mode", self.query_mode)
+
+ # Apply prefix if provided
+ if prefix:
+ text = [prefix + t for t in text]
+
+ if self.add_eos:
+ text = self._add_eos_func(text)
+
+ # Use specialized encoding methods if available
+ if hasattr(self.model, "encode_query") and hasattr(self.model, "encode_document"):
+ if use_query_mode:
+ embeddings = self.model.encode_query(text)
+ else:
+ embeddings = self.model.encode_document(text)
+
+ # Convert to appropriate format
+ if self.convert_to_numpy and isinstance(embeddings, torch.Tensor):
+ embeddings = embeddings.cpu().numpy()
+ elif not self.convert_to_numpy and isinstance(embeddings, np.ndarray):
+ embeddings = torch.from_numpy(embeddings)
+ else:
+ # Fallback to standard SentenceTransformer encode method
+ embeddings = self.model.encode(
+ text,
+ convert_to_numpy=self.convert_to_numpy,
+ convert_to_tensor=self.convert_to_tensor,
+ batch_size=self.batch_size,
+ device=self.device,
+ normalize_embeddings=self.normalize_embeddings,
+ **kwargs,
+ )
+
+ if text_was_str:
+ if isinstance(embeddings, np.ndarray) and embeddings.ndim > 1:
+ embeddings = embeddings[0]
+ elif isinstance(embeddings, torch.Tensor) and embeddings.ndim > 1:
+ embeddings = embeddings[0]
+
+ return embeddings
+
+ def encode_queries(
+ self, queries: str | list[str], prefix: str | None = None, **kwargs
+ ) -> np.ndarray | torch.Tensor:
+ """
+ Convenience method to encode queries using query mode.
+
+ Args:
+ queries: Query text(s) to encode
+ prefix: Prefix to add
+ **kwargs: Additional arguments
+
+ Returns:
+ Query embeddings
+ """
+ return self.encode(queries, prefix=prefix, query_mode=True, **kwargs)
+
+ def encode_documents(
+ self, documents: str | list[str], prefix: str | None = None, **kwargs
+ ) -> np.ndarray | torch.Tensor:
+ """
+ Convenience method to encode documents using document mode.
+
+ Args:
+ documents: Document text(s) to encode
+ prefix: Prefix to add
+ **kwargs: Additional arguments
+
+ Returns:
+ Document embeddings
+ """
+ return self.encode(documents, prefix=prefix, query_mode=False, **kwargs)
+
+ def set_query_mode(self, query_mode: bool = True) -> None:
+ """
+ Set the default encoding mode.
+
+ Args:
+ query_mode: True for query mode, False for document mode
+ """
+ self.query_mode = query_mode
+ logger.info(f"Set default encoding mode to {'query' if query_mode else 'document'}")
+
+ def _add_eos_func(self, text: str | list[str]) -> str | list[str]:
+ """Add EOS token to text if available."""
+ try:
+ eos_token = getattr(self.model.tokenizer, "eos_token")
+ except AttributeError:
+ return text
+
+ if isinstance(text, str):
+ return text + eos_token
+ elif isinstance(text, list):
+ return [t + eos_token for t in text]
+ return text
+
+ def get_output_dim(self) -> int:
+ """Get the dimensionality of output embeddings."""
+ return self.model.get_sentence_embedding_dimension()
+
+ def set_max_seq_length(self, max_seq_length: int | None = None) -> None:
+ """Set maximum sequence length."""
+ if max_seq_length:
+ self.model.max_seq_length = max_seq_length
+ self.max_seq_length = max_seq_length
+ logger.info(f"Set max_seq_length to {max_seq_length}")
+
+ def reset_max_seq_length(self) -> None:
+ """Reset max sequence length to model's original value."""
+ try:
+ logger.info(f"Reset max_seq_length to {self._orig_max_length}")
+ self.model.max_seq_length = self._orig_max_length
+ self.max_seq_length = self._orig_max_length
+ except AttributeError:
+ logger.warning("Failed to reset max_seq_length - original value not available")
+
+ def __repr__(self) -> str:
+ return f"GemmaEmbedder(model='{self.model.model_name}', device='{self.model.device}')"
diff --git a/src/jmteb/embedders/openai_embedder.py b/src/jmteb/embedders/openai_embedder.py
index 6ea8b8f..029b0a5 100644
--- a/src/jmteb/embedders/openai_embedder.py
+++ b/src/jmteb/embedders/openai_embedder.py
@@ -2,7 +2,11 @@
from dataclasses import dataclass
+from os import PathLike
+from pathlib import Path
+
import numpy as np
import tiktoken
+import tqdm
from loguru import logger
from openai import OpenAI
@@ -14,7 +18,7 @@
class OpenAIEmbedderConfig:
max_output_dim: int
encoder_name: str
- max_token_length: int
+ max_seq_length: int
OPENAI_EMBEDDERS = {
@@ -28,7 +32,12 @@ class OpenAIEmbedderConfig:
class OpenAIEmbedder(TextEmbedder):
"""Embedder via OpenAI API."""
- def __init__(self, model: str = "text-embedding-3-small", dim: int | None = None) -> None:
+ def __init__(
+ self,
+ model: str = "text-embedding-3-small",
+ dim: int | None = None,
+ max_seq_length: int | None = None,
+ ) -> None:
"""Setup.
model and dim: see https://platform.openai.com/docs/models/embeddings
`text-embedding-3-large` model: max 3072 dim
@@ -44,13 +53,19 @@ def __init__(self, model: str = "text-embedding-3-small", dim: int | None = None
Args:
model (str, optional): Name of an OpenAI embedding model. Defaults to "text-embedding-3-small".
dim (int, optional): Output dimension. Defaults to 1536.
+            max_seq_length (int, optional): Maximum length of sequences. Defaults to None.
"""
self.client = OpenAI() # API key written in .env
assert model in OPENAI_EMBEDDERS.keys(), f"`model` must be one of {list(OPENAI_EMBEDDERS.keys())}!"
self.model = model
model_config = OPENAI_EMBEDDERS[model]
self.encoding = tiktoken.get_encoding(model_config.encoder_name)
- self.max_token_length = model_config.max_token_length
+ self._orig_max_length = model_config.max_seq_length
+ if max_seq_length:
+ self.max_seq_length = max_seq_length
+ else:
+ self.max_seq_length = model_config.max_seq_length
+
if not dim or model == "text-embedding-ada-002":
self.dim = model_config.max_output_dim
else:
@@ -70,16 +85,22 @@ def encode(self, text: str | list[str], prefix: str | None = None) -> np.ndarray
token_ids: list[int] = self.encode_and_truncate_text(text, prefix)
else:
token_ids: list[list[int]] = [self.encode_and_truncate_text(t, prefix) for t in text]
- result = np.asarray(
- [
- data.embedding
- for data in self.client.embeddings.create(
- input=token_ids,
- model=self.model,
- **kwargs,
- ).data
- ]
- )
+ try:
+ result = np.asarray(
+ [
+ data.embedding
+ for data in self.client.embeddings.create(
+ input=token_ids,
+ model=self.model,
+ **kwargs,
+ ).data
+ ]
+ )
+ except Exception as e:
+ logger.error(f"{len(text)=}")
+ logger.error(f"{len(token_ids)=}")
+ raise e
+
if result.shape[0] == 1:
return result.reshape(-1)
return result
@@ -94,4 +115,86 @@ def encode_and_truncate_text(self, text: str, prefix: str | None = None) -> list
text = " "
logger.warning("Found empty string!")
# Ignore prefix in OpenAIEmbedder
- return self.encoding.encode(text)[: self.max_token_length]
+ return self.encoding.encode(text)[: self.max_seq_length]
+
+ def _batch_encode_and_save_on_disk(
+ self,
+ text_list: list[str],
+ save_path: str | PathLike[str],
+ prefix: str | None = None,
+ batch_size: int = 256,
+ dtype: str = "float32",
+ **kwargs,
+ ) -> np.memmap:
+ """
+ Encode a list of texts and save the embeddings on disk using memmap.
+
+ Args:
+ text_list (list[str]): list of texts
+ save_path (str): path to save the embeddings
+            prefix (str, optional): the prefix to use for encoding. Defaults to None.
+            batch_size (int): batch size. Defaults to 256.
+            dtype (str, optional): data type. Defaults to "float32".
+ """
+
+        batch_size = 512  # NOTE: overrides the `batch_size` argument for OpenAI API calls
+ num_samples = len(text_list)
+ output_dim = self.get_output_dim()
+ embeddings = np.memmap(save_path, dtype=dtype, mode="w+", shape=(num_samples, output_dim))
+
+ with tqdm.tqdm(total=num_samples, desc="Encoding") as pbar:
+ for i in range(0, num_samples, batch_size):
+ batch = text_list[i : i + batch_size]
+ try:
+ batch_embeddings: np.ndarray = self.encode(batch, prefix=prefix, **kwargs)
+ except Exception:
+ logger.error(f"{batch_size=}, {len(batch)=}")
+ logger.warning("Batch too large, retrying with batch size 16")
+ # Retry with batch size 16
+ small_batch_size = 16
+ batch_embeddings_list = []
+ for j in range(0, len(batch), small_batch_size):
+ small_batch = batch[j : j + small_batch_size]
+ small_batch_embeddings = self.encode(small_batch, prefix=prefix, **kwargs)
+ batch_embeddings_list.append(small_batch_embeddings)
+ batch_embeddings = np.vstack(batch_embeddings_list)
+ embeddings[i : i + batch_size] = batch_embeddings
+ pbar.update(len(batch))
+
+ embeddings.flush()
+ return np.memmap(save_path, dtype=dtype, mode="r", shape=(num_samples, output_dim))
+
+ def batch_encode_with_cache(
+ self,
+ text_list: list[str],
+ prefix: str | None = None,
+ cache_path: str | PathLike[str] | None = None,
+ overwrite_cache: bool = False,
+ dtype: str = "float32",
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Encode a list of texts and save the embeddings on disk using memmap if cache_path is provided.
+
+ Args:
+ text_list (list[str]): list of texts
+            prefix (str, optional): the prefix to use for encoding. Defaults to None.
+ cache_path (str, optional): path to save the embeddings. Defaults to None.
+ overwrite_cache (bool, optional): whether to overwrite the cache. Defaults to False.
+ dtype (str, optional): data type. Defaults to "float32".
+ """
+
+ logger.warning(f"Encoding with OpenAI embedder. {kwargs=}")
+ if cache_path is None:
+ logger.info("Encoding embeddings")
+ return self.encode(text_list, prefix=prefix, **kwargs)
+
+ if Path(cache_path).exists() and not overwrite_cache:
+ logger.info(f"Loading embeddings from {cache_path}")
+ return np.memmap(cache_path, dtype=dtype, mode="r", shape=(len(text_list), self.get_output_dim()))
+
+ logger.info(f"Encoding and saving embeddings to {cache_path}")
+ embeddings = self._batch_encode_and_save_on_disk(
+ text_list, cache_path, prefix=prefix, batch_size=self._chunk_size, dtype=dtype, **kwargs
+ )
+ return embeddings
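To show how the new caching path behaves, a hedged usage sketch follows (assumes OPENAI_API_KEY is set; the cache file name and dim value are illustrative):

```python
import numpy as np

from jmteb.embedders import OpenAIEmbedder

embedder = OpenAIEmbedder(model="text-embedding-3-small", dim=512)
texts = ["文書その1", "文書その2", "文書その3"]

# First call encodes via the API and writes a float32 memmap of shape (len(texts), dim).
emb = embedder.batch_encode_with_cache(texts, cache_path="corpus_embeddings.bin")

# A second call with the same path reloads the memmap instead of hitting the API.
emb_again = embedder.batch_encode_with_cache(texts, cache_path="corpus_embeddings.bin")
assert np.allclose(np.asarray(emb), np.asarray(emb_again))
```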
diff --git a/src/jmteb/embedders/plamo_embedder.py b/src/jmteb/embedders/plamo_embedder.py
new file mode 100644
index 0000000..f2c6755
--- /dev/null
+++ b/src/jmteb/embedders/plamo_embedder.py
@@ -0,0 +1,251 @@
+import numpy as np
+import torch
+from loguru import logger
+from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer
+
+from jmteb.embedders.base import TextEmbedder
+
+
+class PlamoEmbedder(TextEmbedder):
+ """
+ PLaMO embedding model embedder with multi-GPU support.
+
+ This class supports the PLaMO-Embedding-1B model from Preferred Networks.
+    It calls the model's specialized encode_query and encode_document methods
+    for asymmetric query/document encoding.
+ """
+
+ def __init__(
+ self,
+ model_name_or_path: str = "pfnet/plamo-embedding-1b",
+ batch_size: int = 2,
+ device: str | None = None,
+ normalize_embeddings: bool = False,
+ max_seq_length: int | None = None,
+ query_mode: bool = False,
+ model_kwargs: dict = {},
+ tokenizer_kwargs: dict = {},
+ ) -> None:
+ """
+ Initialize the PLaMO embedder.
+
+ Args:
+ model_name_or_path: Path or name of the PLaMO model
+ batch_size: Batch size for encoding
+ device: Device to use ('cuda', 'cpu', or None for auto)
+ normalize_embeddings: Whether to normalize embeddings
+ max_seq_length: Maximum sequence length (default: model's max)
+ query_mode: Whether to use query encoding mode by default
+ model_kwargs: Additional kwargs for model loading
+ tokenizer_kwargs: Additional kwargs for tokenizer loading
+ """
+ model_kwargs = self._model_kwargs_parser(model_kwargs)
+
+ # Load model and tokenizer with trust_remote_code=True for PLaMO
+ self.model: PreTrainedModel = AutoModel.from_pretrained(
+ model_name_or_path, trust_remote_code=True, **model_kwargs
+ )
+ self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
+ model_name_or_path, trust_remote_code=True, **tokenizer_kwargs
+ )
+
+ self.batch_size = batch_size
+ self.normalize_embeddings = normalize_embeddings
+ self.query_mode = query_mode
+
+ # Set up device
+ if not device and torch.cuda.is_available():
+ self.device = "cuda"
+ else:
+ self.device = device or "cpu"
+
+ # Move model to device
+ self.model.to(self.device)
+
+ # Enable simple multi-GPU support with DataParallel if multiple GPUs available
+ if torch.cuda.device_count() > 1 and self.device == "cuda":
+ logger.info(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
+ self.model = torch.nn.DataParallel(self.model)
+ self.is_data_parallel = True
+ self.distributed_state = True # For compatibility with tests
+ else:
+ self.is_data_parallel = False
+ self.distributed_state = None
+
+ # Store the device for easy access
+ self.model_device = next(self.model.parameters()).device
+ logger.info(f"Model device: {self.model_device}, GPU count: {torch.cuda.device_count()}")
+
+ # Set up sequence length
+ self._orig_max_length = getattr(
+ self.model.config if not self.is_data_parallel else self.model.module.config,
+ "max_position_embeddings",
+ 4096,
+ )
+ self.max_seq_length = max_seq_length or self._orig_max_length
+
+ # PLaMO-Embedding-1B has 2048 embedding dimensions
+ self.output_dim = getattr(
+ self.model.config if not self.is_data_parallel else self.model.module.config, "hidden_size", 2048
+ )
+
+ # Set output format based on model kwargs
+ if "torch_dtype" in model_kwargs:
+ self.set_output_tensor()
+ else:
+ self.set_output_numpy()
+
+ def get_output_dim(self) -> int:
+ """Get the dimensionality of output embeddings."""
+ return self.output_dim
+
+ def encode(self, text: str | list[str], prefix: str | None = None, **kwargs) -> np.ndarray | torch.Tensor:
+ """
+ Encode text into embeddings using PLaMO's specialized methods.
+
+ This method is compatible with the base TextEmbedder interface and works
+ seamlessly with batch_encode_with_cache.
+
+ Args:
+ text: Input text(s) to encode
+ prefix: Prefix to add to texts
+ **kwargs: Additional arguments (supports query_mode for specialized encoding)
+
+ Returns:
+ Embeddings as numpy array or torch tensor
+ """
+ if isinstance(text, str):
+ text = [text]
+ text_was_str = True
+ else:
+ text_was_str = False
+
+ # Check for query_mode in kwargs, otherwise use instance default
+ use_query_mode = kwargs.get("query_mode", self.query_mode)
+
+ # Apply prefix if provided
+ if prefix:
+ text = [prefix + t for t in text]
+
+ # Encode using PLaMO's specialized methods
+ with torch.inference_mode():
+ embeddings = self._encode_batch(text, use_query_mode)
+
+ # Apply normalization if requested
+ if self.normalize_embeddings:
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+
+ if text_was_str:
+ res = embeddings.view(-1)
+ else:
+ res = embeddings
+
+ if self.convert_to_numpy:
+ return res.cpu().numpy() if res.is_cuda else res.numpy()
+ else:
+ return res
+
+ def _encode_batch(self, text: list[str], query_mode: bool = False) -> torch.Tensor:
+ """
+ Encode a batch of texts using PLaMO's specialized methods with memory optimization.
+
+ Args:
+ text: List of texts to encode
+ query_mode: Whether to use query or document encoding
+
+ Returns:
+ Batch embeddings as torch tensor
+ """
+ if len(text) == 0:
+ return torch.empty(0, self.output_dim, device=self.model_device)
+
+ # Process in reasonable chunks for PLaMO
+ chunk_size = self.batch_size
+ all_embeddings = []
+
+ # Get the actual model (handle DataParallel wrapper)
+ actual_model = self.model.module if self.is_data_parallel else self.model
+
+ with torch.inference_mode():
+ for i in range(0, len(text), chunk_size):
+ chunk = text[i : i + chunk_size]
+
+ try:
+ if query_mode:
+ # Use PLaMO's encode_query method for queries
+ chunk_embeddings = actual_model.encode_query(chunk, self.tokenizer)
+ else:
+ # Use PLaMO's encode_document method for documents
+ chunk_embeddings = actual_model.encode_document(chunk, self.tokenizer)
+
+ # Keep embeddings on device
+ all_embeddings.append(chunk_embeddings)
+
+ except torch.cuda.OutOfMemoryError:
+                    # On OOM, fall back to processing the chunk one item at a time
+ logger.warning(f"OOM with chunk size {len(chunk)}, falling back to single item processing")
+ torch.cuda.empty_cache()
+
+ for single_text in chunk:
+ if query_mode:
+ single_embedding = actual_model.encode_query([single_text], self.tokenizer)
+ else:
+ single_embedding = actual_model.encode_document([single_text], self.tokenizer)
+ all_embeddings.append(single_embedding)
+ torch.cuda.empty_cache()
+
+ # Concatenate all embeddings
+ if all_embeddings:
+ return torch.cat(all_embeddings, dim=0)
+ else:
+ return torch.empty(0, self.output_dim, device=self.model_device)
+
+ def encode_queries(
+ self, queries: str | list[str], prefix: str | None = None, **kwargs
+ ) -> np.ndarray | torch.Tensor:
+ """
+ Convenience method to encode queries using query mode.
+
+ Args:
+ queries: Query text(s) to encode
+ prefix: Prefix to add
+ **kwargs: Additional arguments
+
+ Returns:
+ Query embeddings
+ """
+ return self.encode(queries, prefix=prefix, query_mode=True, **kwargs)
+
+ def encode_documents(
+ self, documents: str | list[str], prefix: str | None = None, **kwargs
+ ) -> np.ndarray | torch.Tensor:
+ """
+ Convenience method to encode documents using document mode.
+
+ Args:
+ documents: Document text(s) to encode
+ prefix: Prefix to add
+ **kwargs: Additional arguments
+
+ Returns:
+ Document embeddings
+ """
+ return self.encode(documents, prefix=prefix, query_mode=False, **kwargs)
+
+ def set_query_mode(self, query_mode: bool = True) -> None:
+ """
+ Set the default encoding mode.
+
+ Args:
+ query_mode: True for query mode, False for document mode
+ """
+ self.query_mode = query_mode
+ logger.info(f"Set default encoding mode to {'query' if query_mode else 'document'}")
+
+ def reset_max_seq_length(self) -> None:
+ """Reset max sequence length to model's original value."""
+ if hasattr(self, "_orig_max_length") and self._orig_max_length:
+ self.max_seq_length = self._orig_max_length
+ logger.info(f"Reset max_seq_length to {self._orig_max_length}")
+ else:
+ logger.warning("Failed to reset max_seq_length - original value not available")
diff --git a/src/jmteb/embedders/sbert_embedder.py b/src/jmteb/embedders/sbert_embedder.py
index ba33a36..892f703 100644
--- a/src/jmteb/embedders/sbert_embedder.py
+++ b/src/jmteb/embedders/sbert_embedder.py
@@ -1,6 +1,7 @@
from __future__ import annotations
import numpy as np
+from loguru import logger
from sentence_transformers import SentenceTransformer
from jmteb.embedders.base import TextEmbedder
@@ -29,6 +30,7 @@ def __init__(
model_kwargs=model_kwargs, # https://github.com/UKPLab/sentence-transformers/blob/84f69fee6dcde023f46a8807e89bc99a7700ba82/sentence_transformers/SentenceTransformer.py#L81-L105 # noqa: E501
tokenizer_kwargs=tokenizer_kwargs,
)
+ self._orig_max_length = self.model.max_seq_length
if max_seq_length:
self.model.max_seq_length = max_seq_length
@@ -70,3 +72,10 @@ def _add_eos_func(self, text: str | list[str]) -> str | list[str]:
def get_output_dim(self) -> int:
return self.model.get_sentence_embedding_dimension()
+
+ def reset_max_seq_length(self):
+ try:
+ logger.info(f"Reset `max_seq_length` to {self._orig_max_length}")
+ self.model.max_seq_length = self._orig_max_length
+ except AttributeError:
+            logger.warning("Failed to reset `max_seq_length`: original value not available")
diff --git a/src/jmteb/embedders/transformers_embedder.py b/src/jmteb/embedders/transformers_embedder.py
index 0592061..721e0c9 100644
--- a/src/jmteb/embedders/transformers_embedder.py
+++ b/src/jmteb/embedders/transformers_embedder.py
@@ -48,6 +48,7 @@ def __init__(
logger.info(f"{self.model.device=}, {torch.cuda.device_count()=}")
self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **tokenizer_kwargs)
+ self._orig_max_length = getattr(self.model, "max_seq_length", None)
self.max_seq_length = getattr(self.model, "max_seq_length", None)
if max_seq_length:
self.max_seq_length = max_seq_length
@@ -135,7 +136,9 @@ def _encode_batch(self, text: list[str], prefix: str | None = None) -> torch.Ten
if self.add_eos:
text = self._add_eos_func(text)
- encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(self.model.device)
+ encoded_input = self.tokenizer(
+ text, padding=True, truncation=True, return_tensors="pt", max_length=self.max_seq_length
+ ).to(self.model.device)
model_output = self.model(**encoded_input)
last_hidden_states = model_output["last_hidden_state"]
features = {
diff --git a/src/jmteb/evaluators/classification/evaluator.py b/src/jmteb/evaluators/classification/evaluator.py
index c2b8836..bb3a4ca 100644
--- a/src/jmteb/evaluators/classification/evaluator.py
+++ b/src/jmteb/evaluators/classification/evaluator.py
@@ -66,13 +66,22 @@ def __call__(
if cache_dir is not None:
Path(cache_dir).mkdir(parents=True, exist_ok=True)
+ # Auto-optimize for PlamoEmbedder if no explicit kwargs provided
+ encode_kwargs = self.encode_kwargs.copy()
+
+ # Check if this is a PlamoEmbedder and set optimal encoding mode
+ if model.__class__.__name__ in ("PlamoEmbedder", "GemmaEmbedder"):
+ if "query_mode" not in encode_kwargs:
+ encode_kwargs["query_mode"] = False # Use document mode for classification texts
+ logger.info(f"Auto-optimized {model.__class__.__name__}: query_mode=False for classification texts")
+
logger.info("Encoding training and validation sentences...")
X_train = model.batch_encode_with_cache(
[item.text for item in self.train_dataset],
prefix=self.prefix,
cache_path=Path(cache_dir) / "train_embeddings.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.encode_kwargs,
+ **encode_kwargs,
)
y_train = [item.label for item in self.train_dataset]
@@ -81,7 +90,7 @@ def __call__(
prefix=self.prefix,
cache_path=Path(cache_dir) / "val_embeddings.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.encode_kwargs,
+ **encode_kwargs,
)
y_val = [item.label for item in self.val_dataset]
@@ -95,7 +104,7 @@ def __call__(
prefix=self.prefix,
cache_path=Path(cache_dir) / "test_embeddings.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.encode_kwargs,
+ **encode_kwargs,
)
y_test = [item.label for item in self.test_dataset]
diff --git a/src/jmteb/evaluators/clustering/evaluator.py b/src/jmteb/evaluators/clustering/evaluator.py
index 2b8cdf2..bbce269 100644
--- a/src/jmteb/evaluators/clustering/evaluator.py
+++ b/src/jmteb/evaluators/clustering/evaluator.py
@@ -14,6 +14,7 @@
MiniBatchKMeans,
)
from sklearn.metrics import homogeneity_completeness_v_measure
+from sklearn.preprocessing import normalize
from jmteb.embedders.base import TextEmbedder
from jmteb.evaluators.base import EmbeddingEvaluator, EvaluationResults
@@ -57,13 +58,22 @@ def __call__(
if cache_dir is not None:
Path(cache_dir).mkdir(parents=True, exist_ok=True)
+ # Auto-optimize for PlamoEmbedder if no explicit kwargs provided
+ encode_kwargs = self.encode_kwargs.copy()
+
+ # Check if this is a PlamoEmbedder and set optimal encoding mode
+ if model.__class__.__name__ in ("PlamoEmbedder", "GemmaEmbedder"):
+ if "query_mode" not in encode_kwargs:
+ encode_kwargs["query_mode"] = False # Use document mode for clustering texts
+ logger.info(f"Auto-optimized {model.__class__.__name__}: query_mode=False for clustering texts")
+
logger.info("Converting validation data to embeddings...")
val_embeddings = model.batch_encode_with_cache(
[item.text for item in self.val_dataset],
prefix=self.prefix,
cache_path=Path(cache_dir) / "val_embeddings.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.encode_kwargs,
+ **encode_kwargs,
)
val_labels = [item.label for item in self.val_dataset]
@@ -77,7 +87,7 @@ def __call__(
prefix=self.prefix,
cache_path=Path(cache_dir) / "test_embeddings.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.encode_kwargs,
+ **encode_kwargs,
)
test_labels = [item.label for item in self.test_dataset]
@@ -127,7 +137,19 @@ def __call__(
def _evaluate_clustering_model(
embeddings: np.ndarray, y_true: list[int], clustering_model: ClusterMixin
) -> tuple[dict[str, float], list[int]]:
- y_pred = clustering_model.fit_predict(embeddings)
+ try:
+ # First try without normalization to preserve original behavior when possible
+ y_pred = clustering_model.fit_predict(embeddings)
+ except ValueError as e:
+ # If overflow error occurs, apply normalization and retry
+ if "infinity" in str(e).lower() or "too large" in str(e).lower():
+ logger.warning(f"Overflow detected in clustering, applying L2 normalization: {e}")
+ embeddings_normalized = normalize(embeddings, norm="l2")
+ y_pred = clustering_model.fit_predict(embeddings_normalized)
+ else:
+ # Re-raise if it's a different ValueError
+ raise e
+
h_score, c_score, v_score = homogeneity_completeness_v_measure(
labels_pred=y_pred, labels_true=np.array(y_true)
)
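Read on its own, the fallback added to _evaluate_clustering_model boils down to a try/normalize/retry pattern. The sketch below reproduces it outside the evaluator with scikit-learn's MiniBatchKMeans on synthetic embeddings; the function name and the data are illustrative only.

    import numpy as np
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.preprocessing import normalize


    def cluster_with_overflow_fallback(embeddings: np.ndarray, n_clusters: int) -> np.ndarray:
        clustering_model = MiniBatchKMeans(n_clusters=n_clusters, random_state=0)
        try:
            # Keep the original behavior whenever the raw embeddings are usable.
            return clustering_model.fit_predict(embeddings)
        except ValueError as e:
            # Retry with L2-normalized embeddings only for overflow-style errors.
            if "infinity" in str(e).lower() or "too large" in str(e).lower():
                return clustering_model.fit_predict(normalize(embeddings, norm="l2"))
            raise


    labels = cluster_with_overflow_fallback(np.random.rand(32, 8).astype(np.float32), n_clusters=4)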
diff --git a/src/jmteb/evaluators/pair_classification/evaluator.py b/src/jmteb/evaluators/pair_classification/evaluator.py
index ef466bf..8fba017 100644
--- a/src/jmteb/evaluators/pair_classification/evaluator.py
+++ b/src/jmteb/evaluators/pair_classification/evaluator.py
@@ -49,8 +49,19 @@ def __call__(
if cache_dir is not None:
Path(cache_dir).mkdir(parents=True, exist_ok=True)
+ # Auto-optimize encode kwargs for PlamoEmbedder/GemmaEmbedder when none are provided explicitly
+ encode_kwargs = self.encode_kwargs.copy()
+
+ # If the embedder is a PlamoEmbedder or GemmaEmbedder, set the optimal encoding mode
+ if model.__class__.__name__ in ("PlamoEmbedder", "GemmaEmbedder"):
+ if "query_mode" not in encode_kwargs:
+ encode_kwargs["query_mode"] = False # Use document mode for pair classification texts
+ from loguru import logger
+
+ logger.info(f"Auto-optimized {model.__class__.__name__}: query_mode=False for pair classification texts")
+
val_embeddings1, val_embeddings2, val_golden_labels = self._convert_to_embeddings(
- model, self.val_dataset, "dev", overwrite_cache, cache_dir
+ model, self.val_dataset, "dev", overwrite_cache, cache_dir, encode_kwargs
)
if self.val_dataset == self.test_dataset:
test_embeddings1, test_embeddings2, test_golden_labels = (
@@ -60,7 +71,7 @@ def __call__(
)
else:
test_embeddings1, test_embeddings2, test_golden_labels = self._convert_to_embeddings(
- model, self.test_dataset, "test", overwrite_cache, cache_dir
+ model, self.test_dataset, "test", overwrite_cache, cache_dir, encode_kwargs
)
val_results = {}
@@ -119,20 +130,24 @@ def _convert_to_embeddings(
split: str = "test",
overwrite_cache: bool = False,
cache_dir: str | None = None,
+ encode_kwargs: dict | None = None,
) -> tuple[np.ndarray, np.ndarray, list[float]]:
+ if encode_kwargs is None:
+ encode_kwargs = self.encode_kwargs
+
embeddings1 = model.batch_encode_with_cache(
[item.sentence1 for item in dataset],
prefix=self.sentence1_prefix,
cache_path=Path(cache_dir) / f"{split}_embeddings1.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.encode_kwargs,
+ **encode_kwargs,
)
embeddings2 = model.batch_encode_with_cache(
[item.sentence2 for item in dataset],
prefix=self.sentence2_prefix,
cache_path=Path(cache_dir) / f"{split}_embeddings2.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.encode_kwargs,
+ **encode_kwargs,
)
golden_labels = [item.label for item in dataset]
return embeddings1, embeddings2, golden_labels
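The kwargs handling in this evaluator follows a fill-in-defaults rule: values set in the configuration are kept as-is, and query_mode is only defaulted when absent. A minimal standalone sketch of that precedence (the helper name is illustrative, not part of the codebase):

    # Illustrative helper showing the precedence used above: explicitly configured
    # kwargs are kept, and query_mode is only defaulted when the caller did not set it.
    def resolve_encode_kwargs(embedder_cls_name: str, configured: dict) -> dict:
        resolved = dict(configured)  # never mutate the configured kwargs in place
        if embedder_cls_name in ("PlamoEmbedder", "GemmaEmbedder"):
            resolved.setdefault("query_mode", False)  # document mode for sentence pairs
        return resolved


    assert resolve_encode_kwargs("PlamoEmbedder", {}) == {"query_mode": False}
    assert resolve_encode_kwargs("PlamoEmbedder", {"query_mode": True}) == {"query_mode": True}
    assert resolve_encode_kwargs("SomeOtherEmbedder", {}) == {}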
diff --git a/src/jmteb/evaluators/reranking/evaluator.py b/src/jmteb/evaluators/reranking/evaluator.py
index 144ed36..0d1be95 100644
--- a/src/jmteb/evaluators/reranking/evaluator.py
+++ b/src/jmteb/evaluators/reranking/evaluator.py
@@ -38,6 +38,8 @@ class RerankingEvaluator(EmbeddingEvaluator):
query_prefix (str | None): prefix for queries. Defaults to None.
doc_prefix (str | None): prefix for documents. Defaults to None.
log_predictions (bool): whether to log predictions of each datapoint. Defaults to False.
+ force_max_length (bool): whether to override the globally configured max_length with the model's maximum token length.
+ Defaults to False.
top_n_docs_to_log (int): log only top n documents. Defaults to 5.
query_encode_kwargs (dict): kwargs passed to embedder's encode function when encoding queries. Defaults to {}.
doc_encode_kwargs (dict): kwargs passed to embedder's encode function when encoding documents. Defaults to {}.
@@ -53,6 +55,7 @@ def __init__(
doc_prefix: str | None = None,
log_predictions: bool = False,
top_n_docs_to_log: int = 5,
+ force_max_length: bool = False,
query_encode_kwargs: dict = {},
doc_encode_kwargs: dict = {},
) -> None:
@@ -65,6 +68,7 @@ def __init__(
self.doc_prefix = doc_prefix
self.log_predictions = log_predictions
self.top_n_docs_to_log = top_n_docs_to_log
+ self.force_max_length = force_max_length
self.query_encode_kwargs = query_encode_kwargs
self.doc_encode_kwargs = doc_encode_kwargs
@@ -75,15 +79,33 @@ def __call__(
overwrite_cache: bool = False,
) -> EvaluationResults:
model.set_output_tensor()
+ if self.force_max_length:
+ model.reset_max_seq_length()
+
if cache_dir is not None:
Path(cache_dir).mkdir(parents=True, exist_ok=True)
+ # Auto-optimize encode kwargs for PlamoEmbedder/GemmaEmbedder when none are provided explicitly
+ query_kwargs = self.query_encode_kwargs.copy()
+ doc_kwargs = self.doc_encode_kwargs.copy()
+
+ # If the embedder is a PlamoEmbedder or GemmaEmbedder, set the optimal encoding modes
+ if model.__class__.__name__ in ("PlamoEmbedder", "GemmaEmbedder"):
+ if "query_mode" not in query_kwargs:
+ query_kwargs["query_mode"] = True # Use query mode for queries
+ if "query_mode" not in doc_kwargs:
+ doc_kwargs["query_mode"] = False # Use document mode for docs
+ logger.info(
+ f"Auto-optimized {model.__class__.__name__}: query_mode=True for queries,"
+ "query_mode=False for documents"
+ )
+
val_query_embeddings = model.batch_encode_with_cache(
text_list=[item.query for item in self.val_query_dataset],
prefix=self.query_prefix,
cache_path=Path(cache_dir) / "val_query.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.query_encode_kwargs,
+ **query_kwargs,
)
if self.val_query_dataset == self.test_query_dataset:
test_query_embeddings = val_query_embeddings
@@ -93,14 +115,14 @@ def __call__(
prefix=self.query_prefix,
cache_path=Path(cache_dir) / "test_query.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.query_encode_kwargs,
+ **query_kwargs,
)
doc_embeddings = model.batch_encode_with_cache(
text_list=[item.text for item in self.doc_dataset],
prefix=self.doc_prefix,
cache_path=Path(cache_dir) / "corpus.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.doc_encode_kwargs,
+ **doc_kwargs,
)
logger.info("Start reranking")
@@ -211,8 +233,6 @@ def _format_predictions(
pred_docs: list[RerankingDoc] = [
doc_dataset[doc_dataset.docid_to_idx[pred_docid]] for pred_docid in pred_docids
]
- logger.info(f"{golden_docs=}")
- logger.info(f"{pred_docs=}")
prediction = RerankingPrediction(
query=q.query,
relevant_docs=golden_docs,
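For reranking the same rule is applied asymmetrically: queries default to query mode and documents to document mode, with explicitly configured values always taking precedence. A compact sketch of that resolution, again with an illustrative helper name:

    # Illustrative resolution of the asymmetric defaults above: queries fall back to
    # query mode, documents to document mode, and explicit configuration always wins.
    def resolve_reranking_kwargs(
        embedder_cls_name: str, query_kwargs: dict, doc_kwargs: dict
    ) -> tuple[dict, dict]:
        query_kwargs, doc_kwargs = dict(query_kwargs), dict(doc_kwargs)
        if embedder_cls_name in ("PlamoEmbedder", "GemmaEmbedder"):
            query_kwargs.setdefault("query_mode", True)
            doc_kwargs.setdefault("query_mode", False)
        return query_kwargs, doc_kwargs


    q_kwargs, d_kwargs = resolve_reranking_kwargs("GemmaEmbedder", {}, {"query_mode": True})
    assert q_kwargs == {"query_mode": True}   # defaulted for queries
    assert d_kwargs == {"query_mode": True}   # explicit override kept for documents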
diff --git a/src/jmteb/evaluators/retrieval/evaluator.py b/src/jmteb/evaluators/retrieval/evaluator.py
index 2fd6a21..fc7476e 100644
--- a/src/jmteb/evaluators/retrieval/evaluator.py
+++ b/src/jmteb/evaluators/retrieval/evaluator.py
@@ -41,6 +41,8 @@ class RetrievalEvaluator(EmbeddingEvaluator):
query_prefix (str | None): prefix for queries. Defaults to None.
doc_prefix (str | None): prefix for documents. Defaults to None.
log_predictions (bool): whether to log predictions of each datapoint. Defaults to False.
+ force_max_length (bool): whether to override the globally configured max_length with the model's maximum token length.
+ Defaults to False.
top_n_docs_to_log (int): log only top n documents that are predicted as relevant. Defaults to 5.
query_encode_kwargs (dict): kwargs passed to embedder's encode function when encoding queries. Defaults to {}.
doc_encode_kwargs (dict): kwargs passed to embedder's encode function when encoding documents. Defaults to {}.
@@ -58,6 +60,7 @@ def __init__(
doc_prefix: str | None = None,
log_predictions: bool = False,
top_n_docs_to_log: int = 5,
+ force_max_length: bool = False,
query_encode_kwargs: dict = {},
doc_encode_kwargs: dict = {},
) -> None:
@@ -67,7 +70,7 @@ def __init__(
self.doc_chunk_size = doc_chunk_size
- self.accuracy_at_k = accuracy_at_k or [1, 3, 5, 10]
+ self.accuracy_at_k = accuracy_at_k or [1, 3, 5, 10, 20, 30, 50]
self.ndcg_at_k = ndcg_at_k or [10]
self.max_top_k = max(sum([self.accuracy_at_k, self.ndcg_at_k], []))
self.main_metric = f"ndcg@{self.ndcg_at_k[0]}"
@@ -76,6 +79,7 @@ def __init__(
self.doc_prefix = doc_prefix
self.log_predictions = log_predictions
self.top_n_docs_to_log = top_n_docs_to_log
+ self.force_max_length = force_max_length
self.query_encode_kwargs = query_encode_kwargs
self.doc_encode_kwargs = doc_encode_kwargs
@@ -86,15 +90,32 @@ def __call__(
overwrite_cache: bool = False,
) -> EvaluationResults:
model.set_output_tensor()
+ if self.force_max_length:
+ model.reset_max_seq_length()
if cache_dir is not None:
Path(cache_dir).mkdir(parents=True, exist_ok=True)
+ # Auto-optimize encode kwargs for PlamoEmbedder/GemmaEmbedder when none are provided explicitly
+ query_kwargs = self.query_encode_kwargs.copy()
+ doc_kwargs = self.doc_encode_kwargs.copy()
+
+ # If the embedder is a PlamoEmbedder or GemmaEmbedder, set the optimal encoding modes
+ if model.__class__.__name__ in ("PlamoEmbedder", "GemmaEmbedder"):
+ if "query_mode" not in query_kwargs:
+ query_kwargs["query_mode"] = True # Use query mode for queries
+ if "query_mode" not in doc_kwargs:
+ doc_kwargs["query_mode"] = False # Use document mode for docs
+ logger.info(
+ f"Auto-optimized {model.__class__.__name__}: query_mode=True for queries,"
+ "query_mode=False for documents"
+ )
+
val_query_embeddings = model.batch_encode_with_cache(
text_list=[item.query for item in self.val_query_dataset],
prefix=self.query_prefix,
cache_path=Path(cache_dir) / "val_query.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.query_encode_kwargs,
+ **query_kwargs,
)
if self.val_query_dataset == self.test_query_dataset:
test_query_embeddings = val_query_embeddings
@@ -104,7 +125,7 @@ def __call__(
prefix=self.query_prefix,
cache_path=Path(cache_dir) / "test_query.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.query_encode_kwargs,
+ **query_kwargs,
)
doc_embeddings = model.batch_encode_with_cache(
@@ -112,7 +133,7 @@ def __call__(
prefix=self.doc_prefix,
cache_path=Path(cache_dir) / "corpus.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.doc_encode_kwargs,
+ **doc_kwargs,
)
logger.info("Start retrieval")
diff --git a/src/jmteb/evaluators/sts/evaluator.py b/src/jmteb/evaluators/sts/evaluator.py
index 380ceea..f2cbd0c 100644
--- a/src/jmteb/evaluators/sts/evaluator.py
+++ b/src/jmteb/evaluators/sts/evaluator.py
@@ -8,6 +8,7 @@
import numpy as np
import torch
+from loguru import logger
from scipy.stats import pearsonr, spearmanr
from torch import Tensor
@@ -52,8 +53,17 @@ def __call__(
if cache_dir is not None:
Path(cache_dir).mkdir(parents=True, exist_ok=True)
+ # Copy encode kwargs; query_mode auto-optimization is currently disabled for STS (kept below for reference)
+ encode_kwargs = self.encode_kwargs.copy()
+
+ # # Check if this is a PlamoEmbedder and set optimal encoding mode
+ # if model.__class__.__name__ == "PlamoEmbedder":
+ # if "query_mode" not in encode_kwargs:
+ # encode_kwargs["query_mode"] = False # Use document mode for STS texts
+ # logger.info("Auto-optimized PlamoEmbedder: query_mode=False for STS texts")
+
val_embeddings1, val_embeddings2, val_golden_scores = self._convert_to_embeddings(
- model, self.val_dataset, "dev", overwrite_cache, cache_dir
+ model, self.val_dataset, "dev", overwrite_cache, cache_dir, encode_kwargs
)
if self.val_dataset == self.test_dataset:
test_embeddings1, test_embeddings2, test_golden_scores = (
@@ -62,7 +72,7 @@ def __call__(
val_golden_scores,
)
test_embeddings1, test_embeddings2, test_golden_scores = self._convert_to_embeddings(
- model, self.test_dataset, "test", overwrite_cache, cache_dir
+ model, self.test_dataset, "test", overwrite_cache, cache_dir, encode_kwargs
)
similarity_functions = {
@@ -146,20 +156,24 @@ def _convert_to_embeddings(
split: str = "test",
overwrite_cache: bool = False,
cache_dir: str | None = None,
+ encode_kwargs: dict | None = None,
) -> tuple[Tensor, Tensor, list[float]]:
+ if encode_kwargs is None:
+ encode_kwargs = self.encode_kwargs
+
embeddings1 = model.batch_encode_with_cache(
[item.sentence1 for item in dataset],
prefix=self.sentence1_prefix,
cache_path=Path(cache_dir) / f"{split}_embeddings1.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.encode_kwargs,
+ **encode_kwargs,
)
embeddings2 = model.batch_encode_with_cache(
[item.sentence2 for item in dataset],
prefix=self.sentence2_prefix,
cache_path=Path(cache_dir) / f"{split}_embeddings2.bin" if cache_dir is not None else None,
overwrite_cache=overwrite_cache,
- **self.encode_kwargs,
+ **encode_kwargs,
)
device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings1 = convert_to_tensor(embeddings1, device)
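After both sentence sets are converted to tensors, the evaluator scores each pair with a similarity function and correlates the scores with the gold labels. A minimal sketch of that step, assuming cosine similarity and random tensors purely for shape illustration:

    import torch
    from scipy.stats import pearsonr, spearmanr

    embeddings1 = torch.randn(16, 128)          # one row per sentence1
    embeddings2 = torch.randn(16, 128)          # one row per sentence2
    golden_scores = torch.rand(16).tolist()     # gold similarity labels (dummy values)

    # Score each pair, then correlate predicted scores with the gold scores.
    cosine = torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)
    spearman = spearmanr(cosine.cpu().numpy(), golden_scores)[0]
    pearson = pearsonr(cosine.cpu().numpy(), golden_scores)[0]
    print(f"spearman={spearman:.4f}, pearson={pearson:.4f}")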
diff --git a/src/jmteb/utils/score_recorder.py b/src/jmteb/utils/score_recorder.py
index afbf22c..361c809 100644
--- a/src/jmteb/utils/score_recorder.py
+++ b/src/jmteb/utils/score_recorder.py
@@ -56,8 +56,21 @@ def record_predictions(self, results: EvaluationResults, dataset_name: str, task
def record_summary(self):
if not self.save_dir:
return
- summary: dict[str, dict[str, dict[str, float]]] = defaultdict(dict)
+
+ summary_path = Path(self.save_dir) / "summary.json"
+
+ # Load existing summary if it exists
+ if summary_path.exists():
+ with open(summary_path, "r") as fin:
+ summary = json.load(fin)
+ else:
+ summary = {}
+
+ # Merge new results into existing summary
for task_name, task_scores in self.scores.items():
+ if task_name not in summary:
+ summary[task_name] = {}
for dataset_name, results in self.scores[task_name].items():
summary[task_name][dataset_name] = {results.metric_name: results.metric_value}
- self.save_to_json(summary, Path(self.save_dir) / "summary.json")
+
+ self.save_to_json(summary, summary_path)
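With the merge-on-load behavior above, re-running a subset of tasks only touches the corresponding entries in summary.json and leaves earlier results in place. A standalone sketch of that merge, with an illustrative path and dummy scores:

    import json
    from pathlib import Path

    summary_path = Path("results/summary.json")            # illustrative location
    summary = json.loads(summary_path.read_text()) if summary_path.exists() else {}

    # Scores from the current run overwrite only the datasets they cover.
    new_scores = {"STS": {"jsick": {"spearman": 0.78}}}     # dummy value
    for task_name, datasets in new_scores.items():
        summary.setdefault(task_name, {}).update(datasets)

    summary_path.parent.mkdir(parents=True, exist_ok=True)
    summary_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False))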