diff --git a/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json b/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json index 1b99a44..364bfee 100644 --- a/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json +++ b/docs/results/MU-Kindai/Japanese-DiffCSE-BERT-base/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7809527709426081 + "macro_f1": 0.7769528027441275 }, "amazon_review_classification": { - "macro_f1": 0.5155899232320224 + "macro_f1": 0.5146406875677701 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8844781754440035 }, "massive_intent_classification": { - "macro_f1": 0.7879373479249787 + "macro_f1": 0.7872353730798753 }, "massive_scenario_classification": { - "macro_f1": 0.8662625888023707 + "macro_f1": 0.8639715373498098 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8350488266987821 + }, + "wrime_classification": { + "macro_f1": 0.3815230965003785 } }, "Reranking": { "esci": { - "ndcg@10": 0.9095168116460639 + "ndcg@10": 0.909518320556229 + }, + "jacwir_reranking": { + "ndcg@10": 0.5981293078380808 + }, + "jqara": { + "ndcg@10": 0.3719557553111225 + }, + "miracl_reranking": { + "ndcg@10": 0.6789908587925922 + }, + "mldr_reranking": { + "ndcg@10": 0.8281088898171538 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.4085978545476503 + }, "jagovfaqs_22k": { - "ndcg@10": 0.42314124780036416 + "ndcg@10": 0.43879890119990833 }, "jaqket": { - "ndcg@10": 0.36199154051747723 + "ndcg@10": 0.3555985699236658 + }, + "mintaka_retrieval": { + "ndcg@10": 0.1997740482697841 + }, + "miracl_retrieval": { + "ndcg@10": 0.16521386136598404 + }, + "mldr_retrieval": { + "ndcg@10": 0.12060735418211223 }, "mrtydi": { - "ndcg@10": 0.07810683176415421 + "ndcg@10": 0.07107405961190999 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.5430415601583998 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.6077212544951452 + "ndcg@10": 0.5585881454407594 }, "nlp_journal_title_abs": { - "ndcg@10": 0.6433890489201118 + "ndcg@10": 0.629620778788499 }, "nlp_journal_title_intro": { - "ndcg@10": 0.39317174536190913 + "ndcg@10": 0.3517328767423871 } }, "STS": { "jsick": { - "spearman": 0.754165277432144 + "spearman": 0.7775668305928584 }, "jsts": { - "spearman": 0.7558202366183716 + "spearman": 0.7563460117163054 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.4966545453348478 + "v_measure_score": 0.4601335671191492 }, "mewsc16": { - "v_measure_score": 0.3877356318022785 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6237623762376237 + "v_measure_score": 0.39000718680465274 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.3456006554316726 } } } \ No newline at end of file diff --git a/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json b/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json index ea227c2..20150c2 100644 --- a/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json +++ b/docs/results/MU-Kindai/Japanese-MixCSE-BERT-base/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.776174162517931 + "macro_f1": 0.7779156199278396 }, "amazon_review_classification": { - "macro_f1": 0.5085781180553806 + "macro_f1": 0.5111451768867725 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8782111274457993 }, "massive_intent_classification": { - "macro_f1": 0.7718541530739129 + "macro_f1": 0.7796973463634825 }, "massive_scenario_classification": { - "macro_f1": 
0.8592571786794985 + "macro_f1": 0.8634142669499835 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8506408877596591 + }, + "wrime_classification": { + "macro_f1": 0.3656175961601361 } }, "Reranking": { "esci": { - "ndcg@10": 0.9100551950168166 + "ndcg@10": 0.9092446252246911 + }, + "jacwir_reranking": { + "ndcg@10": 0.605113846464576 + }, + "jqara": { + "ndcg@10": 0.36840730960684165 + }, + "miracl_reranking": { + "ndcg@10": 0.693114284522583 + }, + "mldr_reranking": { + "ndcg@10": 0.8530771666734125 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.42431895793525753 + }, "jagovfaqs_22k": { - "ndcg@10": 0.42368135774043536 + "ndcg@10": 0.43601956332213093 }, "jaqket": { - "ndcg@10": 0.37721850397542034 + "ndcg@10": 0.37354035206874886 + }, + "mintaka_retrieval": { + "ndcg@10": 0.2518443007449429 + }, + "miracl_retrieval": { + "ndcg@10": 0.14756204576714857 + }, + "mldr_retrieval": { + "ndcg@10": 0.16862391555076126 }, "mrtydi": { - "ndcg@10": 0.07878085186566607 + "ndcg@10": 0.07770347901718931 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.5689006657309228 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.636999375405723 + "ndcg@10": 0.5911474254499767 }, "nlp_journal_title_abs": { - "ndcg@10": 0.6413498649875696 + "ndcg@10": 0.618101892252404 }, "nlp_journal_title_intro": { - "ndcg@10": 0.397250919496823 + "ndcg@10": 0.3287673013916751 } }, "STS": { "jsick": { - "spearman": 0.7756925231422259 + "spearman": 0.7893346270810556 }, "jsts": { - "spearman": 0.7652968548841591 + "spearman": 0.7657111966582518 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5262387436934941 + "v_measure_score": 0.4498663842342549 }, "mewsc16": { - "v_measure_score": 0.37277574537292835 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.623321554770318 + "v_measure_score": 0.4319848997472401 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.3860004176729398 } } } \ No newline at end of file diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json index dbed068..ebc1037 100644 --- a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json +++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-sup/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7619809437515043 + "macro_f1": 0.7430232193667698 }, "amazon_review_classification": { - "macro_f1": 0.5205592432502059 + "macro_f1": 0.5196833867285527 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8969457721352727 }, "massive_intent_classification": { - "macro_f1": 0.7789367871593064 + "macro_f1": 0.7782504182162112 }, "massive_scenario_classification": { - "macro_f1": 0.8490320705866646 + "macro_f1": 0.8459551634050977 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8382321236746973 + }, + "wrime_classification": { + "macro_f1": 0.3814631725334783 } }, "Reranking": { "esci": { - "ndcg@10": 0.9065584234991577 + "ndcg@10": 0.906706098295787 + }, + "jacwir_reranking": { + "ndcg@10": 0.581551030502223 + }, + "jqara": { + "ndcg@10": 0.3666097794082717 + }, + "miracl_reranking": { + "ndcg@10": 0.6908907697836885 + }, + "mldr_reranking": { + "ndcg@10": 0.8615323536010276 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.39917758524262303 + }, "jagovfaqs_22k": { - "ndcg@10": 0.4411487123884245 + "ndcg@10": 0.4460371569059824 }, "jaqket": { - "ndcg@10": 0.39613283459361814 + "ndcg@10": 0.3845053301501902 + }, + 
"mintaka_retrieval": { + "ndcg@10": 0.2239147895010841 + }, + "miracl_retrieval": { + "ndcg@10": 0.13942471586306499 + }, + "mldr_retrieval": { + "ndcg@10": 0.139069576010256 }, "mrtydi": { - "ndcg@10": 0.08154879873415645 + "ndcg@10": 0.07299085059942924 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.5835049460335981 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.6276035246534508 + "ndcg@10": 0.5863133806218087 }, "nlp_journal_title_abs": { - "ndcg@10": 0.5838785018803183 + "ndcg@10": 0.5743459511193183 }, "nlp_journal_title_intro": { - "ndcg@10": 0.3489329387182086 + "ndcg@10": 0.32465205260710006 } }, "STS": { "jsick": { - "spearman": 0.7463567093877269 + "spearman": 0.7525289500265361 }, "jsts": { - "spearman": 0.7468283806971927 + "spearman": 0.7466329702466956 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.41041888940251137 + "v_measure_score": 0.45840176801621957 }, "mewsc16": { - "v_measure_score": 0.45175891401665724 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6236711552090717 + "v_measure_score": 0.4407932537977668 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.38669286929581886 } } } \ No newline at end of file diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json index 9528312..e1c3e9c 100644 --- a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json +++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-base-unsup/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7619809437515043 + "macro_f1": 0.7640029182013914 }, "amazon_review_classification": { - "macro_f1": 0.5152108946679324 + "macro_f1": 0.5165133824101508 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8785996540635361 }, "massive_intent_classification": { - "macro_f1": 0.7895128475562229 + "macro_f1": 0.7815141648175687 }, "massive_scenario_classification": { - "macro_f1": 0.865430249169577 + "macro_f1": 0.8643739735863134 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8179797886754027 + }, + "wrime_classification": { + "macro_f1": 0.37929751450328747 } }, "Reranking": { "esci": { - "ndcg@10": 0.9115815294581953 + "ndcg@10": 0.9116742957456255 + }, + "jacwir_reranking": { + "ndcg@10": 0.6540921936468603 + }, + "jqara": { + "ndcg@10": 0.3839109493881204 + }, + "miracl_reranking": { + "ndcg@10": 0.7018821974047713 + }, + "mldr_reranking": { + "ndcg@10": 0.8442037101394532 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.4895140949755706 + }, "jagovfaqs_22k": { - "ndcg@10": 0.47387768939865055 + "ndcg@10": 0.48413330907538854 }, "jaqket": { - "ndcg@10": 0.3956683977353904 + "ndcg@10": 0.3872950509227257 + }, + "mintaka_retrieval": { + "ndcg@10": 0.25723625707011927 + }, + "miracl_retrieval": { + "ndcg@10": 0.2159968215066114 + }, + "mldr_retrieval": { + "ndcg@10": 0.18105368261359917 }, "mrtydi": { - "ndcg@10": 0.1144234568266308 + "ndcg@10": 0.11016096912346693 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.5890880676571459 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.6416096544574569 + "ndcg@10": 0.6005134171957127 }, "nlp_journal_title_abs": { - "ndcg@10": 0.7023477497744102 + "ndcg@10": 0.691482229451667 }, "nlp_journal_title_intro": { - "ndcg@10": 0.4536720868647063 + "ndcg@10": 0.377200379602747 } }, "STS": { "jsick": { - "spearman": 0.781770693640686 + "spearman": 0.7914302448138066 }, "jsts": { - "spearman": 0.7680617109850311 + "spearman": 
0.7677275529386515 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5301620892693397 + "v_measure_score": 0.4272210847614043 }, "mewsc16": { - "v_measure_score": 0.4034776723308173 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6238078417520311 + "v_measure_score": 0.39391604411456593 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.2641681900458691 } } } \ No newline at end of file diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json index b36686c..dad1d0c 100644 --- a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json +++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-sup/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7725250131648236 + "macro_f1": 0.7767065011282246 }, "amazon_review_classification": { - "macro_f1": 0.5341627023771393 + "macro_f1": 0.5348080733659045 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8928165629175933 }, "massive_intent_classification": { - "macro_f1": 0.7682863192709365 + "macro_f1": 0.7678594675802368 }, "massive_scenario_classification": { - "macro_f1": 0.8639396658321546 + "macro_f1": 0.8624414954250645 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8376983111767246 + }, + "wrime_classification": { + "macro_f1": 0.4088843388537483 } }, "Reranking": { "esci": { - "ndcg@10": 0.9094717381883379 + "ndcg@10": 0.9093431066849924 + }, + "jacwir_reranking": { + "ndcg@10": 0.6144762455614383 + }, + "jqara": { + "ndcg@10": 0.42466871751866847 + }, + "miracl_reranking": { + "ndcg@10": 0.7065312090166875 + }, + "mldr_reranking": { + "ndcg@10": 0.8742363417086798 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.4627911424268102 + }, "jagovfaqs_22k": { - "ndcg@10": 0.47038430326303626 + "ndcg@10": 0.4824617060944974 }, "jaqket": { - "ndcg@10": 0.44101304795602897 + "ndcg@10": 0.4416882664197474 + }, + "mintaka_retrieval": { + "ndcg@10": 0.28888654887615833 + }, + "miracl_retrieval": { + "ndcg@10": 0.1951539369285861 + }, + "mldr_retrieval": { + "ndcg@10": 0.18656064853165188 }, "mrtydi": { - "ndcg@10": 0.11429128335865787 + "ndcg@10": 0.11438786651077741 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.439694854198857 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.43434267808785576 + "ndcg@10": 0.40326645532241284 }, "nlp_journal_title_abs": { - "ndcg@10": 0.6240651697600803 + "ndcg@10": 0.6048895627840009 }, "nlp_journal_title_intro": { - "ndcg@10": 0.3651687833824759 + "ndcg@10": 0.36508949429446635 } }, "STS": { "jsick": { - "spearman": 0.787528927058734 + "spearman": 0.7876474308902304 }, "jsts": { - "spearman": 0.7781413957931619 + "spearman": 0.7782114794698556 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.48448646364489634 + "v_measure_score": 0.5129910499369752 }, "mewsc16": { - "v_measure_score": 0.43168522818790694 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6235418875927891 + "v_measure_score": 0.46267377071476495 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.3603960521680572 } } } \ No newline at end of file diff --git a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json index f620d50..cad831e 100644 --- a/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json +++ b/docs/results/MU-Kindai/Japanese-SimCSE-BERT-large-unsup/summary.json @@ 
-1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7635642561809131 + "macro_f1": 0.7655145272700131 }, "amazon_review_classification": { - "macro_f1": 0.5275222511867922 + "macro_f1": 0.5273281594091623 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8821782850442395 }, "massive_intent_classification": { - "macro_f1": 0.7688060073049678 + "macro_f1": 0.772169445045981 }, "massive_scenario_classification": { - "macro_f1": 0.8651446837233107 + "macro_f1": 0.8625146467158739 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8145447793317748 + }, + "wrime_classification": { + "macro_f1": 0.40382215327142257 } }, "Reranking": { "esci": { - "ndcg@10": 0.9129851570116734 + "ndcg@10": 0.9130235242422614 + }, + "jacwir_reranking": { + "ndcg@10": 0.6513884390883999 + }, + "jqara": { + "ndcg@10": 0.44959095699445484 + }, + "miracl_reranking": { + "ndcg@10": 0.7121442551193732 + }, + "mldr_reranking": { + "ndcg@10": 0.8679395106334268 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.5316167737103407 + }, "jagovfaqs_22k": { - "ndcg@10": 0.5014367709991477 + "ndcg@10": 0.5120263378587457 }, "jaqket": { - "ndcg@10": 0.4583812630740073 + "ndcg@10": 0.45810454318653493 + }, + "mintaka_retrieval": { + "ndcg@10": 0.30420713299186014 + }, + "miracl_retrieval": { + "ndcg@10": 0.260782337674165 + }, + "mldr_retrieval": { + "ndcg@10": 0.23652695166828322 }, "mrtydi": { - "ndcg@10": 0.13003320802922363 + "ndcg@10": 0.1306190778426387 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.5464834936384055 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.5508587506679636 + "ndcg@10": 0.5213267121181618 }, "nlp_journal_title_abs": { - "ndcg@10": 0.7497069192695408 + "ndcg@10": 0.7412764112062588 }, "nlp_journal_title_intro": { - "ndcg@10": 0.4524300499843447 + "ndcg@10": 0.4220927003134505 } }, "STS": { "jsick": { - "spearman": 0.7984403024596518 + "spearman": 0.7985649981589037 }, "jsts": { - "spearman": 0.7813685476201204 + "spearman": 0.7813825399856615 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5319881995988209 + "v_measure_score": 0.5491083580906443 }, "mewsc16": { - "v_measure_score": 0.4330807170988368 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6226614895870103 + "v_measure_score": 0.4267958807672512 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.3178045302473092 } } } \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-base-v2/summary.json b/docs/results/cl-nagoya/ruri-base-v2/summary.json new file mode 100644 index 0000000..c090ce8 --- /dev/null +++ b/docs/results/cl-nagoya/ruri-base-v2/summary.json @@ -0,0 +1,96 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7597182825660609 + }, + "amazon_review_classification": { + "macro_f1": 0.5554544939941979 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9235657959062215 + }, + "massive_intent_classification": { + "macro_f1": 0.8092593406289539 + }, + "massive_scenario_classification": { + "macro_f1": 0.8886710878440421 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8926416828413609 + }, + "wrime_classification": { + "macro_f1": 0.461674192977988 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9317155624145913 + }, + "jacwir_reranking": { + "ndcg@10": 0.8576025511447865 + }, + "jqara": { + "ndcg@10": 0.6066458919871698 + }, + "miracl_reranking": { + "ndcg@10": 0.842561072326263 + }, + "mldr_reranking": { + "ndcg@10": 0.8846847676615118 + } + }, + 
"Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8101096413526069 + }, + "jagovfaqs_22k": { + "ndcg@10": 0.7590325308586044 + }, + "jaqket": { + "ndcg@10": 0.5700921243106366 + }, + "mintaka_retrieval": { + "ndcg@10": 0.4417665675636218 + }, + "miracl_retrieval": { + "ndcg@10": 0.6821942595823656 + }, + "mldr_retrieval": { + "ndcg@10": 0.3773323411085737 + }, + "mrtydi": { + "ndcg@10": 0.4088554217076187 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.8805294567802572 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.8973083823806287 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9696059096853805 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.789314612552914 + } + }, + "STS": { + "jsick": { + "spearman": 0.8262585834114126 + }, + "jsts": { + "spearman": 0.8343314248100878 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5437561090974637 + }, + "mewsc16": { + "v_measure_score": 0.5060934807171409 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.3553392136864812 + } + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-base/summary.json b/docs/results/cl-nagoya/ruri-base/summary.json index a7c7b05..591ccd2 100644 --- a/docs/results/cl-nagoya/ruri-base/summary.json +++ b/docs/results/cl-nagoya/ruri-base/summary.json @@ -4,59 +4,93 @@ "macro_f1": 0.7665550732749669 }, "amazon_review_classification": { - "macro_f1": 0.5575876111411316 + "macro_f1": 0.5602315794414631 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.916854859845768 }, "massive_intent_classification": { - "macro_f1": 0.8141210121425055 + "macro_f1": 0.8122217429688374 }, "massive_scenario_classification": { - "macro_f1": 0.8848812917656395 + "macro_f1": 0.8861454528496383 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8773434580133629 + }, + "wrime_classification": { + "macro_f1": 0.4546702469392619 } }, "Reranking": { "esci": { - "ndcg@10": 0.9290942178703699 + "ndcg@10": 0.9291919623555276 + }, + "jacwir_reranking": { + "ndcg@10": 0.8723926273423869 + }, + "jqara": { + "ndcg@10": 0.5415330056104515 + }, + "miracl_reranking": { + "ndcg@10": 0.7921821114257664 + }, + "mldr_reranking": { + "ndcg@10": 0.8801076117078023 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8247892121220626 + }, "jagovfaqs_22k": { - "ndcg@10": 0.7455660589538348 + "ndcg@10": 0.7550451217031677 }, "jaqket": { - "ndcg@10": 0.5012253145754781 + "ndcg@10": 0.5023277717264268 + }, + "mintaka_retrieval": { + "ndcg@10": 0.45371270319906437 + }, + "miracl_retrieval": { + "ndcg@10": 0.5488453168704391 + }, + "mldr_retrieval": { + "ndcg@10": 0.35421737773497164 }, "mrtydi": { - "ndcg@10": 0.3545113073009125 + "ndcg@10": 0.3558845666232437 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.8664858820958761 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.8689204088388403 + "ndcg@10": 0.8723253192804757 }, "nlp_journal_title_abs": { - "ndcg@10": 0.9656989703684407 + "ndcg@10": 0.952690372948545 }, "nlp_journal_title_intro": { - "ndcg@10": 0.7531306059721564 + "ndcg@10": 0.7624967518065642 } }, "STS": { "jsick": { - "spearman": 0.8231772134744029 + "spearman": 0.8232158602892652 }, "jsts": { - "spearman": 0.8342848039994751 + "spearman": 0.8343499347567392 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5427223607801758 + "v_measure_score": 0.5669485444435229 }, "mewsc16": { - "v_measure_score": 0.5404099864321413 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6237623762376238 + "v_measure_score": 0.5205022529269108 + }, + 
"sib200_japanese_clustering": { + "v_measure_score": 0.3854934527391879 } } } \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-large-v2/summary.json b/docs/results/cl-nagoya/ruri-large-v2/summary.json new file mode 100644 index 0000000..e4a22b7 --- /dev/null +++ b/docs/results/cl-nagoya/ruri-large-v2/summary.json @@ -0,0 +1,96 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7950890220234579 + }, + "amazon_review_classification": { + "macro_f1": 0.5708906806011181 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.935661827685557 + }, + "massive_intent_classification": { + "macro_f1": 0.8087242075730218 + }, + "massive_scenario_classification": { + "macro_f1": 0.8970775785938794 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8471804883814585 + }, + "wrime_classification": { + "macro_f1": 0.47233151152826275 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9321133927024134 + }, + "jacwir_reranking": { + "ndcg@10": 0.8529056816630052 + }, + "jqara": { + "ndcg@10": 0.644692559122629 + }, + "miracl_reranking": { + "ndcg@10": 0.857799148388121 + }, + "mldr_reranking": { + "ndcg@10": 0.9068464851749977 + } + }, + "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8048616669652183 + }, + "jagovfaqs_22k": { + "ndcg@10": 0.7822527313926262 + }, + "jaqket": { + "ndcg@10": 0.6561070613824674 + }, + "mintaka_retrieval": { + "ndcg@10": 0.5040548535978852 + }, + "miracl_retrieval": { + "ndcg@10": 0.7046000072363299 + }, + "mldr_retrieval": { + "ndcg@10": 0.36969618230893564 + }, + "mrtydi": { + "ndcg@10": 0.4636780745156557 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.9085158509835447 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.9114732359476821 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.977434890774318 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.8232131912662143 + } + }, + "STS": { + "jsick": { + "spearman": 0.8212250726981067 + }, + "jsts": { + "spearman": 0.8424300570470996 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5562089376369613 + }, + "mewsc16": { + "v_measure_score": 0.509675337301281 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.4605817648504685 + } + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-large/summary.json b/docs/results/cl-nagoya/ruri-large/summary.json index e86c46b..2e2cead 100644 --- a/docs/results/cl-nagoya/ruri-large/summary.json +++ b/docs/results/cl-nagoya/ruri-large/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.8080806321853091 + "macro_f1": 0.7950391460082398 }, "amazon_review_classification": { - "macro_f1": 0.5680171450057119 + "macro_f1": 0.5685184036314727 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9356380708493385 }, "massive_intent_classification": { - "macro_f1": 0.8255898596881264 + "macro_f1": 0.8209962603450597 }, "massive_scenario_classification": { - "macro_f1": 0.8956410349938264 + "macro_f1": 0.9002551808707712 + }, + "sib200_japanese_classification": { + "macro_f1": 0.852564312646895 + }, + "wrime_classification": { + "macro_f1": 0.46447181564392015 } }, "Reranking": { "esci": { - "ndcg@10": 0.9298524733536755 + "ndcg@10": 0.9298778327436324 + }, + "jacwir_reranking": { + "ndcg@10": 0.8661076138203823 + }, + "jqara": { + "ndcg@10": 0.5958950681984889 + }, + "miracl_reranking": { + "ndcg@10": 0.8022791978749706 + }, + "mldr_reranking": { + "ndcg@10": 0.8690504682983363 } }, "Retrieval": { + 
"jacwir_retrieval": { + "ndcg@10": 0.8169123630823522 + }, "jagovfaqs_22k": { - "ndcg@10": 0.7667506664925435 + "ndcg@10": 0.7763829985024149 }, "jaqket": { - "ndcg@10": 0.6173871224245404 + "ndcg@10": 0.617343261611166 + }, + "mintaka_retrieval": { + "ndcg@10": 0.5106450721691843 + }, + "miracl_retrieval": { + "ndcg@10": 0.5547009159538185 + }, + "mldr_retrieval": { + "ndcg@10": 0.3476835812045506 }, "mrtydi": { - "ndcg@10": 0.3803302462897418 + "ndcg@10": 0.38120908812619875 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.8652992529882778 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.8712459719069233 + "ndcg@10": 0.8891161860918603 }, "nlp_journal_title_abs": { - "ndcg@10": 0.9657898747088243 + "ndcg@10": 0.9617411892426375 }, "nlp_journal_title_intro": { - "ndcg@10": 0.779665053945222 + "ndcg@10": 0.7922108957487803 } }, "STS": { "jsick": { - "spearman": 0.8199959693684533 + "spearman": 0.8199569498182433 }, "jsts": { - "spearman": 0.8426164139167538 + "spearman": 0.8426241685487486 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5139491572866559 + "v_measure_score": 0.5443732953428371 }, "mewsc16": { - "v_measure_score": 0.5225025331595674 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6228813559322034 + "v_measure_score": 0.5058998835740889 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.44757212682292163 } } } \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-small-v2/summary.json b/docs/results/cl-nagoya/ruri-small-v2/summary.json new file mode 100644 index 0000000..eec64ee --- /dev/null +++ b/docs/results/cl-nagoya/ruri-small-v2/summary.json @@ -0,0 +1,96 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7767065011282246 + }, + "amazon_review_classification": { + "macro_f1": 0.5559888936165459 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8863640825159859 + }, + "massive_intent_classification": { + "macro_f1": 0.8199647165894474 + }, + "massive_scenario_classification": { + "macro_f1": 0.8816435555944846 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8156946375922746 + }, + "wrime_classification": { + "macro_f1": 0.452255956789983 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9320364061675573 + }, + "jacwir_reranking": { + "ndcg@10": 0.8818198634914105 + }, + "jqara": { + "ndcg@10": 0.5670420631375501 + }, + "miracl_reranking": { + "ndcg@10": 0.8332825788093644 + }, + "mldr_reranking": { + "ndcg@10": 0.9009377977029078 + } + }, + "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8303842720270221 + }, + "jagovfaqs_22k": { + "ndcg@10": 0.7401670430071696 + }, + "jaqket": { + "ndcg@10": 0.6225429070303006 + }, + "mintaka_retrieval": { + "ndcg@10": 0.3530718504041533 + }, + "miracl_retrieval": { + "ndcg@10": 0.6689773236918534 + }, + "mldr_retrieval": { + "ndcg@10": 0.32577528652704146 + }, + "mrtydi": { + "ndcg@10": 0.42400768916861914 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.9064650891678154 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.9041671364705328 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9729556994161748 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.7821156819492701 + } + }, + "STS": { + "jsick": { + "spearman": 0.8387675357095226 + }, + "jsts": { + "spearman": 0.8193470885317312 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5260577746749562 + }, + "mewsc16": { + "v_measure_score": 0.4947076915300828 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.47820319421479446 + 
} + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-small/summary.json b/docs/results/cl-nagoya/ruri-small/summary.json index cb591ea..079db3e 100644 --- a/docs/results/cl-nagoya/ruri-small/summary.json +++ b/docs/results/cl-nagoya/ruri-small/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7991935990685706 + "macro_f1": 0.8055421233612723 }, "amazon_review_classification": { - "macro_f1": 0.556129066893332 + "macro_f1": 0.5541385299441624 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8885932202820669 }, "massive_intent_classification": { - "macro_f1": 0.8148895285345188 + "macro_f1": 0.8108237159349728 }, "massive_scenario_classification": { - "macro_f1": 0.8787774569382543 + "macro_f1": 0.8800077744996155 + }, + "sib200_japanese_classification": { + "macro_f1": 0.839667353042202 + }, + "wrime_classification": { + "macro_f1": 0.4595261443020403 } }, "Reranking": { "esci": { - "ndcg@10": 0.9300177985352138 + "ndcg@10": 0.9301438020851305 + }, + "jacwir_reranking": { + "ndcg@10": 0.8766726074179287 + }, + "jqara": { + "ndcg@10": 0.5325863556709908 + }, + "miracl_reranking": { + "ndcg@10": 0.7783787989685144 + }, + "mldr_reranking": { + "ndcg@10": 0.8813650067339368 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.825837748200516 + }, "jagovfaqs_22k": { - "ndcg@10": 0.736494039429321 + "ndcg@10": 0.740126693753929 }, "jaqket": { - "ndcg@10": 0.484437639428696 + "ndcg@10": 0.4844203596195783 + }, + "mintaka_retrieval": { + "ndcg@10": 0.3723496207549938 + }, + "miracl_retrieval": { + "ndcg@10": 0.5222032466588368 + }, + "mldr_retrieval": { + "ndcg@10": 0.2898890422890513 }, "mrtydi": { - "ndcg@10": 0.3342716158897666 + "ndcg@10": 0.3351374258570715 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.8689213841203763 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.8768878489670099 + "ndcg@10": 0.8723259697162892 }, "nlp_journal_title_abs": { - "ndcg@10": 0.9716879343439146 + "ndcg@10": 0.9619567235021281 }, "nlp_journal_title_intro": { - "ndcg@10": 0.7608660955794895 + "ndcg@10": 0.7608782792491423 } }, "STS": { "jsick": { - "spearman": 0.8343927017558587 + "spearman": 0.8344934497771457 }, "jsts": { - "spearman": 0.8213297790184827 + "spearman": 0.8213145808052514 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5096442244018489 + "v_measure_score": 0.5289736036070719 }, "mewsc16": { - "v_measure_score": 0.5141045788711239 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6211267605633802 + "v_measure_score": 0.4936801242208388 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.46507426407220503 } } } \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-v3-130m/summary.json b/docs/results/cl-nagoya/ruri-v3-130m/summary.json new file mode 100644 index 0000000..5700f32 --- /dev/null +++ b/docs/results/cl-nagoya/ruri-v3-130m/summary.json @@ -0,0 +1,96 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7674793827265536 + }, + "amazon_review_classification": { + "macro_f1": 0.5955994619477079 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9500285886600925 + }, + "massive_intent_classification": { + "macro_f1": 0.807938642045445 + }, + "massive_scenario_classification": { + "macro_f1": 0.8790346026671575 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8287806075978352 + }, + "wrime_classification": { + "macro_f1": 0.46634901067800855 + } + }, + "Reranking": { + "esci": { 
+ "ndcg@10": 0.9336981049156847 + }, + "jacwir_reranking": { + "ndcg@10": 0.8864670177419038 + }, + "jqara": { + "ndcg@10": 0.663018840039673 + }, + "miracl_reranking": { + "ndcg@10": 0.865876689917921 + }, + "mldr_reranking": { + "ndcg@10": 0.9362058245511219 + } + }, + "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8421113535976967 + }, + "jagovfaqs_22k": { + "ndcg@10": 0.7532393338902414 + }, + "jaqket": { + "ndcg@10": 0.730979460582779 + }, + "mintaka_retrieval": { + "ndcg@10": 0.5177034569356731 + }, + "miracl_retrieval": { + "ndcg@10": 0.7100959869376436 + }, + "mldr_retrieval": { + "ndcg@10": 0.45158335316076936 + }, + "mrtydi": { + "ndcg@10": 0.4780012151028164 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.995144547086835 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.9887952520028016 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9795152116360624 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.9628103840588119 + } + }, + "STS": { + "jsick": { + "spearman": 0.7885956280300046 + }, + "jsts": { + "spearman": 0.8323603869543141 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5436288048604071 + }, + "mewsc16": { + "v_measure_score": 0.4883532965483729 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.5019988844015973 + } + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-v3-30m/summary.json b/docs/results/cl-nagoya/ruri-v3-30m/summary.json new file mode 100644 index 0000000..c4e768a --- /dev/null +++ b/docs/results/cl-nagoya/ruri-v3-30m/summary.json @@ -0,0 +1,96 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.7559571782387728 + }, + "amazon_review_classification": { + "macro_f1": 0.5570789457429248 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9262839486939813 + }, + "massive_intent_classification": { + "macro_f1": 0.783074979041957 + }, + "massive_scenario_classification": { + "macro_f1": 0.8672396605716526 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8140481078951145 + }, + "wrime_classification": { + "macro_f1": 0.4311261750368354 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9305651903486406 + }, + "jacwir_reranking": { + "ndcg@10": 0.8761294751423317 + }, + "jqara": { + "ndcg@10": 0.5747490185208084 + }, + "miracl_reranking": { + "ndcg@10": 0.8352458113588647 + }, + "mldr_reranking": { + "ndcg@10": 0.9297421530365237 + } + }, + "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.827028266156452 + }, + "jagovfaqs_22k": { + "ndcg@10": 0.7020872105862214 + }, + "jaqket": { + "ndcg@10": 0.6244733500896729 + }, + "mintaka_retrieval": { + "ndcg@10": 0.4304756847175998 + }, + "miracl_retrieval": { + "ndcg@10": 0.6498916988979277 + }, + "mldr_retrieval": { + "ndcg@10": 0.4577076048703079 + }, + "mrtydi": { + "ndcg@10": 0.41775750844113785 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.9876046427100846 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.9916030162169887 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9699245797579602 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.9534027111106339 + } + }, + "STS": { + "jsick": { + "spearman": 0.8161946935797372 + }, + "jsts": { + "spearman": 0.819463211043541 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5369067977199252 + }, + "mewsc16": { + "v_measure_score": 0.47961175798341066 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.4804316290090649 + } + } +} \ No newline at end of file diff --git 
a/docs/results/cl-nagoya/ruri-v3-310m/summary.json b/docs/results/cl-nagoya/ruri-v3-310m/summary.json new file mode 100644 index 0000000..c27fed8 --- /dev/null +++ b/docs/results/cl-nagoya/ruri-v3-310m/summary.json @@ -0,0 +1,96 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.8009270010529765 + }, + "amazon_review_classification": { + "macro_f1": 0.6071898527482484 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9530657500380437 + }, + "massive_intent_classification": { + "macro_f1": 0.8176293812793415 + }, + "massive_scenario_classification": { + "macro_f1": 0.890051922198645 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8812655271153628 + }, + "wrime_classification": { + "macro_f1": 0.4852854023445756 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9342725351989479 + }, + "jacwir_reranking": { + "ndcg@10": 0.8845859005757672 + }, + "jqara": { + "ndcg@10": 0.6893206802955604 + }, + "miracl_reranking": { + "ndcg@10": 0.8500853284469898 + }, + "mldr_reranking": { + "ndcg@10": 0.9335769070370818 + } + }, + "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8406411130636801 + }, + "jagovfaqs_22k": { + "ndcg@10": 0.7648595155366429 + }, + "jaqket": { + "ndcg@10": 0.7186721885111346 + }, + "mintaka_retrieval": { + "ndcg@10": 0.5225348075920366 + }, + "miracl_retrieval": { + "ndcg@10": 0.677145342243983 + }, + "mldr_retrieval": { + "ndcg@10": 0.43425275955863796 + }, + "mrtydi": { + "ndcg@10": 0.47064490316120666 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.9958682142366949 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.9935172926595653 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9790717306095701 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.9658294271714906 + } + }, + "STS": { + "jsick": { + "spearman": 0.7886332339318622 + }, + "jsts": { + "spearman": 0.8430847366018317 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5855988614657296 + }, + "mewsc16": { + "v_measure_score": 0.4860478393120035 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.4440626045366051 + } + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/ruri-v3-70m/summary.json b/docs/results/cl-nagoya/ruri-v3-70m/summary.json new file mode 100644 index 0000000..3a2c52d --- /dev/null +++ b/docs/results/cl-nagoya/ruri-v3-70m/summary.json @@ -0,0 +1,96 @@ +{ + "Classification": { + "amazon_counterfactual_classification": { + "macro_f1": 0.8180877928218353 + }, + "amazon_review_classification": { + "macro_f1": 0.5798379850008339 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9339140455312027 + }, + "massive_intent_classification": { + "macro_f1": 0.7891754112354649 + }, + "massive_scenario_classification": { + "macro_f1": 0.8782518076402043 + }, + "sib200_japanese_classification": { + "macro_f1": 0.7686616284901401 + }, + "wrime_classification": { + "macro_f1": 0.4437562280187194 + } + }, + "Reranking": { + "esci": { + "ndcg@10": 0.9320237969329785 + }, + "jacwir_reranking": { + "ndcg@10": 0.8748197118530385 + }, + "jqara": { + "ndcg@10": 0.6309432249818713 + }, + "miracl_reranking": { + "ndcg@10": 0.8503057292439823 + }, + "mldr_reranking": { + "ndcg@10": 0.9225778620264797 + } + }, + "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8275893500639571 + }, + "jagovfaqs_22k": { + "ndcg@10": 0.7327144021448485 + }, + "jaqket": { + "ndcg@10": 0.6768047159335538 + }, + "mintaka_retrieval": { + "ndcg@10": 0.4626106409683068 + }, + "miracl_retrieval": { + "ndcg@10": 
0.6797764462851262 + }, + "mldr_retrieval": { + "ndcg@10": 0.43554376517918675 + }, + "mrtydi": { + "ndcg@10": 0.4499999994407917 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.984966699117648 + }, + "nlp_journal_abs_intro": { + "ndcg@10": 0.9868218521221748 + }, + "nlp_journal_title_abs": { + "ndcg@10": 0.9706955197203543 + }, + "nlp_journal_title_intro": { + "ndcg@10": 0.9573354583951488 + } + }, + "STS": { + "jsick": { + "spearman": 0.7909930894957667 + }, + "jsts": { + "spearman": 0.828242284804404 + } + }, + "Clustering": { + "livedoor_news": { + "v_measure_score": 0.5492094636693866 + }, + "mewsc16": { + "v_measure_score": 0.47739615416643866 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.4719940146272088 + } + } +} \ No newline at end of file diff --git a/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json b/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json index 42cc5ff..45ec65b 100644 --- a/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json +++ b/docs/results/cl-nagoya/sup-simcse-ja-base/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7234436301724776 + "macro_f1": 0.7192545517004465 }, "amazon_review_classification": { - "macro_f1": 0.5441445333270086 + "macro_f1": 0.5454422812215437 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9100588500656168 }, "massive_intent_classification": { - "macro_f1": 0.7951973953020242 + "macro_f1": 0.8011172170046241 }, "massive_scenario_classification": { - "macro_f1": 0.8760200177186923 + "macro_f1": 0.8762609424720998 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8191722798191963 + }, + "wrime_classification": { + "macro_f1": 0.4188203301151871 } }, "Reranking": { "esci": { - "ndcg@10": 0.9183455876236017 + "ndcg@10": 0.9184207070049463 + }, + "jacwir_reranking": { + "ndcg@10": 0.6426611140199804 + }, + "jqara": { + "ndcg@10": 0.3748362133870952 + }, + "miracl_reranking": { + "ndcg@10": 0.7087840971938433 + }, + "mldr_reranking": { + "ndcg@10": 0.8734013475096433 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.5331630522529377 + }, "jagovfaqs_22k": { - "ndcg@10": 0.5161990612242935 + "ndcg@10": 0.5202480516932524 }, "jaqket": { - "ndcg@10": 0.5024513438428565 + "ndcg@10": 0.5013089667314551 + }, + "mintaka_retrieval": { + "ndcg@10": 0.3288294149496304 + }, + "miracl_retrieval": { + "ndcg@10": 0.20681341934572967 + }, + "mldr_retrieval": { + "ndcg@10": 0.24700329716018354 }, "mrtydi": { - "ndcg@10": 0.13976323269046823 + "ndcg@10": 0.141360680613414 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.6909104560170936 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.6807886421530585 + "ndcg@10": 0.6619434888289687 }, "nlp_journal_title_abs": { - "ndcg@10": 0.6570889175649209 + "ndcg@10": 0.6484407439307039 }, "nlp_journal_title_intro": { - "ndcg@10": 0.48219159577174137 + "ndcg@10": 0.4696725603511326 } }, "STS": { "jsick": { - "spearman": 0.8282816229512862 + "spearman": 0.8283659349049672 }, "jsts": { - "spearman": 0.8127259236647225 + "spearman": 0.8126484380435667 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5266774168531417 + "v_measure_score": 0.5248555489302708 }, "mewsc16": { - "v_measure_score": 0.5091016872016825 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6256665481692143 + "v_measure_score": 0.5339141639252604 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.49207894013578146 } } } \ No newline at end of file diff --git 
a/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json b/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json index a2d8924..3d0bb71 100644 --- a/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json +++ b/docs/results/cl-nagoya/sup-simcse-ja-large/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7321444865928852 + "macro_f1": 0.7260568612881779 }, "amazon_review_classification": { - "macro_f1": 0.5475800661400465 + "macro_f1": 0.5455832826466495 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8942024454984163 }, "massive_intent_classification": { - "macro_f1": 0.7922802742146243 + "macro_f1": 0.792273118014186 }, "massive_scenario_classification": { - "macro_f1": 0.8772172454209797 + "macro_f1": 0.8770657195206764 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8042709569831964 + }, + "wrime_classification": { + "macro_f1": 0.4525777476393026 } }, "Reranking": { "esci": { - "ndcg@10": 0.9148471751378899 + "ndcg@10": 0.9149640515619839 + }, + "jacwir_reranking": { + "ndcg@10": 0.5614550878114778 + }, + "jqara": { + "ndcg@10": 0.38302855218604437 + }, + "miracl_reranking": { + "ndcg@10": 0.7126433285790728 + }, + "mldr_reranking": { + "ndcg@10": 0.8659821811381412 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.4370774500135088 + }, "jagovfaqs_22k": { - "ndcg@10": 0.4683673504170269 + "ndcg@10": 0.47421467281855384 }, "jaqket": { - "ndcg@10": 0.39878189118804513 + "ndcg@10": 0.4004385277719307 + }, + "mintaka_retrieval": { + "ndcg@10": 0.376774984849213 + }, + "miracl_retrieval": { + "ndcg@10": 0.18125969161337505 + }, + "mldr_retrieval": { + "ndcg@10": 0.23480755788261093 }, "mrtydi": { - "ndcg@10": 0.11834919561027905 + "ndcg@10": 0.1188048690188868 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.6407825080386719 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.634254459552888 + "ndcg@10": 0.6295135121177772 }, "nlp_journal_title_abs": { - "ndcg@10": 0.37927566884615427 + "ndcg@10": 0.36949537039923136 }, "nlp_journal_title_intro": { - "ndcg@10": 0.25787534957423713 + "ndcg@10": 0.2490316613470849 } }, "STS": { "jsick": { - "spearman": 0.837959537101532 + "spearman": 0.8377753687267541 }, "jsts": { - "spearman": 0.825691902117111 + "spearman": 0.8256006176068381 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5074967876488787 + "v_measure_score": 0.5337915256082275 }, "mewsc16": { - "v_measure_score": 0.503782014677764 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6250885896527285 + "v_measure_score": 0.5111565926265328 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.45736658859438273 } } } \ No newline at end of file diff --git a/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json b/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json index 3863c9e..bae07a1 100644 --- a/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json +++ b/docs/results/cl-nagoya/unsup-simcse-ja-base/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7330185800774036 + "macro_f1": 0.7364790582283407 }, "amazon_review_classification": { - "macro_f1": 0.5392887528271114 + "macro_f1": 0.5413541626836352 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8986588956343088 }, "massive_intent_classification": { - "macro_f1": 0.7907120296283751 + "macro_f1": 0.7767897385750657 }, "massive_scenario_classification": { - "macro_f1": 0.8597097942715117 + "macro_f1": 0.8610390686035142 + }, + 
"sib200_japanese_classification": { + "macro_f1": 0.8413013579577491 + }, + "wrime_classification": { + "macro_f1": 0.41309966752995253 } }, "Reranking": { "esci": { - "ndcg@10": 0.9115668272308735 + "ndcg@10": 0.9117818311636607 + }, + "jacwir_reranking": { + "ndcg@10": 0.5154239181007129 + }, + "jqara": { + "ndcg@10": 0.3218696921394324 + }, + "miracl_reranking": { + "ndcg@10": 0.6995597032253587 + }, + "mldr_reranking": { + "ndcg@10": 0.8612256071032377 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.35106925427500363 + }, "jagovfaqs_22k": { - "ndcg@10": 0.46003459081522513 + "ndcg@10": 0.4673719618749888 }, "jaqket": { - "ndcg@10": 0.3945725593125862 + "ndcg@10": 0.3951670829019162 + }, + "mintaka_retrieval": { + "ndcg@10": 0.299231152726057 + }, + "miracl_retrieval": { + "ndcg@10": 0.10934136213023636 + }, + "mldr_retrieval": { + "ndcg@10": 0.15981611825721914 }, "mrtydi": { - "ndcg@10": 0.055507775092798486 + "ndcg@10": 0.055133639963568334 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.582165240647806 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.6025847751308843 + "ndcg@10": 0.5841104498413489 }, "nlp_journal_title_abs": { - "ndcg@10": 0.5562839869857912 + "ndcg@10": 0.55577879846708 }, "nlp_journal_title_intro": { - "ndcg@10": 0.3449181162324482 + "ndcg@10": 0.3284050897756761 } }, "STS": { "jsick": { - "spearman": 0.7849379492955117 + "spearman": 0.7852600594448598 }, "jsts": { - "spearman": 0.7894946592483818 + "spearman": 0.7894496424482047 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5223347838445698 + "v_measure_score": 0.4936084943071576 }, "mewsc16": { - "v_measure_score": 0.37310458219601117 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.624424778761062 + "v_measure_score": 0.3743327976467685 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.3592578922316612 } } } \ No newline at end of file diff --git a/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json b/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json index d37618a..09525c9 100644 --- a/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json +++ b/docs/results/cl-nagoya/unsup-simcse-ja-large/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.767905114979583 + "macro_f1": 0.7640316468319925 }, "amazon_review_classification": { - "macro_f1": 0.5537089641846143 + "macro_f1": 0.5504736753163985 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9057099704855596 }, "massive_intent_classification": { - "macro_f1": 0.7912698845073401 + "macro_f1": 0.792495956569193 }, "massive_scenario_classification": { - "macro_f1": 0.8736185210672394 + "macro_f1": 0.8749858164207054 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8288719236604842 + }, + "wrime_classification": { + "macro_f1": 0.44326523397693174 } }, "Reranking": { "esci": { - "ndcg@10": 0.9095494729022622 + "ndcg@10": 0.9094836571513687 + }, + "jacwir_reranking": { + "ndcg@10": 0.5417192948613557 + }, + "jqara": { + "ndcg@10": 0.3877939946491903 + }, + "miracl_reranking": { + "ndcg@10": 0.7001887861606321 + }, + "mldr_reranking": { + "ndcg@10": 0.8303617273610736 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.37613574135010835 + }, "jagovfaqs_22k": { - "ndcg@10": 0.4509073581555124 + "ndcg@10": 0.46564010373437337 }, "jaqket": { - "ndcg@10": 0.34595043675331943 + "ndcg@10": 0.3452888488420233 + }, + "mintaka_retrieval": { + "ndcg@10": 0.3058130510308383 + }, + "miracl_retrieval": { + "ndcg@10": 
0.10326154138228141 + }, + "mldr_retrieval": { + "ndcg@10": 0.12550430031143336 }, "mrtydi": { - "ndcg@10": 0.05750859876901772 + "ndcg@10": 0.057502989435967655 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.504469050615059 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.550742021417855 + "ndcg@10": 0.5069650402920987 }, "nlp_journal_title_abs": { - "ndcg@10": 0.6307172007359215 + "ndcg@10": 0.6043158227609278 }, "nlp_journal_title_intro": { - "ndcg@10": 0.39612451822677164 + "ndcg@10": 0.34323430832579677 } }, "STS": { "jsick": { - "spearman": 0.8014979086154339 + "spearman": 0.8013849170804103 }, "jsts": { - "spearman": 0.8097685749017456 + "spearman": 0.809789575264219 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5090447587797094 + "v_measure_score": 0.5147732775967515 }, "mewsc16": { - "v_measure_score": 0.4591920015613856 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6248671625929861 + "v_measure_score": 0.44443267597570074 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.34646662604886447 } } } \ No newline at end of file diff --git a/docs/results/colorfulscoop/sbert-base-ja/summary.json b/docs/results/colorfulscoop/sbert-base-ja/summary.json index 2a08044..0f2bf84 100644 --- a/docs/results/colorfulscoop/sbert-base-ja/summary.json +++ b/docs/results/colorfulscoop/sbert-base-ja/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7221023294352484 + "macro_f1": 0.7080315613053877 }, "amazon_review_classification": { - "macro_f1": 0.47952384496155054 + "macro_f1": 0.4779713813897666 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8350239953633378 }, "massive_intent_classification": { - "macro_f1": 0.725195343788811 + "macro_f1": 0.7288673932703351 }, "massive_scenario_classification": { - "macro_f1": 0.836177960542408 + "macro_f1": 0.8370655127879382 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8262660922438109 + }, + "wrime_classification": { + "macro_f1": 0.35057897749310646 } }, "Reranking": { "esci": { - "ndcg@10": 0.8997301146575819 + "ndcg@10": 0.8996866702578056 + }, + "jacwir_reranking": { + "ndcg@10": 0.37147215136686634 + }, + "jqara": { + "ndcg@10": 0.2220517076242275 + }, + "miracl_reranking": { + "ndcg@10": 0.6502702968219343 + }, + "mldr_reranking": { + "ndcg@10": 0.8255483571039144 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.192984468642645 + }, "jagovfaqs_22k": { - "ndcg@10": 0.21501915127957166 + "ndcg@10": 0.21704292684612675 }, "jaqket": { - "ndcg@10": 0.13161989528541293 + "ndcg@10": 0.13139887002144995 + }, + "mintaka_retrieval": { + "ndcg@10": 0.19067862146114167 + }, + "miracl_retrieval": { + "ndcg@10": 0.018598782450328283 + }, + "mldr_retrieval": { + "ndcg@10": 0.06972936265190934 }, "mrtydi": { - "ndcg@10": 0.00436010196904899 + "ndcg@10": 0.004126228941345733 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.29023294982669573 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.2878020264605714 + "ndcg@10": 0.2580237968832312 }, "nlp_journal_title_abs": { - "ndcg@10": 0.22397059858982324 + "ndcg@10": 0.21071404885072903 }, "nlp_journal_title_intro": { - "ndcg@10": 0.12815871897103842 + "ndcg@10": 0.11573741610386916 } }, "STS": { "jsick": { - "spearman": 0.6659298300713198 + "spearman": 0.6656074999372202 }, "jsts": { - "spearman": 0.7423952309826243 + "spearman": 0.7425444938991701 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.4298579019834722 + "v_measure_score": 0.4059869097583984 }, 
"mewsc16": { - "v_measure_score": 0.46641671645082333 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6231013776050865 + "v_measure_score": 0.46242491131769853 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.3035702180528845 } } } \ No newline at end of file diff --git a/docs/results/intfloat/multilingual-e5-base/summary.json b/docs/results/intfloat/multilingual-e5-base/summary.json index 96f9640..4d84be2 100644 --- a/docs/results/intfloat/multilingual-e5-base/summary.json +++ b/docs/results/intfloat/multilingual-e5-base/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.6367079139150691 + "macro_f1": 0.6428957534047911 }, "amazon_review_classification": { - "macro_f1": 0.5424265794470897 + "macro_f1": 0.5417258327796466 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9231910434886872 }, "massive_intent_classification": { - "macro_f1": 0.7277503514873049 + "macro_f1": 0.7318717264077053 }, "massive_scenario_classification": { - "macro_f1": 0.8652828949015864 + "macro_f1": 0.8677940980663801 + }, + "sib200_japanese_classification": { + "macro_f1": 0.785022714268383 + }, + "wrime_classification": { + "macro_f1": 0.3865061394465788 } }, "Reranking": { "esci": { - "ndcg@10": 0.9285060467194839 + "ndcg@10": 0.9290148108090969 + }, + "jacwir_reranking": { + "ndcg@10": 0.8865491934939191 + }, + "jqara": { + "ndcg@10": 0.4761308479065645 + }, + "miracl_reranking": { + "ndcg@10": 0.8196779545649944 + }, + "mldr_reranking": { + "ndcg@10": 0.8614612823139557 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8431602298737804 + }, "jagovfaqs_22k": { - "ndcg@10": 0.6534478396845428 + "ndcg@10": 0.687214041967885 }, "jaqket": { - "ndcg@10": 0.5067444792013236 + "ndcg@10": 0.5169392915456349 + }, + "mintaka_retrieval": { + "ndcg@10": 0.34676383987252357 + }, + "miracl_retrieval": { + "ndcg@10": 0.6449511893902589 + }, + "mldr_retrieval": { + "ndcg@10": 0.2573147838464383 }, "mrtydi": { - "ndcg@10": 0.3837652120001251 + "ndcg@10": 0.42298287793585587 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.8355946539433561 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.8709767034225332 + "ndcg@10": 0.8447862631398672 }, "nlp_journal_title_abs": { - "ndcg@10": 0.9473129303429082 + "ndcg@10": 0.9461907998491789 }, "nlp_journal_title_intro": { - "ndcg@10": 0.7304538728893641 + "ndcg@10": 0.7469571396756213 } }, "STS": { "jsick": { - "spearman": 0.8128058660848744 + "spearman": 0.8125544166626103 }, "jsts": { - "spearman": 0.7839196475937381 + "spearman": 0.7965480195299134 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5502694126615243 + "v_measure_score": 0.5379041349111564 }, "mewsc16": { - "v_measure_score": 0.41494514000218946 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6226482073127441 + "v_measure_score": 0.4943772106331262 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.4713134178805946 } } } \ No newline at end of file diff --git a/docs/results/intfloat/multilingual-e5-large/summary.json b/docs/results/intfloat/multilingual-e5-large/summary.json index a28c470..40752a5 100644 --- a/docs/results/intfloat/multilingual-e5-large/summary.json +++ b/docs/results/intfloat/multilingual-e5-large/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.706580687830688 + "macro_f1": 0.6969861236021963 }, "amazon_review_classification": { - "macro_f1": 0.5653992303516462 + "macro_f1": 
0.5763612743026115 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9554866923455646 }, "massive_intent_classification": { - "macro_f1": 0.7577710251429624 + "macro_f1": 0.7401244088033258 }, "massive_scenario_classification": { - "macro_f1": 0.8859090262583831 + "macro_f1": 0.887053685338159 + }, + "sib200_japanese_classification": { + "macro_f1": 0.7811476853348774 + }, + "wrime_classification": { + "macro_f1": 0.42377599926222737 } }, "Reranking": { "esci": { - "ndcg@10": 0.9296254722183955 + "ndcg@10": 0.9330712866652149 + }, + "jacwir_reranking": { + "ndcg@10": 0.9036816685131848 + }, + "jqara": { + "ndcg@10": 0.561374764136422 + }, + "miracl_reranking": { + "ndcg@10": 0.8631195198401651 + }, + "mldr_reranking": { + "ndcg@10": 0.8891328806594833 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8641271530674604 + }, "jagovfaqs_22k": { - "ndcg@10": 0.7030214336558751 + "ndcg@10": 0.7297746711291291 }, "jaqket": { - "ndcg@10": 0.5878065301444064 + "ndcg@10": 0.5967326588135612 + }, + "mintaka_retrieval": { + "ndcg@10": 0.3958992445664435 + }, + "miracl_retrieval": { + "ndcg@10": 0.7095604570396511 + }, + "mldr_retrieval": { + "ndcg@10": 0.2984972238105224 }, "mrtydi": { - "ndcg@10": 0.4363167873386172 + "ndcg@10": 0.4781603349494696 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.8326468852967057 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.8600225120389309 + "ndcg@10": 0.8571088737195884 }, "nlp_journal_title_abs": { - "ndcg@10": 0.9469712765040588 + "ndcg@10": 0.952870249874937 }, "nlp_journal_title_intro": { - "ndcg@10": 0.7248023877969718 + "ndcg@10": 0.7257268520360993 } }, "STS": { "jsick": { - "spearman": 0.7840335060728089 + "spearman": 0.7985423882395024 }, "jsts": { - "spearman": 0.8098724997856234 + "spearman": 0.8186303902222064 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5713023706914878 + "v_measure_score": 0.5157643001398088 }, "mewsc16": { - "v_measure_score": 0.4534484706354193 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.621496984746364 + "v_measure_score": 0.46806674695304834 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.5334765362912619 } } } \ No newline at end of file diff --git a/docs/results/intfloat/multilingual-e5-small/summary.json b/docs/results/intfloat/multilingual-e5-small/summary.json index 99a4423..af62c84 100644 --- a/docs/results/intfloat/multilingual-e5-small/summary.json +++ b/docs/results/intfloat/multilingual-e5-small/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.6214130966524566 + "macro_f1": 0.5866005078388893 }, "amazon_review_classification": { - "macro_f1": 0.5127428912860463 + "macro_f1": 0.5120598395740691 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8773239262941632 }, "massive_intent_classification": { - "macro_f1": 0.7085230519111091 + "macro_f1": 0.7134377059258787 }, "massive_scenario_classification": { - "macro_f1": 0.8622036829599259 + "macro_f1": 0.8676947906742417 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8177503141758454 + }, + "wrime_classification": { + "macro_f1": 0.36913347435432137 } }, "Reranking": { "esci": { - "ndcg@10": 0.9303349187158247 + "ndcg@10": 0.9298402731760124 + }, + "jacwir_reranking": { + "ndcg@10": 0.8998812594907971 + }, + "jqara": { + "ndcg@10": 0.49280220404951935 + }, + "miracl_reranking": { + "ndcg@10": 0.8178461260193638 + }, + "mldr_reranking": { + "ndcg@10": 0.864145360860429 } }, "Retrieval": { + "jacwir_retrieval": { + 
"ndcg@10": 0.8558160940470637 + }, "jagovfaqs_22k": { - "ndcg@10": 0.6411252958220891 + "ndcg@10": 0.6568760244912849 }, "jaqket": { - "ndcg@10": 0.49966509556428645 + "ndcg@10": 0.5157123960708363 + }, + "mintaka_retrieval": { + "ndcg@10": 0.3153737960263929 + }, + "miracl_retrieval": { + "ndcg@10": 0.6323300168472976 + }, + "mldr_retrieval": { + "ndcg@10": 0.2590832302769219 }, "mrtydi": { - "ndcg@10": 0.36054822913647616 + "ndcg@10": 0.4236692119753354 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.8396508926780583 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.8520749151982298 + "ndcg@10": 0.8409842458346825 }, "nlp_journal_title_abs": { - "ndcg@10": 0.9526123412781002 + "ndcg@10": 0.9447219194706624 }, "nlp_journal_title_intro": { - "ndcg@10": 0.729906931983999 + "ndcg@10": 0.7455737280382885 } }, "STS": { "jsick": { - "spearman": 0.8150271836013705 + "spearman": 0.8199946308873799 }, "jsts": { - "spearman": 0.786450077409501 + "spearman": 0.7892106647109823 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5470075389200084 + "v_measure_score": 0.5904685845799247 }, "mewsc16": { - "v_measure_score": 0.391226933590049 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6219382321618744 + "v_measure_score": 0.5233814767010047 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.43592128019411325 } } } \ No newline at end of file diff --git a/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json b/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json index 6b7309a..f9fbe6f 100644 --- a/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json +++ b/docs/results/oshizo/sbert-jsnli-luke-japanese-base-lite/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7994675369288904 + "macro_f1": 0.7972419438068292 }, "amazon_review_classification": { - "macro_f1": 0.5748206591211895 + "macro_f1": 0.575066739799988 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9207590996896718 }, "massive_intent_classification": { - "macro_f1": 0.8025949222725076 + "macro_f1": 0.8015558847211773 }, "massive_scenario_classification": { - "macro_f1": 0.8875250742566655 + "macro_f1": 0.8878291337617034 + }, + "sib200_japanese_classification": { + "macro_f1": 0.7731122315942124 + }, + "wrime_classification": { + "macro_f1": 0.4573111522822367 } }, "Reranking": { "esci": { - "ndcg@10": 0.9156331205981866 + "ndcg@10": 0.9157948249893592 + }, + "jacwir_reranking": { + "ndcg@10": 0.674104660572769 + }, + "jqara": { + "ndcg@10": 0.35765029945439447 + }, + "miracl_reranking": { + "ndcg@10": 0.68225515961945 + }, + "mldr_reranking": { + "ndcg@10": 0.8538476294446257 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.59611064776643 + }, "jagovfaqs_22k": { - "ndcg@10": 0.519938655947725 + "ndcg@10": 0.5403287346696719 }, "jaqket": { - "ndcg@10": 0.4206746951743811 + "ndcg@10": 0.42113936906002564 + }, + "mintaka_retrieval": { + "ndcg@10": 0.2482827887837841 + }, + "miracl_retrieval": { + "ndcg@10": 0.1928427319999251 + }, + "mldr_retrieval": { + "ndcg@10": 0.19084474235068657 }, "mrtydi": { - "ndcg@10": 0.10116108109776817 + "ndcg@10": 0.10090455185771262 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.44067635335327865 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.4930421996747514 + "ndcg@10": 0.44837143094362086 }, "nlp_journal_title_abs": { - "ndcg@10": 0.719369187830078 + "ndcg@10": 0.7368252250653567 }, "nlp_journal_title_intro": { - "ndcg@10": 
0.3258568875005778 + "ndcg@10": 0.3115238718909808 } }, "STS": { "jsick": { - "spearman": 0.7211422898060521 + "spearman": 0.7203759702575281 }, "jsts": { - "spearman": 0.8109305772255819 + "spearman": 0.8107670759374308 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.4677177349822789 + "v_measure_score": 0.4816771908212549 }, "mewsc16": { - "v_measure_score": 0.5389209739242912 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6237623762376237 + "v_measure_score": 0.5336022487793333 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.43034104597999767 } } } \ No newline at end of file diff --git a/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json index 7318aab..6d1041e 100644 --- a/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json +++ b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7492232749031491 + "macro_f1": 0.7528271196943096 }, "amazon_review_classification": { - "macro_f1": 0.5530707609927811 + "macro_f1": 0.5518771080100612 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.892368025976312 }, "massive_intent_classification": { - "macro_f1": 0.7979144461303402 + "macro_f1": 0.7872725195473699 }, "massive_scenario_classification": { - "macro_f1": 0.8683641924034757 + "macro_f1": 0.8713846348082936 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8583089323083904 + }, + "wrime_classification": { + "macro_f1": 0.4323129039345514 } }, "Reranking": { "esci": { - "ndcg@10": 0.9301469431250418 + "ndcg@10": 0.9301525338489429 + }, + "jacwir_reranking": { + "ndcg@10": 0.8827390816541736 + }, + "jqara": { + "ndcg@10": 0.6070225247152883 + }, + "miracl_reranking": { + "ndcg@10": 0.8243623644224994 + }, + "mldr_reranking": { + "ndcg@10": 0.887121388271364 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8385011452405416 + }, "jagovfaqs_22k": { - "ndcg@10": 0.6979374757372254 + "ndcg@10": 0.6984652569482365 }, "jaqket": { - "ndcg@10": 0.6729417850207029 + "ndcg@10": 0.6751948574643762 + }, + "mintaka_retrieval": { + "ndcg@10": 0.3957491894384977 + }, + "miracl_retrieval": { + "ndcg@10": 0.652881832622734 + }, + "mldr_retrieval": { + "ndcg@10": 0.3374776122444277 }, "mrtydi": { - "ndcg@10": 0.41858579533990486 + "ndcg@10": 0.4167021902708705 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.899055473429718 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.9029337913460675 + "ndcg@10": 0.9008045583912581 }, "nlp_journal_title_abs": { - "ndcg@10": 0.9511153967130517 + "ndcg@10": 0.9566816164352073 }, "nlp_journal_title_intro": { - "ndcg@10": 0.7580448576047344 + "ndcg@10": 0.757906107708436 } }, "STS": { "jsick": { - "spearman": 0.849637366944316 + "spearman": 0.8494858386977019 }, "jsts": { - "spearman": 0.8095684318108997 + "spearman": 0.8095670694135243 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5151536908540161 + "v_measure_score": 0.5446091559116468 }, "mewsc16": { - "v_measure_score": 0.45782610528001805 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.623716814159292 + "v_measure_score": 0.4611859858929692 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.43979504978761347 } } -} +} \ No newline at end of file diff --git a/docs/results/pkshatech/GLuCoSE-base-ja/summary.json b/docs/results/pkshatech/GLuCoSE-base-ja/summary.json index 9048691..045be96 100644 --- 
a/docs/results/pkshatech/GLuCoSE-base-ja/summary.json +++ b/docs/results/pkshatech/GLuCoSE-base-ja/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.8243606275521169 + "macro_f1": 0.8203088346974938 }, "amazon_review_classification": { - "macro_f1": 0.580654308041878 + "macro_f1": 0.5793470941382456 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9289309593569228 }, "massive_intent_classification": { - "macro_f1": 0.7885427536904928 + "macro_f1": 0.7852003872158392 }, "massive_scenario_classification": { - "macro_f1": 0.8794225134482166 + "macro_f1": 0.8771105186592234 + }, + "sib200_japanese_classification": { + "macro_f1": 0.7723533533184818 + }, + "wrime_classification": { + "macro_f1": 0.48820317778534994 } }, "Reranking": { "esci": { - "ndcg@10": 0.9190289767663239 + "ndcg@10": 0.9182072351783757 + }, + "jacwir_reranking": { + "ndcg@10": 0.7453523153562407 + }, + "jqara": { + "ndcg@10": 0.30235678517238046 + }, + "miracl_reranking": { + "ndcg@10": 0.7782487998017047 + }, + "mldr_reranking": { + "ndcg@10": 0.8742431547482784 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.6929937892822252 + }, "jagovfaqs_22k": { - "ndcg@10": 0.6387979415478197 + "ndcg@10": 0.6414300605061649 }, "jaqket": { - "ndcg@10": 0.3981609655991592 + "ndcg@10": 0.39775627519142726 + }, + "mintaka_retrieval": { + "ndcg@10": 0.2981097485323552 + }, + "miracl_retrieval": { + "ndcg@10": 0.4826861479972318 + }, + "mldr_retrieval": { + "ndcg@10": 0.2507030467719784 }, "mrtydi": { - "ndcg@10": 0.30281316435910444 + "ndcg@10": 0.3013997193651328 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.7677861541704494 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.7825765249971093 + "ndcg@10": 0.7720777474520221 }, "nlp_journal_title_abs": { - "ndcg@10": 0.8206371528870603 + "ndcg@10": 0.8139955508348415 }, "nlp_journal_title_intro": { - "ndcg@10": 0.5982476164344701 + "ndcg@10": 0.5843440022515908 } }, "STS": { "jsick": { - "spearman": 0.7496711324072552 + "spearman": 0.7489963692364312 }, "jsts": { - "spearman": 0.824592262812859 + "spearman": 0.8246470658338377 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.49890886040948096 + "v_measure_score": 0.5022894818692664 }, "mewsc16": { - "v_measure_score": 0.49676862904881375 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.663883089770355 + "v_measure_score": 0.4952409837584659 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.41426282292221306 } } } \ No newline at end of file diff --git a/docs/results/pkshatech/RoSEtta-base-ja/summary.json b/docs/results/pkshatech/RoSEtta-base-ja/summary.json index d82af4b..ed2a807 100644 --- a/docs/results/pkshatech/RoSEtta-base-ja/summary.json +++ b/docs/results/pkshatech/RoSEtta-base-ja/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7005147244958231 + "macro_f1": 0.7021400751808275 }, "amazon_review_classification": { - "macro_f1": 0.5263680453119501 + "macro_f1": 0.5222844241125081 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8728387064627037 }, "massive_intent_classification": { - "macro_f1": 0.7983787583297884 + "macro_f1": 0.7958661089844552 }, "massive_scenario_classification": { - "macro_f1": 0.8709593192703351 + "macro_f1": 0.869642477269303 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8400507949086808 + }, + "wrime_classification": { + "macro_f1": 0.41243251223612126 } }, "Reranking": { "esci": { - "ndcg@10": 
0.9268625513429571 + "ndcg@10": 0.9267798900027316 + }, + "jacwir_reranking": { + "ndcg@10": 0.8682926176464301 + }, + "jqara": { + "ndcg@10": 0.5792158527364997 + }, + "miracl_reranking": { + "ndcg@10": 0.8038167802919151 + }, + "mldr_reranking": { + "ndcg@10": 0.8844542290758788 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8201713015308671 + }, "jagovfaqs_22k": { - "ndcg@10": 0.6595934642903105 + "ndcg@10": 0.6624123008303046 }, "jaqket": { - "ndcg@10": 0.6533452086105761 + "ndcg@10": 0.6534322606981797 + }, + "mintaka_retrieval": { + "ndcg@10": 0.3404237377925581 + }, + "miracl_retrieval": { + "ndcg@10": 0.6019862449112752 + }, + "mldr_retrieval": { + "ndcg@10": 0.3236631225997826 }, "mrtydi": { - "ndcg@10": 0.36731170141136216 + "ndcg@10": 0.36773428568023436 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.9604317247356383 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.9553567926226499 + "ndcg@10": 0.9541194598644321 }, "nlp_journal_title_abs": { - "ndcg@10": 0.940828991756893 + "ndcg@10": 0.931681815900694 }, "nlp_journal_title_intro": { - "ndcg@10": 0.8163161967769845 + "ndcg@10": 0.821937205258955 } }, "STS": { "jsick": { - "spearman": 0.8383455453168481 + "spearman": 0.8383423614590403 }, "jsts": { - "spearman": 0.7895388048564987 + "spearman": 0.7894639448529204 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5861760622672214 + "v_measure_score": 0.5617913119479273 }, "mewsc16": { - "v_measure_score": 0.4784844036038961 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6173974540311173 + "v_measure_score": 0.4515710456360326 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.4060764834036522 } } -} +} \ No newline at end of file diff --git a/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json b/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json index cc9f179..08fb2cf 100644 --- a/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json +++ b/docs/results/pkshatech/simcse-ja-bert-base-clcmlp/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.6748573563374541 + "macro_f1": 0.6623457108919073 }, "amazon_review_classification": { - "macro_f1": 0.5084883283463678 + "macro_f1": 0.5085668578780138 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8727921560142167 }, "massive_intent_classification": { - "macro_f1": 0.7967050091211104 + "macro_f1": 0.7964832948145142 }, "massive_scenario_classification": { - "macro_f1": 0.871999260591497 + "macro_f1": 0.8722583552883876 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8118131918956941 + }, + "wrime_classification": { + "macro_f1": 0.38393198133793865 } }, "Reranking": { "esci": { - "ndcg@10": 0.914930352019688 + "ndcg@10": 0.9159934732688085 + }, + "jacwir_reranking": { + "ndcg@10": 0.57455569033817 + }, + "jqara": { + "ndcg@10": 0.32394940899755914 + }, + "miracl_reranking": { + "ndcg@10": 0.7155007519708649 + }, + "mldr_reranking": { + "ndcg@10": 0.8749859006713937 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.4504173219483955 + }, "jagovfaqs_22k": { - "ndcg@10": 0.41496851385134836 + "ndcg@10": 0.42184475447157466 }, "jaqket": { - "ndcg@10": 0.46003031782136106 + "ndcg@10": 0.45790883763166734 + }, + "mintaka_retrieval": { + "ndcg@10": 0.3129516236109114 + }, + "miracl_retrieval": { + "ndcg@10": 0.16941478465313356 + }, + "mldr_retrieval": { + "ndcg@10": 0.20077263817507693 }, "mrtydi": { - "ndcg@10": 0.1019130492122431 + "ndcg@10": 0.10152904724472846 + 
}, + "nlp_journal_abs_article": { + "ndcg@10": 0.3813451499418741 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.4014036990267884 + "ndcg@10": 0.3760245554186644 }, "nlp_journal_title_abs": { - "ndcg@10": 0.5962532652358485 + "ndcg@10": 0.5918422105100428 }, "nlp_journal_title_intro": { - "ndcg@10": 0.2452584471710635 + "ndcg@10": 0.25260061985270044 } }, "STS": { "jsick": { - "spearman": 0.7307715649457595 + "spearman": 0.7310527928257868 }, "jsts": { - "spearman": 0.8052279921326252 + "spearman": 0.8050903530724467 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.4476707933600858 + "v_measure_score": 0.4270804414565236 }, "mewsc16": { - "v_measure_score": 0.5029508725037098 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6239830208701805 + "v_measure_score": 0.5188641339887974 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.4361034033018593 } } } \ No newline at end of file diff --git a/docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json b/docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json index 30385ec..20e3090 100644 --- a/docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json +++ b/docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7910202863961814 + "macro_f1": 0.7966249319542733 }, "amazon_review_classification": { - "macro_f1": 0.614759364446128 + "macro_f1": 0.6204802870356217 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.9487448938934042 }, "massive_intent_classification": { - "macro_f1": 0.8225880728874561 + "macro_f1": 0.8121127783146885 }, "massive_scenario_classification": { - "macro_f1": 0.9065030576701741 + "macro_f1": 0.9015618520645106 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8262549610016919 + }, + "wrime_classification": { + "macro_f1": 0.496952794347916 } }, "Reranking": { "esci": { - "ndcg@10": 0.9374394712541568 + "ndcg@10": 0.9359176437229035 + }, + "jacwir_reranking": { + "ndcg@10": 0.8684667204236405 + }, + "jqara": { + "ndcg@10": 0.6592446626934351 + }, + "miracl_reranking": { + "ndcg@10": 0.851895889010368 + }, + "mldr_reranking": { + "ndcg@10": 0.9024168764200886 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.8242898079860301 + }, "jagovfaqs_22k": { - "ndcg@10": 0.7168374490004555 + "ndcg@10": 0.7175523394252279 }, "jaqket": { - "ndcg@10": 0.7279485535689915 + "ndcg@10": 0.6651645644179811 + }, + "mintaka_retrieval": { + "ndcg@10": 0.6260117718497401 + }, + "miracl_retrieval": { + "ndcg@10": 0.6324868715639211 + }, + "mldr_retrieval": { + "ndcg@10": 0.3458953565848906 }, "mrtydi": { - "ndcg@10": 0.41952210141116814 + "ndcg@10": 0.4075091710258615 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.9919931534803926 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.9394095717236127 + "ndcg@10": 0.9916030162169888 }, "nlp_journal_title_abs": { - "ndcg@10": 0.9695624263086593 + "ndcg@10": 0.968506421217649 }, "nlp_journal_title_intro": { - "ndcg@10": 0.8832876426024624 + "ndcg@10": 0.9629377323425067 } }, "STS": { "jsick": { - "spearman": 0.8022484725822061 + "spearman": 0.7979403746663343 }, "jsts": { - "spearman": 0.851980317221987 + "spearman": 0.8362521198880197 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.5641831341687762 + "v_measure_score": 0.5580857807899353 }, "mewsc16": { - "v_measure_score": 0.5129216698739159 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.62 + "v_measure_score": 
0.5068875864473731 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.4418928761777483 } } } \ No newline at end of file diff --git a/docs/results/sentence-transformers/LaBSE/summary.json b/docs/results/sentence-transformers/LaBSE/summary.json index de8fd21..be25868 100644 --- a/docs/results/sentence-transformers/LaBSE/summary.json +++ b/docs/results/sentence-transformers/LaBSE/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7361214773958769 + "macro_f1": 0.7473900578785092 }, "amazon_review_classification": { - "macro_f1": 0.516957890685124 + "macro_f1": 0.5163267880432743 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8943804239578887 }, "massive_intent_classification": { - "macro_f1": 0.7698802987251081 + "macro_f1": 0.7708783013419095 }, "massive_scenario_classification": { - "macro_f1": 0.8835366493433755 + "macro_f1": 0.883882574111003 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8147469939175009 + }, + "wrime_classification": { + "macro_f1": 0.4010561963802254 } }, "Reranking": { "esci": { - "ndcg@10": 0.9162507647227857 + "ndcg@10": 0.9160790861014678 + }, + "jacwir_reranking": { + "ndcg@10": 0.6785244283016075 + }, + "jqara": { + "ndcg@10": 0.2488300870810785 + }, + "miracl_reranking": { + "ndcg@10": 0.6956277678029864 + }, + "mldr_reranking": { + "ndcg@10": 0.818396899799895 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.49122610922285737 + }, "jagovfaqs_22k": { - "ndcg@10": 0.4310160105414995 + "ndcg@10": 0.4278654773482106 }, "jaqket": { - "ndcg@10": 0.34245849139132745 + "ndcg@10": 0.34162439290480445 + }, + "mintaka_retrieval": { + "ndcg@10": 0.20021150938693902 + }, + "miracl_retrieval": { + "ndcg@10": 0.09640164259724278 + }, + "mldr_retrieval": { + "ndcg@10": 0.07525879379433965 }, "mrtydi": { - "ndcg@10": 0.04238747941951049 + "ndcg@10": 0.04221321214455149 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.48063138821949475 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.48918127058907085 + "ndcg@10": 0.48202233374429526 }, "nlp_journal_title_abs": { - "ndcg@10": 0.7513086500303519 + "ndcg@10": 0.7559363652226313 }, "nlp_journal_title_intro": { - "ndcg@10": 0.35089108319096984 + "ndcg@10": 0.3553481928114969 } }, "STS": { "jsick": { - "spearman": 0.7698905918950973 + "spearman": 0.770087314840748 }, "jsts": { - "spearman": 0.7612337568248777 + "spearman": 0.7611615118281959 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.4829337123233023 + "v_measure_score": 0.48580223329334865 }, "mewsc16": { - "v_measure_score": 0.41471299546625956 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.623321554770318 + "v_measure_score": 0.41072280934417754 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.2859403214333406 } } } \ No newline at end of file diff --git a/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json b/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json index 12f71a2..a856998 100644 --- a/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json +++ b/docs/results/sentence-transformers/stsb-xlm-r-multilingual/summary.json @@ -1,62 +1,96 @@ { "Classification": { "amazon_counterfactual_classification": { - "macro_f1": 0.7565022696601644 + "macro_f1": 0.7514299930187799 }, "amazon_review_classification": { - "macro_f1": 0.5131771609073525 + "macro_f1": 0.5113667375588362 + }, + "japanese_sentiment_classification": { + "macro_f1": 0.8653980041773033 }, 
"massive_intent_classification": { - "macro_f1": 0.7427818411370812 + "macro_f1": 0.7433839585058197 }, "massive_scenario_classification": { - "macro_f1": 0.8609512679368835 + "macro_f1": 0.8606582397219589 + }, + "sib200_japanese_classification": { + "macro_f1": 0.8372998969612304 + }, + "wrime_classification": { + "macro_f1": 0.4167776597670575 } }, "Reranking": { "esci": { - "ndcg@10": 0.901984958764163 + "ndcg@10": 0.901890341341842 + }, + "jacwir_reranking": { + "ndcg@10": 0.39624043125448744 + }, + "jqara": { + "ndcg@10": 0.2151865684492273 + }, + "miracl_reranking": { + "ndcg@10": 0.659024218324574 + }, + "mldr_reranking": { + "ndcg@10": 0.768787823495723 } }, "Retrieval": { + "jacwir_retrieval": { + "ndcg@10": 0.21305201715296948 + }, "jagovfaqs_22k": { - "ndcg@10": 0.2511106863952595 + "ndcg@10": 0.2556849980765764 }, "jaqket": { - "ndcg@10": 0.21606007987072834 + "ndcg@10": 0.216905594393324 + }, + "mintaka_retrieval": { + "ndcg@10": 0.22312923127278733 + }, + "miracl_retrieval": { + "ndcg@10": 0.025873352550354844 + }, + "mldr_retrieval": { + "ndcg@10": 0.06529330431356167 }, "mrtydi": { - "ndcg@10": 0.027590779174942116 + "ndcg@10": 0.027849411947159904 + }, + "nlp_journal_abs_article": { + "ndcg@10": 0.24914118502751986 }, "nlp_journal_abs_intro": { - "ndcg@10": 0.2848558252647936 + "ndcg@10": 0.2554860092306942 }, "nlp_journal_title_abs": { - "ndcg@10": 0.3646520309406354 + "ndcg@10": 0.35835508156998896 }, "nlp_journal_title_intro": { - "ndcg@10": 0.11545016260271045 + "ndcg@10": 0.12133118349638791 } }, "STS": { "jsick": { - "spearman": 0.7236409557069434 + "spearman": 0.7238085290735078 }, "jsts": { - "spearman": 0.7843597058304203 + "spearman": 0.784483411606707 } }, "Clustering": { "livedoor_news": { - "v_measure_score": 0.24487129939212224 + "v_measure_score": 0.26373496762588294 }, "mewsc16": { - "v_measure_score": 0.304278393205056 - } - }, - "PairClassification": { - "paws_x_ja": { - "binary_f1": 0.6219686162624821 + "v_measure_score": 0.32418419167915596 + }, + "sib200_japanese_clustering": { + "v_measure_score": 0.2434250739162938 } } } \ No newline at end of file diff --git a/leaderboard.md b/leaderboard.md index dd64309..34d5d62 100644 --- a/leaderboard.md +++ b/leaderboard.md @@ -5,233 +5,254 @@ This leaderboard shows the results stored under `docs/results`. The scores are a The summary shows the average scores within each task. The average score is the average of scores by dataset. -| Model | Avg. 
| Retrieval | STS | Classification | Reranking | Clustering | PairClassification | -|:----------------------------------------------|:----------|:------------|:----------|:-----------------|:------------|:-------------|:---------------------| -| sbintuitions/sarashina-embedding-v1-1b | **75.50** | **77.61** | 82.71 | **78.37** | **93.74** | 53.86 | 62.00 | -| OpenAI/text-embedding-3-large | 74.05 | 74.48 | 82.52 | 77.58 | 93.58 | 53.32 | 62.35 | -| jinaai/jina-embeddings-v3 | 73.44 | 75.22 | 80.05 | 76.39 | 92.71 | 51.46 | 62.37 | -| cl-nagoya/ruri-large | 73.31 | 73.02 | 83.13 | 77.43 | 92.99 | 51.82 | 62.29 | -| pkshatech/GLuCoSE-base-ja-v2 | 72.23 | 73.36 | 82.96 | 74.21 | 93.01 | 48.65 | 62.37 | -| pkshatech/RoSEtta-base-ja | 72.04 | 73.21 | 81.39 | 72.41 | 92.69 | 53.23 | 61.74 | -| cl-nagoya/ruri-base | 71.91 | 69.82 | 82.87 | 75.58 | 92.91 | **54.16** | 62.38 | -| cl-nagoya/ruri-small | 71.53 | 69.41 | 82.79 | 76.22 | 93.00 | 51.19 | 62.11 | -| intfloat/multilingual-e5-large | 70.90 | 70.98 | 79.70 | 72.89 | 92.96 | 51.24 | 62.15 | -| OpenAI/text-embedding-3-small | 69.18 | 66.39 | 79.46 | 73.06 | 92.92 | 51.06 | 62.27 | -| intfloat/multilingual-e5-base | 68.61 | 68.21 | 79.84 | 69.30 | 92.85 | 48.26 | 62.26 | -| intfloat/multilingual-e5-small | 67.71 | 67.27 | 80.07 | 67.62 | 93.03 | 46.91 | 62.19 | -| pkshatech/GLuCoSE-base-ja | 67.29 | 59.02 | 78.71 | 76.82 | 91.90 | 49.78 | **66.39** | -| OpenAI/text-embedding-ada-002 | 67.21 | 64.38 | 79.02 | 69.75 | 93.04 | 48.30 | 62.40 | -| cl-nagoya/sup-simcse-ja-base | 63.36 | 49.64 | 82.05 | 73.47 | 91.83 | 51.79 | 62.57 | -| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 61.55 | 47.38 | 78.99 | 73.13 | 91.30 | 48.25 | 62.27 | -| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 60.83 | 46.36 | 77.49 | 73.30 | 91.16 | 46.68 | 62.38 | -| oshizo/sbert-jsnli-luke-japanese-base-lite | 60.77 | 43.00 | 76.60 | 76.61 | 91.56 | 50.33 | 62.38 | -| cl-nagoya/unsup-simcse-ja-large | 59.58 | 40.53 | 80.56 | 74.66 | 90.95 | 48.41 | 62.49 | -| MU-Kindai/Japanese-MixCSE-BERT-base | 59.03 | 42.59 | 77.05 | 72.90 | 91.01 | 44.95 | 62.33 | -| cl-nagoya/sup-simcse-ja-large | 58.88 | 37.62 | **83.18** | 73.73 | 91.48 | 50.56 | 62.51 | -| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 58.77 | 40.82 | 78.28 | 73.47 | 90.95 | 45.81 | 62.35 | -| MU-Kindai/Japanese-DiffCSE-BERT-base | 58.66 | 41.79 | 75.50 | 73.77 | 90.95 | 44.22 | 62.38 | -| cl-nagoya/unsup-simcse-ja-base | 58.39 | 40.23 | 78.72 | 73.07 | 91.16 | 44.77 | 62.44 | -| sentence-transformers/LaBSE | 58.01 | 40.12 | 76.56 | 72.66 | 91.63 | 44.88 | 62.33 | -| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 57.97 | 41.32 | 74.66 | 72.76 | 90.66 | 43.11 | 62.37 | -| pkshatech/simcse-ja-bert-base-clcmlp | 56.86 | 37.00 | 76.80 | 71.30 | 91.49 | 47.53 | 62.40 | -| sentence-transformers/stsb-xlm-r-multilingual | 48.21 | 21.00 | 75.40 | 71.84 | 90.20 | 27.46 | 62.20 | -| colorfulscoop/sbert-base-ja | 47.38 | 16.52 | 70.42 | 69.07 | 89.97 | 44.81 | 62.31 | +| Model | Avg. 
| Retrieval | STS | Classification | Reranking | Clustering | +|:----------------------------------------------|:---------:|:-----------:|:---------:|:----------------:|:-----------:|:------------:| +| sbintuitions/sarashina-embedding-v2-1b | **76.38** | **76.48** | **84.22** | 77.14 | **86.28** | 52.56 | +| cl-nagoya/ruri-v3-310m | 75.85 | 76.03 | 81.59 | **77.65** | 85.84 | 50.52 | +| cl-nagoya/ruri-v3-130m | 75.52 | 76.45 | 81.05 | 75.65 | 85.71 | 51.13 | +| sbintuitions/sarashina-embedding-v1-1b | 74.87 | 74.53 | 81.71 | 77.20 | 84.36 | 50.30 | +| cl-nagoya/ruri-v3-70m | 73.95 | 74.23 | 80.96 | 74.45 | 84.21 | 49.95 | +| OpenAI/text-embedding-3-large | 73.86 | 71.95 | 82.52 | 77.27 | 83.06 | 51.82 | +| cl-nagoya/ruri-large-v2 | 73.63 | 71.87 | 83.18 | 76.10 | 83.89 | 50.88 | +| cl-nagoya/ruri-v3-30m | 72.95 | 72.84 | 81.78 | 73.35 | 82.93 | 49.90 | +| cl-nagoya/ruri-large | 71.69 | 68.30 | 83.13 | 76.25 | 81.26 | 49.93 | +| cl-nagoya/ruri-base-v2 | 71.66 | 68.96 | 83.03 | 75.59 | 82.46 | 46.84 | +| cl-nagoya/ruri-small-v2 | 71.40 | 68.46 | 82.91 | 74.12 | 82.30 | 49.97 | +| pkshatech/GLuCoSE-base-ja-v2 | 71.11 | 68.45 | 82.95 | 73.52 | 82.63 | 48.19 | +| intfloat/multilingual-e5-large | 70.67 | 67.65 | 80.86 | 72.30 | 83.01 | 50.58 | +| google/embeddinggemma-300m | 70.59 | 65.91 | 82.74 | 76.14 | 80.93 | 49.48 | +| cl-nagoya/ruri-base | 70.25 | 65.90 | 82.88 | 75.34 | 80.31 | 49.10 | +| pkshatech/RoSEtta-base-ja | 69.58 | 67.52 | 81.39 | 71.70 | 81.25 | 44.88 | +| cl-nagoya/ruri-small | 69.34 | 63.95 | 82.79 | 74.83 | 79.98 | 49.59 | +| intfloat/multilingual-e5-base | 68.06 | 64.48 | 80.46 | 69.70 | 79.46 | 50.12 | +| intfloat/multilingual-e5-small | 67.38 | 63.91 | 80.46 | 67.77 | 80.09 | 49.29 | +| OpenAI/text-embedding-3-small | 67.10 | 61.79 | 79.46 | 72.43 | 77.29 | 48.91 | +| OpenAI/text-embedding-ada-002 | 65.13 | 59.58 | 79.02 | 69.39 | 75.63 | 48.78 | +| hotchpotch/static-embedding-japanese | 63.80 | 60.51 | 80.16 | 66.73 | 77.09 | 35.91 | +| pkshatech/GLuCoSE-base-ja | 63.79 | 54.58 | 78.68 | 75.02 | 72.37 | 47.12 | +| cl-nagoya/sup-simcse-ja-base | 59.91 | 45.00 | 82.05 | 72.72 | 70.36 | **52.57** | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 57.60 | 42.41 | 79.00 | 71.83 | 71.88 | 42.02 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 56.75 | 38.08 | 76.56 | 74.53 | 69.81 | 48.75 | +| cl-nagoya/sup-simcse-ja-large | 56.46 | 37.38 | 83.17 | 72.74 | 68.76 | 50.12 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 55.78 | 39.85 | 77.96 | 71.46 | 69.92 | 39.27 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 55.35 | 36.23 | 78.29 | 72.59 | 70.59 | 44.54 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 54.65 | 36.24 | 77.75 | 71.81 | 68.58 | 43.45 | +| cl-nagoya/unsup-simcse-ja-large | 54.23 | 33.98 | 80.56 | 73.71 | 67.39 | 43.52 | +| cl-nagoya/unsup-simcse-ja-base | 53.86 | 35.34 | 78.74 | 72.41 | 66.20 | 41.29 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 53.82 | 35.22 | 74.96 | 71.48 | 68.15 | 42.86 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 53.59 | 34.93 | 76.70 | 72.06 | 67.73 | 39.93 | +| pkshatech/simcse-ja-bert-base-clcmlp | 53.48 | 32.80 | 76.81 | 70.67 | 68.02 | 49.45 | +| sentence-transformers/LaBSE | 52.70 | 33.18 | 76.56 | 71.85 | 67.01 | 39.82 | +| sentence-transformers/stsb-xlm-r-multilingual | 43.06 | 16.58 | 75.41 | 71.40 | 57.93 | 27.67 | +| colorfulscoop/sbert-base-ja | 42.90 | 15.45 | 70.41 | 68.05 | 59.38 | 39.04 | ## Retrieval -| Model | Avg. | jagovfaqs_22k
(ndcg@10) | jaqket
(ndcg@10) | mrtydi
(ndcg@10) | nlp_journal_abs_intro
(ndcg@10) | nlp_journal_title_abs
(ndcg@10) | nlp_journal_title_intro
(ndcg@10) | -|:----------------------------------------------|:----------|:-----------------------------|:----------------------|:----------------------|:-------------------------------------|:-------------------------------------|:---------------------------------------| -| sbintuitions/sarashina-embedding-v1-1b | **77.61** | 71.68 | **72.79** | 41.95 | 93.94 | 96.96 | 88.33 | -| jinaai/jina-embeddings-v3 | 75.22 | 71.50 | 46.48 | **45.45** | 98.43 | 95.62 | 93.85 | -| OpenAI/text-embedding-3-large | 74.48 | 72.41 | 48.21 | 34.88 | **99.33** | 96.55 | **95.47** | -| pkshatech/GLuCoSE-base-ja-v2 | 73.36 | 69.79 | 67.29 | 41.86 | 90.29 | 95.11 | 75.80 | -| pkshatech/RoSEtta-base-ja | 73.21 | 65.96 | 65.33 | 36.73 | 95.54 | 94.08 | 81.63 | -| cl-nagoya/ruri-large | 73.02 | **76.68** | 61.74 | 38.03 | 87.12 | 96.58 | 77.97 | -| intfloat/multilingual-e5-large | 70.98 | 70.30 | 58.78 | 43.63 | 86.00 | 94.70 | 72.48 | -| cl-nagoya/ruri-base | 69.82 | 74.56 | 50.12 | 35.45 | 86.89 | 96.57 | 75.31 | -| cl-nagoya/ruri-small | 69.41 | 73.65 | 48.44 | 33.43 | 87.69 | **97.17** | 76.09 | -| intfloat/multilingual-e5-base | 68.21 | 65.34 | 50.67 | 38.38 | 87.10 | 94.73 | 73.05 | -| intfloat/multilingual-e5-small | 67.27 | 64.11 | 49.97 | 36.05 | 85.21 | 95.26 | 72.99 | -| OpenAI/text-embedding-3-small | 66.39 | 64.02 | 33.94 | 20.03 | 98.47 | 91.70 | 90.17 | -| OpenAI/text-embedding-ada-002 | 64.38 | 61.02 | 42.56 | 14.51 | 94.99 | 91.23 | 81.98 | -| pkshatech/GLuCoSE-base-ja | 59.02 | 63.88 | 39.82 | 30.28 | 78.26 | 82.06 | 59.82 | -| cl-nagoya/sup-simcse-ja-base | 49.64 | 51.62 | 50.25 | 13.98 | 68.08 | 65.71 | 48.22 | -| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 47.38 | 50.14 | 45.84 | 13.00 | 55.09 | 74.97 | 45.24 | -| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 46.36 | 47.39 | 39.57 | 11.44 | 64.16 | 70.23 | 45.37 | -| oshizo/sbert-jsnli-luke-japanese-base-lite | 43.00 | 51.99 | 42.07 | 10.12 | 49.30 | 71.94 | 32.59 | -| MU-Kindai/Japanese-MixCSE-BERT-base | 42.59 | 42.37 | 37.72 | 7.88 | 63.70 | 64.13 | 39.73 | -| MU-Kindai/Japanese-DiffCSE-BERT-base | 41.79 | 42.31 | 36.20 | 7.81 | 60.77 | 64.34 | 39.32 | -| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 41.32 | 44.11 | 39.61 | 8.15 | 62.76 | 58.39 | 34.89 | -| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 40.82 | 47.04 | 44.10 | 11.43 | 43.43 | 62.41 | 36.52 | -| cl-nagoya/unsup-simcse-ja-large | 40.53 | 45.09 | 34.60 | 5.75 | 55.07 | 63.07 | 39.61 | -| cl-nagoya/unsup-simcse-ja-base | 40.23 | 46.00 | 39.46 | 5.55 | 60.26 | 55.63 | 34.49 | -| sentence-transformers/LaBSE | 40.12 | 43.10 | 34.25 | 4.24 | 48.92 | 75.13 | 35.09 | -| cl-nagoya/sup-simcse-ja-large | 37.62 | 46.84 | 39.88 | 11.83 | 63.43 | 37.93 | 25.79 | -| pkshatech/simcse-ja-bert-base-clcmlp | 37.00 | 41.50 | 46.00 | 10.19 | 40.14 | 59.63 | 24.53 | -| sentence-transformers/stsb-xlm-r-multilingual | 21.00 | 25.11 | 21.61 | 2.76 | 28.49 | 36.47 | 11.55 | -| colorfulscoop/sbert-base-ja | 16.52 | 21.50 | 13.16 | 0.44 | 28.78 | 22.40 | 12.82 | +| Model | Avg. | jacwir_retrieval
(ndcg@10) | jagovfaqs_22k
(ndcg@10) | jaqket
(ndcg@10) | mintaka_retrieval
(ndcg@10) | miracl_retrieval
(ndcg@10) | mldr_retrieval
(ndcg@10) | mrtydi
(ndcg@10) | nlp_abs_article
(ndcg@10) | nlp_abs_intro
(ndcg@10) | nlp_title_abs
(ndcg@10) | nlp_title_intro
(ndcg@10) | +|:----------------------------------------------|:---------:|:-------------------------------:|:----------------------------:|:---------------------:|:--------------------------------:|:-------------------------------:|:-----------------------------:|:---------------------:|:------------------------------:|:----------------------------:|:----------------------------:|:------------------------------:| +| sbintuitions/sarashina-embedding-v2-1b | **76.48** | 85.54 | 74.87 | **73.52** | **66.11** | 68.26 | 40.35 | **49.57** | 96.84 | 96.28 | **98.11** | 91.79 | +| cl-nagoya/ruri-v3-130m | 76.45 | 84.21 | 75.32 | 73.10 | 51.77 | **71.01** | 45.16 | 47.80 | 99.51 | 98.88 | 97.95 | 96.28 | +| cl-nagoya/ruri-v3-310m | 76.03 | 84.06 | 76.49 | 71.87 | 52.25 | 67.71 | 43.43 | 47.06 | **99.59** | **99.35** | 97.91 | **96.58** | +| sbintuitions/sarashina-embedding-v1-1b | 74.53 | 82.43 | 71.76 | 72.92 | 62.60 | 63.23 | 34.59 | 40.75 | 99.20 | 99.16 | 96.85 | 96.29 | +| cl-nagoya/ruri-v3-70m | 74.23 | 82.76 | 73.27 | 67.68 | 46.26 | 67.98 | 43.55 | 45.00 | 98.50 | 98.68 | 97.07 | 95.73 | +| cl-nagoya/ruri-v3-30m | 72.84 | 82.70 | 70.21 | 62.45 | 43.05 | 64.99 | **45.77** | 41.78 | 98.76 | 99.16 | 96.99 | 95.34 | +| OpenAI/text-embedding-3-large | 71.95 | 82.90 | 72.41 | 48.21 | 63.52 | 60.57 | 45.26 | 34.88 | 92.37 | 99.33 | 96.55 | 95.47 | +| cl-nagoya/ruri-large-v2 | 71.87 | 80.49 | **78.23** | 65.61 | 50.41 | 70.46 | 36.97 | 46.37 | 90.85 | 91.15 | 97.74 | 82.32 | +| cl-nagoya/ruri-base-v2 | 68.96 | 81.01 | 75.90 | 57.01 | 44.18 | 68.22 | 37.73 | 40.89 | 88.05 | 89.73 | 96.96 | 78.93 | +| cl-nagoya/ruri-small-v2 | 68.46 | 83.04 | 74.02 | 62.25 | 35.31 | 66.90 | 32.58 | 42.40 | 90.65 | 90.42 | 97.30 | 78.21 | +| pkshatech/GLuCoSE-base-ja-v2 | 68.45 | 83.85 | 69.85 | 67.52 | 39.57 | 65.29 | 33.75 | 41.67 | 89.91 | 90.08 | 95.67 | 75.79 | +| cl-nagoya/ruri-large | 68.30 | 81.69 | 77.64 | 61.73 | 51.06 | 55.47 | 34.77 | 38.12 | 86.53 | 88.91 | 96.17 | 79.22 | +| intfloat/multilingual-e5-large | 67.65 | **86.41** | 72.98 | 59.67 | 39.59 | 70.96 | 29.85 | 47.82 | 83.26 | 85.71 | 95.29 | 72.57 | +| pkshatech/RoSEtta-base-ja | 67.52 | 82.02 | 66.28 | 64.28 | 34.04 | 60.16 | 32.37 | 36.77 | 96.04 | 95.41 | 93.17 | 82.19 | +| google/embeddinggemma-300m | 65.91 | 81.07 | 69.43 | 63.27 | 38.63 | 35.28 | 34.66 | 13.86 | 99.34 | 99.02 | 96.12 | 94.35 | +| cl-nagoya/ruri-base | 65.90 | 82.48 | 75.50 | 50.23 | 45.37 | 54.88 | 35.42 | 35.59 | 86.65 | 87.23 | 95.27 | 76.25 | +| intfloat/multilingual-e5-base | 64.48 | 84.32 | 68.72 | 51.69 | 34.68 | 64.50 | 25.73 | 42.30 | 83.56 | 84.48 | 94.62 | 74.70 | +| cl-nagoya/ruri-small | 63.95 | 82.58 | 74.01 | 48.44 | 37.23 | 52.22 | 28.99 | 33.51 | 86.89 | 87.23 | 96.20 | 76.09 | +| intfloat/multilingual-e5-small | 63.91 | 85.58 | 65.69 | 51.57 | 31.54 | 63.23 | 25.91 | 42.37 | 83.97 | 84.10 | 94.47 | 74.56 | +| OpenAI/text-embedding-3-small | 61.79 | 79.58 | 64.02 | 33.94 | 32.44 | 48.45 | 35.07 | 20.03 | 85.83 | 98.47 | 91.70 | 90.17 | +| hotchpotch/static-embedding-japanese | 60.51 | 72.27 | 55.55 | 64.04 | 38.93 | 32.61 | 42.51 | 11.18 | 76.19 | 95.74 | 90.37 | 86.25 | +| OpenAI/text-embedding-ada-002 | 59.58 | 78.08 | 61.02 | 42.56 | 27.09 | 34.54 | 31.90 | 14.51 | 97.51 | 94.99 | 91.23 | 81.98 | +| pkshatech/GLuCoSE-base-ja | 54.58 | 69.30 | 64.14 | 39.78 | 29.81 | 48.27 | 25.07 | 30.14 | 76.78 | 77.21 | 81.40 | 58.43 | +| cl-nagoya/sup-simcse-ja-base | 45.00 | 53.32 | 52.02 | 50.13 | 32.88 | 20.68 | 24.70 | 14.14 | 69.09 | 66.19 | 64.84 | 46.97 | +| 
MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 42.41 | 53.16 | 51.20 | 45.81 | 30.42 | 26.08 | 23.65 | 13.06 | 54.65 | 52.13 | 74.13 | 42.21 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 39.85 | 48.95 | 48.41 | 38.73 | 25.72 | 21.60 | 18.11 | 11.02 | 58.91 | 60.05 | 69.15 | 37.72 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 38.08 | 59.65 | 54.07 | 40.22 | 24.83 | 17.19 | 19.08 | 10.09 | 44.07 | 44.84 | 73.68 | 31.15 | +| cl-nagoya/sup-simcse-ja-large | 37.38 | 43.71 | 47.42 | 40.04 | 37.68 | 18.13 | 23.48 | 11.88 | 64.08 | 62.95 | 36.95 | 24.90 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 36.24 | 42.43 | 43.60 | 37.35 | 25.18 | 14.76 | 16.86 | 7.77 | 56.89 | 59.11 | 61.81 | 32.88 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 36.23 | 46.28 | 48.25 | 44.17 | 28.89 | 19.52 | 18.66 | 11.44 | 43.97 | 40.33 | 60.49 | 36.51 | +| cl-nagoya/unsup-simcse-ja-base | 35.34 | 35.11 | 46.74 | 39.52 | 29.92 | 10.93 | 15.98 | 5.51 | 58.22 | 58.41 | 55.58 | 32.84 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 35.22 | 39.92 | 44.60 | 38.45 | 22.39 | 13.94 | 13.91 | 7.30 | 58.35 | 58.63 | 57.43 | 32.47 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 34.93 | 40.86 | 43.88 | 35.56 | 19.98 | 16.52 | 12.06 | 7.11 | 54.30 | 55.86 | 62.96 | 35.17 | +| cl-nagoya/unsup-simcse-ja-large | 33.98 | 37.61 | 46.56 | 34.53 | 30.58 | 10.33 | 12.55 | 5.75 | 50.45 | 50.70 | 60.43 | 34.32 | +| sentence-transformers/LaBSE | 33.18 | 49.12 | 42.43 | 24.92 | 20.02 | 9.36 | 7.53 | 4.22 | 48.06 | 48.20 | 75.59 | 35.53 | +| pkshatech/simcse-ja-bert-base-clcmlp | 32.80 | 45.03 | 41.00 | 37.01 | 31.30 | 16.07 | 20.08 | 10.15 | 38.13 | 37.60 | 59.18 | 25.26 | +| sentence-transformers/stsb-xlm-r-multilingual | 16.58 | 21.08 | 22.49 | 6.49 | 22.31 | 2.28 | 6.53 | 2.78 | 24.91 | 25.55 | 35.84 | 12.13 | +| colorfulscoop/sbert-base-ja | 15.45 | 19.30 | 21.70 | 13.14 | 19.07 | 1.86 | 6.97 | 0.41 | 29.02 | 25.80 | 21.07 | 11.57 | ## STS -| Model | Avg. | jsick
(spearman) | jsts
(spearman) | -|:----------------------------------------------|:----------|:----------------------|:---------------------| -| cl-nagoya/sup-simcse-ja-large | **83.18** | 83.80 | 82.57 | -| cl-nagoya/ruri-large | 83.13 | 82.00 | 84.26 | -| pkshatech/GLuCoSE-base-ja-v2 | 82.96 | **84.96** | 80.96 | -| cl-nagoya/ruri-base | 82.87 | 82.32 | 83.43 | -| cl-nagoya/ruri-small | 82.79 | 83.44 | 82.13 | -| sbintuitions/sarashina-embedding-v1-1b | 82.71 | 80.22 | **85.20** | -| OpenAI/text-embedding-3-large | 82.52 | 81.27 | 83.77 | -| cl-nagoya/sup-simcse-ja-base | 82.05 | 82.83 | 81.27 | -| pkshatech/RoSEtta-base-ja | 81.39 | 83.83 | 78.95 | -| cl-nagoya/unsup-simcse-ja-large | 80.56 | 80.15 | 80.98 | -| intfloat/multilingual-e5-small | 80.07 | 81.50 | 78.65 | -| jinaai/jina-embeddings-v3 | 80.05 | 78.16 | 81.93 | -| intfloat/multilingual-e5-base | 79.84 | 81.28 | 78.39 | -| intfloat/multilingual-e5-large | 79.70 | 78.40 | 80.99 | -| OpenAI/text-embedding-3-small | 79.46 | 80.83 | 78.08 | -| OpenAI/text-embedding-ada-002 | 79.02 | 79.09 | 78.94 | -| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 78.99 | 79.84 | 78.14 | -| cl-nagoya/unsup-simcse-ja-base | 78.72 | 78.49 | 78.95 | -| pkshatech/GLuCoSE-base-ja | 78.71 | 74.97 | 82.46 | -| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 78.28 | 78.75 | 77.81 | -| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 77.49 | 78.18 | 76.81 | -| MU-Kindai/Japanese-MixCSE-BERT-base | 77.05 | 77.57 | 76.53 | -| pkshatech/simcse-ja-bert-base-clcmlp | 76.80 | 73.08 | 80.52 | -| oshizo/sbert-jsnli-luke-japanese-base-lite | 76.60 | 72.11 | 81.09 | -| sentence-transformers/LaBSE | 76.56 | 76.99 | 76.12 | -| MU-Kindai/Japanese-DiffCSE-BERT-base | 75.50 | 75.42 | 75.58 | -| sentence-transformers/stsb-xlm-r-multilingual | 75.40 | 72.36 | 78.44 | -| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 74.66 | 74.64 | 74.68 | -| colorfulscoop/sbert-base-ja | 70.42 | 66.59 | 74.24 | +| Model | Avg. | jsick
(spearman) | jsts
(spearman) | +|:----------------------------------------------|:---------:|:---------------------:|:--------------------:| +| sbintuitions/sarashina-embedding-v2-1b | **84.22** | 82.58 | **85.87** | +| cl-nagoya/ruri-large-v2 | 83.18 | 82.12 | 84.24 | +| cl-nagoya/sup-simcse-ja-large | 83.17 | 83.78 | 82.56 | +| cl-nagoya/ruri-large | 83.13 | 82.00 | 84.26 | +| cl-nagoya/ruri-base-v2 | 83.03 | 82.63 | 83.43 | +| pkshatech/GLuCoSE-base-ja-v2 | 82.95 | **84.95** | 80.96 | +| cl-nagoya/ruri-small-v2 | 82.91 | 83.88 | 81.93 | +| cl-nagoya/ruri-base | 82.88 | 82.32 | 83.43 | +| cl-nagoya/ruri-small | 82.79 | 83.45 | 82.13 | +| google/embeddinggemma-300m | 82.74 | 81.67 | 83.81 | +| OpenAI/text-embedding-3-large | 82.52 | 81.27 | 83.77 | +| cl-nagoya/sup-simcse-ja-base | 82.05 | 82.84 | 81.26 | +| cl-nagoya/ruri-v3-30m | 81.78 | 81.62 | 81.95 | +| sbintuitions/sarashina-embedding-v1-1b | 81.71 | 79.79 | 83.63 | +| cl-nagoya/ruri-v3-310m | 81.59 | 78.86 | 84.31 | +| pkshatech/RoSEtta-base-ja | 81.39 | 83.83 | 78.95 | +| cl-nagoya/ruri-v3-130m | 81.05 | 78.86 | 83.24 | +| cl-nagoya/ruri-v3-70m | 80.96 | 79.10 | 82.82 | +| intfloat/multilingual-e5-large | 80.86 | 79.85 | 81.86 | +| cl-nagoya/unsup-simcse-ja-large | 80.56 | 80.14 | 80.98 | +| intfloat/multilingual-e5-small | 80.46 | 82.00 | 78.92 | +| intfloat/multilingual-e5-base | 80.46 | 81.26 | 79.65 | +| hotchpotch/static-embedding-japanese | 80.16 | 82.51 | 77.81 | +| OpenAI/text-embedding-3-small | 79.46 | 80.83 | 78.08 | +| OpenAI/text-embedding-ada-002 | 79.02 | 79.09 | 78.94 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 79.00 | 79.86 | 78.14 | +| cl-nagoya/unsup-simcse-ja-base | 78.74 | 78.53 | 78.94 | +| pkshatech/GLuCoSE-base-ja | 78.68 | 74.90 | 82.46 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 78.29 | 78.76 | 77.82 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 77.96 | 79.14 | 76.77 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 77.75 | 78.93 | 76.57 | +| pkshatech/simcse-ja-bert-base-clcmlp | 76.81 | 73.11 | 80.51 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 76.70 | 77.76 | 75.63 | +| sentence-transformers/LaBSE | 76.56 | 77.01 | 76.12 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 76.56 | 72.04 | 81.08 | +| sentence-transformers/stsb-xlm-r-multilingual | 75.41 | 72.38 | 78.45 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 74.96 | 75.25 | 74.66 | +| colorfulscoop/sbert-base-ja | 70.41 | 66.56 | 74.25 | ## Classification -| Model | Avg. | amazon_counterfactual
(macro_f1) | amazon_review
(macro_f1) | massive_intent
(macro_f1) | massive_scenario
(macro_f1) | -|:----------------------------------------------|:----------|:--------------------------------------|:------------------------------|:-------------------------------|:---------------------------------| -| sbintuitions/sarashina-embedding-v1-1b | **78.37** | 79.10 | **61.48** | 82.26 | 90.65 | -| OpenAI/text-embedding-3-large | 77.58 | 77.90 | 60.44 | 80.91 | **91.08** | -| cl-nagoya/ruri-large | 77.43 | 80.81 | 56.80 | **82.56** | 89.56 | -| pkshatech/GLuCoSE-base-ja | 76.82 | **82.44** | 58.07 | 78.85 | 87.94 | -| oshizo/sbert-jsnli-luke-japanese-base-lite | 76.61 | 79.95 | 57.48 | 80.26 | 88.75 | -| jinaai/jina-embeddings-v3 | 76.39 | 78.83 | 59.33 | 77.65 | 89.74 | -| cl-nagoya/ruri-small | 76.22 | 79.92 | 55.61 | 81.49 | 87.88 | -| cl-nagoya/ruri-base | 75.58 | 76.66 | 55.76 | 81.41 | 88.49 | -| cl-nagoya/unsup-simcse-ja-large | 74.66 | 76.79 | 55.37 | 79.13 | 87.36 | -| pkshatech/GLuCoSE-base-ja-v2 | 74.21 | 74.92 | 55.31 | 79.79 | 86.84 | -| MU-Kindai/Japanese-DiffCSE-BERT-base | 73.77 | 78.10 | 51.56 | 78.79 | 86.63 | -| cl-nagoya/sup-simcse-ja-large | 73.73 | 73.21 | 54.76 | 79.23 | 87.72 | -| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 73.47 | 77.25 | 53.42 | 76.83 | 86.39 | -| cl-nagoya/sup-simcse-ja-base | 73.47 | 72.34 | 54.41 | 79.52 | 87.60 | -| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 73.30 | 76.20 | 51.52 | 78.95 | 86.54 | -| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 73.13 | 76.36 | 52.75 | 76.88 | 86.51 | -| cl-nagoya/unsup-simcse-ja-base | 73.07 | 73.30 | 53.93 | 79.07 | 85.97 | -| OpenAI/text-embedding-3-small | 73.06 | 70.01 | 55.92 | 77.66 | 88.67 | -| MU-Kindai/Japanese-MixCSE-BERT-base | 72.90 | 77.62 | 50.86 | 77.19 | 85.93 | -| intfloat/multilingual-e5-large | 72.89 | 70.66 | 56.54 | 75.78 | 88.59 | -| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 72.76 | 76.20 | 52.06 | 77.89 | 84.90 | -| sentence-transformers/LaBSE | 72.66 | 73.61 | 51.70 | 76.99 | 88.35 | -| pkshatech/RoSEtta-base-ja | 72.41 | 70.05 | 52.64 | 79.84 | 87.10 | -| sentence-transformers/stsb-xlm-r-multilingual | 71.84 | 75.65 | 51.32 | 74.28 | 86.10 | -| pkshatech/simcse-ja-bert-base-clcmlp | 71.30 | 67.49 | 50.85 | 79.67 | 87.20 | -| OpenAI/text-embedding-ada-002 | 69.75 | 64.42 | 53.13 | 74.57 | 86.89 | -| intfloat/multilingual-e5-base | 69.30 | 63.67 | 54.24 | 72.78 | 86.53 | -| colorfulscoop/sbert-base-ja | 69.07 | 72.21 | 47.95 | 72.52 | 83.62 | -| intfloat/multilingual-e5-small | 67.62 | 62.14 | 51.27 | 70.85 | 86.22 | +| Model | Avg. | amazon_counterfactual
(macro_f1) | amazon_review
(macro_f1) | jpn_sentiment
(macro_f1) | massive_intent
(macro_f1) | massive_scenario
(macro_f1) | sib200_jpn_cls
(macro_f1) | wrime_classification
(macro_f1) | +|:----------------------------------------------|:---------:|:-------------------------------------:|:-----------------------------:|:-----------------------------:|:------------------------------:|:--------------------------------:|:------------------------------:|:------------------------------------:| +| cl-nagoya/ruri-v3-310m | **77.65** | 80.09 | 60.72 | 95.31 | 81.76 | 89.01 | 88.13 | 48.53 | +| OpenAI/text-embedding-3-large | 77.27 | 77.90 | 60.44 | **96.89** | 80.91 | **91.08** | 87.85 | 45.84 | +| sbintuitions/sarashina-embedding-v1-1b | 77.20 | 79.66 | **62.02** | 95.03 | 81.21 | 90.16 | 82.63 | 49.70 | +| sbintuitions/sarashina-embedding-v2-1b | 77.14 | 79.81 | 61.39 | 93.51 | **83.69** | 90.23 | 81.48 | **49.87** | +| cl-nagoya/ruri-large | 76.25 | 79.50 | 56.85 | 93.56 | 82.10 | 90.03 | 85.26 | 46.45 | +| google/embeddinggemma-300m | 76.14 | 74.74 | 58.04 | 95.99 | 80.07 | 90.58 | 86.92 | 46.62 | +| cl-nagoya/ruri-large-v2 | 76.10 | 79.51 | 57.09 | 93.57 | 80.87 | 89.71 | 84.72 | 47.23 | +| cl-nagoya/ruri-v3-130m | 75.65 | 76.75 | 59.56 | 95.00 | 80.79 | 87.90 | 82.88 | 46.63 | +| cl-nagoya/ruri-base-v2 | 75.59 | 75.97 | 55.55 | 92.36 | 80.93 | 88.87 | **89.26** | 46.17 | +| cl-nagoya/ruri-base | 75.34 | 76.66 | 56.02 | 91.69 | 81.22 | 88.61 | 87.73 | 45.47 | +| pkshatech/GLuCoSE-base-ja | 75.02 | **82.03** | 57.93 | 92.89 | 78.52 | 87.71 | 77.24 | 48.82 | +| cl-nagoya/ruri-small | 74.83 | 80.55 | 55.41 | 88.86 | 81.08 | 88.00 | 83.97 | 45.95 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 74.53 | 79.72 | 58.02 | 91.99 | 80.16 | 88.78 | 77.31 | 45.73 | +| cl-nagoya/ruri-v3-70m | 74.45 | 81.81 | 57.98 | 93.39 | 78.92 | 87.83 | 76.87 | 44.38 | +| cl-nagoya/ruri-small-v2 | 74.12 | 77.67 | 55.60 | 88.64 | 82.00 | 88.16 | 81.57 | 45.23 | +| cl-nagoya/unsup-simcse-ja-large | 73.71 | 76.40 | 55.05 | 90.57 | 79.25 | 87.50 | 82.89 | 44.33 | +| pkshatech/GLuCoSE-base-ja-v2 | 73.52 | 75.28 | 55.19 | 89.24 | 78.73 | 87.14 | 85.83 | 43.23 | +| cl-nagoya/ruri-v3-30m | 73.35 | 75.60 | 55.71 | 92.63 | 78.31 | 86.72 | 81.40 | 43.11 | +| cl-nagoya/sup-simcse-ja-large | 72.74 | 72.61 | 54.56 | 89.42 | 79.23 | 87.71 | 80.43 | 45.26 | +| cl-nagoya/sup-simcse-ja-base | 72.72 | 71.93 | 54.54 | 91.01 | 80.11 | 87.63 | 81.92 | 41.88 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 72.59 | 77.67 | 53.48 | 89.28 | 76.79 | 86.24 | 83.77 | 40.89 | +| OpenAI/text-embedding-3-small | 72.43 | 70.01 | 55.92 | 89.97 | 77.66 | 88.67 | 84.72 | 40.05 | +| cl-nagoya/unsup-simcse-ja-base | 72.41 | 73.65 | 54.14 | 89.87 | 77.68 | 86.10 | 84.13 | 41.31 | +| intfloat/multilingual-e5-large | 72.30 | 69.70 | 57.64 | 95.55 | 74.01 | 88.71 | 78.11 | 42.38 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 72.06 | 77.70 | 51.46 | 88.45 | 78.72 | 86.40 | 83.50 | 38.15 | +| sentence-transformers/LaBSE | 71.85 | 74.74 | 51.63 | 89.52 | 77.09 | 88.39 | 81.47 | 40.11 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 71.83 | 76.55 | 52.73 | 88.22 | 77.22 | 86.25 | 81.45 | 40.38 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 71.81 | 77.79 | 51.11 | 87.82 | 77.97 | 86.34 | 85.06 | 36.56 | +| pkshatech/RoSEtta-base-ja | 71.70 | 70.21 | 52.62 | 87.28 | 79.59 | 86.96 | 84.01 | 41.24 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 71.48 | 74.30 | 51.97 | 89.69 | 77.83 | 84.60 | 83.82 | 38.15 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 71.46 | 76.40 | 51.65 | 87.86 | 78.15 | 86.44 | 81.80 | 37.93 | +| sentence-transformers/stsb-xlm-r-multilingual | 71.40 | 75.14 | 51.67 | 87.15 | 74.34 | 86.07 | 83.73 | 41.68 | +| 
pkshatech/simcse-ja-bert-base-clcmlp | 70.67 | 68.28 | 51.75 | 88.21 | 79.65 | 87.23 | 81.18 | 38.39 | +| intfloat/multilingual-e5-base | 69.70 | 64.29 | 54.17 | 92.32 | 73.19 | 86.78 | 78.50 | 38.65 | +| OpenAI/text-embedding-ada-002 | 69.39 | 64.42 | 53.13 | 88.76 | 74.57 | 86.89 | 80.39 | 37.57 | +| colorfulscoop/sbert-base-ja | 68.05 | 70.80 | 47.80 | 83.50 | 72.89 | 83.71 | 82.63 | 35.06 | +| intfloat/multilingual-e5-small | 67.77 | 58.66 | 51.21 | 87.73 | 71.34 | 86.77 | 81.78 | 36.91 | +| hotchpotch/static-embedding-japanese | 66.73 | 68.06 | 46.81 | 79.82 | 74.79 | 82.18 | 83.33 | 32.12 | ## Reranking -| Model | Avg. | esci
(ndcg@10) | -|:----------------------------------------------|:----------|:--------------------| -| sbintuitions/sarashina-embedding-v1-1b | **93.74** | **93.74** | -| OpenAI/text-embedding-3-large | 93.58 | 93.58 | -| OpenAI/text-embedding-ada-002 | 93.04 | 93.04 | -| intfloat/multilingual-e5-small | 93.03 | 93.03 | -| pkshatech/GLuCoSE-base-ja-v2 | 93.01 | 93.01 | -| cl-nagoya/ruri-small | 93.00 | 93.00 | -| cl-nagoya/ruri-large | 92.99 | 92.99 | -| intfloat/multilingual-e5-large | 92.96 | 92.96 | -| OpenAI/text-embedding-3-small | 92.92 | 92.92 | -| cl-nagoya/ruri-base | 92.91 | 92.91 | -| intfloat/multilingual-e5-base | 92.85 | 92.85 | -| jinaai/jina-embeddings-v3 | 92.71 | 92.71 | -| pkshatech/RoSEtta-base-ja | 92.69 | 92.69 | -| pkshatech/GLuCoSE-base-ja | 91.90 | 91.90 | -| cl-nagoya/sup-simcse-ja-base | 91.83 | 91.83 | -| sentence-transformers/LaBSE | 91.63 | 91.63 | -| oshizo/sbert-jsnli-luke-japanese-base-lite | 91.56 | 91.56 | -| pkshatech/simcse-ja-bert-base-clcmlp | 91.49 | 91.49 | -| cl-nagoya/sup-simcse-ja-large | 91.48 | 91.48 | -| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 91.30 | 91.30 | -| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 91.16 | 91.16 | -| cl-nagoya/unsup-simcse-ja-base | 91.16 | 91.16 | -| MU-Kindai/Japanese-MixCSE-BERT-base | 91.01 | 91.01 | -| cl-nagoya/unsup-simcse-ja-large | 90.95 | 90.95 | -| MU-Kindai/Japanese-DiffCSE-BERT-base | 90.95 | 90.95 | -| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 90.95 | 90.95 | -| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 90.66 | 90.66 | -| sentence-transformers/stsb-xlm-r-multilingual | 90.20 | 90.20 | -| colorfulscoop/sbert-base-ja | 89.97 | 89.97 | +| Model | Avg. | esci
(ndcg@10) | jacwir_reranking
(ndcg@10) | jqara
(ndcg@10) | miracl_reranking
(ndcg@10) | mldr_reranking
(ndcg@10) | +|:----------------------------------------------|:---------:|:-------------------:|:-------------------------------:|:--------------------:|:-------------------------------:|:-----------------------------:| +| sbintuitions/sarashina-embedding-v2-1b | **86.28** | 93.58 | 88.79 | **70.55** | 85.93 | 92.53 | +| cl-nagoya/ruri-v3-310m | 85.84 | 93.43 | 88.46 | 68.93 | 85.01 | 93.36 | +| cl-nagoya/ruri-v3-130m | 85.71 | 93.37 | 88.65 | 66.30 | **86.59** | 93.62 | +| sbintuitions/sarashina-embedding-v1-1b | 84.36 | **93.60** | 86.85 | 65.92 | 85.17 | 90.24 | +| cl-nagoya/ruri-v3-70m | 84.21 | 93.20 | 87.48 | 63.09 | 85.03 | 92.26 | +| cl-nagoya/ruri-large-v2 | 83.89 | 93.21 | 85.29 | 64.47 | 85.78 | 90.68 | +| OpenAI/text-embedding-3-large | 83.06 | 93.58 | 86.78 | 56.89 | 83.80 | **94.24** | +| intfloat/multilingual-e5-large | 83.01 | 93.31 | **90.37** | 56.14 | 86.31 | 88.91 | +| cl-nagoya/ruri-v3-30m | 82.93 | 93.06 | 87.61 | 57.47 | 83.52 | 92.97 | +| pkshatech/GLuCoSE-base-ja-v2 | 82.63 | 93.02 | 88.27 | 60.70 | 82.44 | 88.71 | +| cl-nagoya/ruri-base-v2 | 82.46 | 93.17 | 85.76 | 60.66 | 84.26 | 88.47 | +| cl-nagoya/ruri-small-v2 | 82.30 | 93.20 | 88.18 | 56.70 | 83.33 | 90.09 | +| cl-nagoya/ruri-large | 81.26 | 92.99 | 86.61 | 59.59 | 80.23 | 86.91 | +| pkshatech/RoSEtta-base-ja | 81.25 | 92.68 | 86.83 | 57.92 | 80.38 | 88.45 | +| google/embeddinggemma-300m | 80.93 | 93.26 | 86.72 | 52.09 | 82.38 | 90.19 | +| cl-nagoya/ruri-base | 80.31 | 92.92 | 87.24 | 54.15 | 79.22 | 88.01 | +| intfloat/multilingual-e5-small | 80.09 | 92.98 | 89.99 | 49.28 | 81.78 | 86.41 | +| cl-nagoya/ruri-small | 79.98 | 93.01 | 87.67 | 53.26 | 77.84 | 88.14 | +| intfloat/multilingual-e5-base | 79.46 | 92.90 | 88.65 | 47.61 | 81.97 | 86.15 | +| OpenAI/text-embedding-3-small | 77.29 | 92.92 | 84.72 | 38.58 | 77.61 | 92.61 | +| hotchpotch/static-embedding-japanese | 77.09 | 91.87 | 80.96 | 47.06 | 72.01 | 93.55 | +| OpenAI/text-embedding-ada-002 | 75.63 | 93.04 | 83.91 | 37.54 | 72.83 | 90.83 | +| pkshatech/GLuCoSE-base-ja | 72.37 | 91.82 | 74.54 | 30.24 | 77.82 | 87.42 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 71.88 | 91.30 | 65.14 | 44.96 | 71.21 | 86.79 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 70.59 | 90.93 | 61.45 | 42.47 | 70.65 | 87.42 | +| cl-nagoya/sup-simcse-ja-base | 70.36 | 91.84 | 64.27 | 37.48 | 70.88 | 87.34 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 69.92 | 91.17 | 65.41 | 38.39 | 70.19 | 84.42 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 69.81 | 91.51 | 67.45 | 36.04 | 68.68 | 85.38 | +| cl-nagoya/sup-simcse-ja-large | 68.76 | 91.50 | 56.15 | 38.30 | 71.26 | 86.60 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 68.58 | 90.92 | 60.51 | 36.84 | 69.31 | 85.31 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 68.15 | 90.67 | 58.16 | 36.66 | 69.09 | 86.15 | +| pkshatech/simcse-ja-bert-base-clcmlp | 68.02 | 91.27 | 57.45 | 31.74 | 72.12 | 87.50 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 67.73 | 90.95 | 59.81 | 37.20 | 67.90 | 82.81 | +| cl-nagoya/unsup-simcse-ja-large | 67.39 | 90.95 | 54.17 | 38.78 | 70.02 | 83.04 | +| sentence-transformers/LaBSE | 67.01 | 91.47 | 67.85 | 24.62 | 69.28 | 81.84 | +| cl-nagoya/unsup-simcse-ja-base | 66.20 | 91.18 | 51.54 | 32.19 | 69.96 | 86.12 | +| colorfulscoop/sbert-base-ja | 59.38 | 89.97 | 37.15 | 22.21 | 65.03 | 82.55 | +| sentence-transformers/stsb-xlm-r-multilingual | 57.93 | 89.72 | 39.21 | 18.51 | 65.36 | 76.88 | ## Clustering -| Model | Avg. | livedoor_news
(v_measure_score) | mewsc16
(v_measure_score) | -|:----------------------------------------------|:----------|:-------------------------------------|:-------------------------------| -| cl-nagoya/ruri-base | **54.16** | 54.27 | **54.04** | -| sbintuitions/sarashina-embedding-v1-1b | 53.86 | 56.42 | 51.29 | -| OpenAI/text-embedding-3-large | 53.32 | 57.09 | 49.55 | -| pkshatech/RoSEtta-base-ja | 53.23 | **58.62** | 47.85 | -| cl-nagoya/ruri-large | 51.82 | 51.39 | 52.25 | -| cl-nagoya/sup-simcse-ja-base | 51.79 | 52.67 | 50.91 | -| jinaai/jina-embeddings-v3 | 51.46 | 54.72 | 48.19 | -| intfloat/multilingual-e5-large | 51.24 | 57.13 | 45.34 | -| cl-nagoya/ruri-small | 51.19 | 50.96 | 51.41 | -| OpenAI/text-embedding-3-small | 51.06 | 54.57 | 47.55 | -| cl-nagoya/sup-simcse-ja-large | 50.56 | 50.75 | 50.38 | -| oshizo/sbert-jsnli-luke-japanese-base-lite | 50.33 | 46.77 | 53.89 | -| pkshatech/GLuCoSE-base-ja | 49.78 | 49.89 | 49.68 | -| pkshatech/GLuCoSE-base-ja-v2 | 48.65 | 51.52 | 45.78 | -| cl-nagoya/unsup-simcse-ja-large | 48.41 | 50.90 | 45.92 | -| OpenAI/text-embedding-ada-002 | 48.30 | 49.67 | 46.92 | -| intfloat/multilingual-e5-base | 48.26 | 55.03 | 41.49 | -| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 48.25 | 53.20 | 43.31 | -| pkshatech/simcse-ja-bert-base-clcmlp | 47.53 | 44.77 | 50.30 | -| intfloat/multilingual-e5-small | 46.91 | 54.70 | 39.12 | -| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 46.68 | 53.02 | 40.35 | -| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 45.81 | 48.45 | 43.17 | -| MU-Kindai/Japanese-MixCSE-BERT-base | 44.95 | 52.62 | 37.28 | -| sentence-transformers/LaBSE | 44.88 | 48.29 | 41.47 | -| colorfulscoop/sbert-base-ja | 44.81 | 42.99 | 46.64 | -| cl-nagoya/unsup-simcse-ja-base | 44.77 | 52.23 | 37.31 | -| MU-Kindai/Japanese-DiffCSE-BERT-base | 44.22 | 49.67 | 38.77 | -| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 43.11 | 41.04 | 45.18 | -| sentence-transformers/stsb-xlm-r-multilingual | 27.46 | 24.49 | 30.43 | - -## PairClassification -| Model | Avg. | paws_x_ja
(binary_f1) | -|:----------------------------------------------|:----------|:---------------------------| -| pkshatech/GLuCoSE-base-ja | **66.39** | **66.39** | -| cl-nagoya/sup-simcse-ja-base | 62.57 | 62.57 | -| cl-nagoya/sup-simcse-ja-large | 62.51 | 62.51 | -| cl-nagoya/unsup-simcse-ja-large | 62.49 | 62.49 | -| cl-nagoya/unsup-simcse-ja-base | 62.44 | 62.44 | -| pkshatech/simcse-ja-bert-base-clcmlp | 62.40 | 62.40 | -| OpenAI/text-embedding-ada-002 | 62.40 | 62.40 | -| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 62.38 | 62.38 | -| cl-nagoya/ruri-base | 62.38 | 62.38 | -| oshizo/sbert-jsnli-luke-japanese-base-lite | 62.38 | 62.38 | -| MU-Kindai/Japanese-DiffCSE-BERT-base | 62.38 | 62.38 | -| jinaai/jina-embeddings-v3 | 62.37 | 62.37 | -| pkshatech/GLuCoSE-base-ja-v2 | 62.37 | 62.37 | -| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 62.37 | 62.37 | -| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 62.35 | 62.35 | -| OpenAI/text-embedding-3-large | 62.35 | 62.35 | -| MU-Kindai/Japanese-MixCSE-BERT-base | 62.33 | 62.33 | -| sentence-transformers/LaBSE | 62.33 | 62.33 | -| colorfulscoop/sbert-base-ja | 62.31 | 62.31 | -| cl-nagoya/ruri-large | 62.29 | 62.29 | -| OpenAI/text-embedding-3-small | 62.27 | 62.27 | -| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 62.27 | 62.27 | -| intfloat/multilingual-e5-base | 62.26 | 62.26 | -| sentence-transformers/stsb-xlm-r-multilingual | 62.20 | 62.20 | -| intfloat/multilingual-e5-small | 62.19 | 62.19 | -| intfloat/multilingual-e5-large | 62.15 | 62.15 | -| cl-nagoya/ruri-small | 62.11 | 62.11 | -| sbintuitions/sarashina-embedding-v1-1b | 62.00 | 62.00 | -| pkshatech/RoSEtta-base-ja | 61.74 | 61.74 | +| Model | Avg. | livedoor_news
(v_measure_score) | mewsc16
(v_measure_score) | sib200_jpn_clust
(v_measure_score) | +|:----------------------------------------------|:---------:|:------------------------------------:|:------------------------------:|:---------------------------------------:| +| cl-nagoya/sup-simcse-ja-base | **52.57** | 55.11 | **53.39** | 49.21 | +| sbintuitions/sarashina-embedding-v2-1b | 52.56 | 57.41 | 51.67 | 48.59 | +| OpenAI/text-embedding-3-large | 51.82 | 57.09 | 49.55 | 48.83 | +| cl-nagoya/ruri-v3-130m | 51.13 | 54.36 | 48.84 | 50.20 | +| cl-nagoya/ruri-large-v2 | 50.88 | 55.62 | 50.97 | 46.06 | +| intfloat/multilingual-e5-large | 50.58 | 51.58 | 46.81 | **53.35** | +| cl-nagoya/ruri-v3-310m | 50.52 | **58.56** | 48.60 | 44.41 | +| sbintuitions/sarashina-embedding-v1-1b | 50.30 | 56.03 | 50.69 | 44.19 | +| cl-nagoya/sup-simcse-ja-large | 50.12 | 53.38 | 51.26 | 45.74 | +| intfloat/multilingual-e5-base | 50.12 | 53.79 | 49.44 | 47.13 | +| cl-nagoya/ruri-small-v2 | 49.97 | 52.61 | 49.47 | 47.82 | +| cl-nagoya/ruri-v3-70m | 49.95 | 54.92 | 47.74 | 47.20 | +| cl-nagoya/ruri-large | 49.93 | 54.44 | 50.59 | 44.76 | +| cl-nagoya/ruri-v3-30m | 49.90 | 53.69 | 47.96 | 48.04 | +| cl-nagoya/ruri-small | 49.59 | 52.90 | 49.37 | 46.51 | +| google/embeddinggemma-300m | 49.48 | 55.33 | 50.55 | 42.55 | +| pkshatech/simcse-ja-bert-base-clcmlp | 49.45 | 49.11 | 47.02 | 52.21 | +| intfloat/multilingual-e5-small | 49.29 | 51.94 | 52.34 | 43.59 | +| cl-nagoya/ruri-base | 49.10 | 56.69 | 52.05 | 38.55 | +| OpenAI/text-embedding-3-small | 48.91 | 54.57 | 47.55 | 44.59 | +| OpenAI/text-embedding-ada-002 | 48.78 | 49.67 | 46.92 | 49.74 | +| oshizo/sbert-jsnli-luke-japanese-base-lite | 48.75 | 51.70 | 51.52 | 43.03 | +| pkshatech/GLuCoSE-base-ja-v2 | 48.19 | 54.46 | 46.12 | 43.98 | +| pkshatech/GLuCoSE-base-ja | 47.12 | 50.41 | 49.52 | 41.43 | +| cl-nagoya/ruri-base-v2 | 46.84 | 54.38 | 50.61 | 35.53 | +| pkshatech/RoSEtta-base-ja | 44.88 | 48.89 | 45.16 | 40.61 | +| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 44.54 | 51.30 | 46.27 | 36.04 | +| cl-nagoya/unsup-simcse-ja-large | 43.52 | 51.48 | 44.44 | 34.65 | +| MU-Kindai/Japanese-MixCSE-BERT-base | 43.45 | 48.56 | 43.20 | 38.60 | +| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 42.86 | 45.84 | 44.08 | 38.67 | +| MU-Kindai/Japanese-SimCSE-BERT-large-unsup | 42.02 | 51.59 | 42.68 | 31.78 | +| cl-nagoya/unsup-simcse-ja-base | 41.29 | 50.65 | 39.58 | 33.63 | +| MU-Kindai/Japanese-DiffCSE-BERT-base | 39.93 | 46.01 | 39.22 | 34.56 | +| sentence-transformers/LaBSE | 39.82 | 49.08 | 41.78 | 28.59 | +| MU-Kindai/Japanese-SimCSE-BERT-base-unsup | 39.27 | 48.79 | 42.61 | 26.42 | +| colorfulscoop/sbert-base-ja | 39.04 | 40.60 | 46.18 | 30.36 | +| hotchpotch/static-embedding-japanese | 35.91 | 51.44 | 34.81 | 21.47 | +| sentence-transformers/stsb-xlm-r-multilingual | 27.67 | 26.62 | 32.05 | 24.34 | diff --git a/make_leaderboard.py b/make_leaderboard.py index 0e43ccf..5d472eb 100644 --- a/make_leaderboard.py +++ b/make_leaderboard.py @@ -9,14 +9,44 @@ "amazon_review_classification": "amazon_review", "massive_intent_classification": "massive_intent", "massive_scenario_classification": "massive_scenario", + "japanese_sentiment_classification": "jpn_sentiment", + "sib200_japanese_classification": "sib200_jpn_cls", + "sib200_japanese_clustering": "sib200_jpn_clust", + "nlp_journal_abs_article": "nlp_abs_article", + "nlp_journal_abs_intro": "nlp_abs_intro", + "nlp_journal_title_abs": "nlp_title_abs", + "nlp_journal_title_intro": "nlp_title_intro", } -TASK_ORDER = ["Retrieval", "STS", "Classification", "Reranking", "Clustering", "PairClassification"] 
+TASK_ORDER = ["Retrieval", "STS", "Classification", "Reranking", "Clustering"] SUMMARY_KEY = "Summary" """ Collects the results from the results folder. """ +# Load reference structure from sbintuitions/sarashina-embedding-v1-1b/summary.json +reference_file = Path("docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json") +with open(reference_file) as f: + reference_structure = json.load(f) + +# Extract the expected structure +expected_structure = {} +for task_name, task_results in reference_structure.items(): + expected_structure[task_name] = set(task_results.keys()) + + +def has_same_structure(summary: dict, expected: dict) -> bool: + """Check if summary has exactly the same structure as expected.""" + if set(summary.keys()) != set(expected.keys()): + return False + + for task_name, datasets in expected.items(): + if set(summary[task_name].keys()) != datasets: + return False + + return True + + # {task_name: {model_signature: {(dataset_name, metric_name): score}}} all_results: dict[str, dict[str, dict[str, float]]] = defaultdict(lambda: defaultdict(dict)) for summary_file in Path("docs/results").rglob("summary.json"): @@ -26,6 +56,13 @@ with open(summary_file) as f: summary = json.load(f) + # Skip models that don't have the same structure as reference + if not has_same_structure(summary, expected_structure): + org_name = summary_file.parent.parent.name + model_name = summary_file.parent.name + print(f"Skipping {org_name}/{model_name}: different structure") + continue + org_name = summary_file.parent.parent.name model_name = summary_file.parent.name model_signature = f"{org_name}/{model_name}" @@ -56,17 +93,24 @@ def format_score(score: float) -> str: # format to markdown table dataset_keys = list(task_results[next(iter(task_results))].keys()) if task_name == SUMMARY_KEY: - dataset_keys = TASK_ORDER + # Only include existing tasks in the summary + dataset_keys = [task for task in TASK_ORDER if task in all_results] header = ["Model", AVG_COLUMN_NAME, *dataset_keys] table_list: list[list[str | float]] = [] for model_signature, dataset_scores in task_results.items(): + # Skip models that don't have all required datasets + if not all(k in dataset_scores for k in dataset_keys): + continue + model_scores = [dataset_scores[k] for k in dataset_keys] if task_name == SUMMARY_KEY: scores_by_dataset = [] for _task_name, _task_results in all_results.items(): - if _task_name != SUMMARY_KEY: + if _task_name != SUMMARY_KEY and model_signature in _task_results: scores_by_dataset.extend(list(_task_results[model_signature].values())) + if not scores_by_dataset: # Skip if no scores available + continue average_score = sum(scores_by_dataset) / len(scores_by_dataset) else: average_score = sum(model_scores) / len(model_scores) @@ -88,7 +132,9 @@ def format_score(score: float) -> str: # add header table_list.insert(0, ["Model", AVG_COLUMN_NAME, *dataset_keys]) - markdown_table = tabulate(table_list, headers="firstrow", tablefmt="pipe") + # Set alignment: left for model names, center for all numeric columns + col_alignment = ["left"] + ["center"] * (len(dataset_keys) + 1) + markdown_table = tabulate(table_list, headers="firstrow", tablefmt="pipe", colalign=col_alignment) markdown_tables[task_name] = markdown_table """ @@ -100,6 +146,8 @@ def format_score(score: float) -> str: "This leaderboard shows the results stored under `docs/results`. 
The scores are all multiplied by 100.\n\n" ) for task_name in [SUMMARY_KEY, *TASK_ORDER]: + if task_name not in markdown_tables: + continue markdown_table = markdown_tables[task_name] f.write(f"## {task_name}\n") diff --git a/src/jmteb/__main__.py b/src/jmteb/__main__.py index ff10884..2dc9478 100644 --- a/src/jmteb/__main__.py +++ b/src/jmteb/__main__.py @@ -119,6 +119,7 @@ def main( ) if args.log_predictions: + logger.info("Prediction logging activated.") for k, v in args.evaluators.items(): if hasattr(v, "log_predictions"): args.evaluators[k].log_predictions = True diff --git a/src/jmteb/configs/jmteb.jsonnet b/src/jmteb/configs/jmteb.jsonnet index 66fd2dc..b27d021 100644 --- a/src/jmteb/configs/jmteb.jsonnet +++ b/src/jmteb/configs/jmteb.jsonnet @@ -3,14 +3,16 @@ (import './tasks/amazon_counterfactual_classification.jsonnet') + (import './tasks/massive_intent_classification.jsonnet') + (import './tasks/massive_scenario_classification.jsonnet') + +(import './tasks/japanese_sentiment_classification.jsonnet') + +(import './tasks/sib200_japanese_classification.jsonnet') + +(import './tasks/wrime_classification.jsonnet') + // Clustering (import './tasks/livedoor_news.jsonnet') + (import './tasks/mewsc16.jsonnet') + +(import './tasks/sib200_japanese_clustering.jsonnet') + // STS (import './tasks/jsts.jsonnet') + (import './tasks/jsick.jsonnet') + -// Pair Classification -(import './tasks/paws_x_ja.jsonnet') + // Retrieval (import './tasks/jagovfaqs_22k.jsonnet') + (import './tasks/mrtydi.jsonnet') + @@ -18,5 +20,14 @@ (import './tasks/nlp_journal_title_abs.jsonnet') + (import './tasks/nlp_journal_title_intro.jsonnet') + (import './tasks/nlp_journal_abs_intro.jsonnet') + +(import './tasks/nlp_journal_abs_article.jsonnet') + +(import './tasks/jacwir_retrieval.jsonnet') + +(import './tasks/miracl_retrieval.jsonnet') + +(import './tasks/mldr_retrieval.jsonnet') + +(import './tasks/mintaka_retrieval.jsonnet') + // Reranking -(import './tasks/esci.jsonnet') \ No newline at end of file +(import './tasks/esci.jsonnet') + +(import './tasks/jqara.jsonnet') + +(import './tasks/jacwir_reranking.jsonnet') + +(import './tasks/miracl_reranking.jsonnet') + +(import './tasks/mldr_reranking.jsonnet') \ No newline at end of file diff --git a/src/jmteb/configs/tasks/jacwir_reranking.jsonnet b/src/jmteb/configs/tasks/jacwir_reranking.jsonnet new file mode 100644 index 0000000..eb41d67 --- /dev/null +++ b/src/jmteb/configs/tasks/jacwir_reranking.jsonnet @@ -0,0 +1,31 @@ +{ + jacwir_reranking: { + class_path: 'RerankingEvaluator', + init_args: { + val_query_dataset: { + class_path: 'HfRerankingQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'jacwir-reranking-query', + }, + }, + test_query_dataset: { + class_path: 'HfRerankingQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'jacwir-reranking-query', + }, + }, + doc_dataset: { + class_path: 'HfRerankingDocDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'corpus', + name: 'jacwir-reranking-corpus', + }, + }, + }, + }, +} diff --git a/src/jmteb/configs/tasks/jacwir_retrieval.jsonnet b/src/jmteb/configs/tasks/jacwir_retrieval.jsonnet new file mode 100644 index 0000000..8cdb416 --- /dev/null +++ b/src/jmteb/configs/tasks/jacwir_retrieval.jsonnet @@ -0,0 +1,32 @@ +{ + jacwir_retrieval: { + class_path: 'RetrievalEvaluator', + init_args: { + val_query_dataset: { + class_path: 'HfRetrievalQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 
'jacwir-retrieval-query', + }, + }, + test_query_dataset: { + class_path: 'HfRetrievalQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'jacwir-retrieval-query', + }, + }, + doc_dataset: { + class_path: 'HfRetrievalDocDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'corpus', + name: 'jacwir-retrieval-corpus', + }, + }, + "doc_chunk_size":10000 + }, + }, +} diff --git a/src/jmteb/configs/tasks/japanese_sentiment_classification.jsonnet b/src/jmteb/configs/tasks/japanese_sentiment_classification.jsonnet new file mode 100644 index 0000000..f5a847c --- /dev/null +++ b/src/jmteb/configs/tasks/japanese_sentiment_classification.jsonnet @@ -0,0 +1,31 @@ +{ + japanese_sentiment_classification: { + class_path: 'ClassificationEvaluator', + init_args: { + train_dataset: { + class_path: 'HfClassificationDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'train', + name: 'japanese_sentiment_classification', + }, + }, + val_dataset: { + class_path: 'HfClassificationDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'japanese_sentiment_classification', + }, + }, + test_dataset: { + class_path: 'HfClassificationDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'japanese_sentiment_classification', + }, + }, + }, + }, +} diff --git a/src/jmteb/configs/tasks/jqara.jsonnet b/src/jmteb/configs/tasks/jqara.jsonnet new file mode 100644 index 0000000..1c0ba64 --- /dev/null +++ b/src/jmteb/configs/tasks/jqara.jsonnet @@ -0,0 +1,31 @@ +{ + jqara: { + class_path: 'RerankingEvaluator', + init_args: { + val_query_dataset: { + class_path: 'HfRerankingQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'jqara-query', + }, + }, + test_query_dataset: { + class_path: 'HfRerankingQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'jqara-query', + }, + }, + doc_dataset: { + class_path: 'HfRerankingDocDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'corpus', + name: 'jqara-corpus', + }, + }, + }, + }, +} diff --git a/src/jmteb/configs/tasks/mintaka_retrieval.jsonnet b/src/jmteb/configs/tasks/mintaka_retrieval.jsonnet new file mode 100644 index 0000000..6b17949 --- /dev/null +++ b/src/jmteb/configs/tasks/mintaka_retrieval.jsonnet @@ -0,0 +1,32 @@ +{ + mintaka_retrieval: { + class_path: 'RetrievalEvaluator', + init_args: { + val_query_dataset: { + class_path: 'HfRetrievalQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'mintaka-retrieval-query', + }, + }, + test_query_dataset: { + class_path: 'HfRetrievalQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'mintaka-retrieval-query', + }, + }, + doc_dataset: { + class_path: 'HfRetrievalDocDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'corpus', + name: 'mintaka-retrieval-corpus', + }, + }, + "doc_chunk_size":10000 + }, + }, +} diff --git a/src/jmteb/configs/tasks/miracl_reranking.jsonnet b/src/jmteb/configs/tasks/miracl_reranking.jsonnet new file mode 100644 index 0000000..b91a341 --- /dev/null +++ b/src/jmteb/configs/tasks/miracl_reranking.jsonnet @@ -0,0 +1,31 @@ +{ + miracl_reranking: { + class_path: 'RerankingEvaluator', + init_args: { + val_query_dataset: { + class_path: 'HfRerankingQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'miracl-reranking-query', + }, + }, + test_query_dataset: { + class_path: 'HfRerankingQueryDataset', + 
init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'miracl-reranking-query', + }, + }, + doc_dataset: { + class_path: 'HfRerankingDocDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'corpus', + name: 'miracl-reranking-corpus', + }, + }, + }, + }, +} diff --git a/src/jmteb/configs/tasks/miracl_retrieval.jsonnet b/src/jmteb/configs/tasks/miracl_retrieval.jsonnet new file mode 100644 index 0000000..9b73f4f --- /dev/null +++ b/src/jmteb/configs/tasks/miracl_retrieval.jsonnet @@ -0,0 +1,32 @@ +{ + miracl_retrieval: { + class_path: 'RetrievalEvaluator', + init_args: { + val_query_dataset: { + class_path: 'HfRetrievalQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'miracl-retrieval-query', + }, + }, + test_query_dataset: { + class_path: 'HfRetrievalQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'miracl-retrieval-query', + }, + }, + doc_dataset: { + class_path: 'HfRetrievalDocDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'corpus', + name: 'miracl-retrieval-corpus', + }, + }, + "doc_chunk_size":10000 + }, + }, +} diff --git a/src/jmteb/configs/tasks/mldr_reranking.jsonnet b/src/jmteb/configs/tasks/mldr_reranking.jsonnet new file mode 100644 index 0000000..1cbc025 --- /dev/null +++ b/src/jmteb/configs/tasks/mldr_reranking.jsonnet @@ -0,0 +1,31 @@ +{ + mldr_reranking: { + class_path: 'RerankingEvaluator', + init_args: { + val_query_dataset: { + class_path: 'HfRerankingQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'mldr-reranking-query', + }, + }, + test_query_dataset: { + class_path: 'HfRerankingQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'mldr-reranking-query', + }, + }, + doc_dataset: { + class_path: 'HfRerankingDocDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'corpus', + name: 'mldr-reranking-corpus', + }, + }, + }, + }, +} diff --git a/src/jmteb/configs/tasks/mldr_retrieval.jsonnet b/src/jmteb/configs/tasks/mldr_retrieval.jsonnet new file mode 100644 index 0000000..71c0bee --- /dev/null +++ b/src/jmteb/configs/tasks/mldr_retrieval.jsonnet @@ -0,0 +1,32 @@ +{ + mldr_retrieval: { + class_path: 'RetrievalEvaluator', + init_args: { + val_query_dataset: { + class_path: 'HfRetrievalQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'mldr-retrieval-query', + }, + }, + test_query_dataset: { + class_path: 'HfRetrievalQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'mldr-retrieval-query', + }, + }, + doc_dataset: { + class_path: 'HfRetrievalDocDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'corpus', + name: 'mldr-retrieval-corpus', + }, + }, + "doc_chunk_size":10000 + }, + }, +} diff --git a/src/jmteb/configs/tasks/nlp_journal_abs_article.jsonnet b/src/jmteb/configs/tasks/nlp_journal_abs_article.jsonnet new file mode 100644 index 0000000..f2c175f --- /dev/null +++ b/src/jmteb/configs/tasks/nlp_journal_abs_article.jsonnet @@ -0,0 +1,31 @@ +{ + nlp_journal_abs_article: { + class_path: 'RetrievalEvaluator', + init_args: { + val_query_dataset: { + class_path: 'HfRetrievalQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'nlp_journal_abs_article-query', + }, + }, + test_query_dataset: { + class_path: 'HfRetrievalQueryDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'nlp_journal_abs_article-query', + }, + }, + 
doc_dataset: { + class_path: 'HfRetrievalDocDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'corpus', + name: 'nlp_journal_abs_article-corpus', + }, + }, + }, + }, +} diff --git a/src/jmteb/configs/tasks/sib200_japanese_classification.jsonnet b/src/jmteb/configs/tasks/sib200_japanese_classification.jsonnet new file mode 100644 index 0000000..852505f --- /dev/null +++ b/src/jmteb/configs/tasks/sib200_japanese_classification.jsonnet @@ -0,0 +1,31 @@ +{ + sib200_japanese_classification: { + class_path: 'ClassificationEvaluator', + init_args: { + train_dataset: { + class_path: 'HfClassificationDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'train', + name: 'sib200_japanese_classification', + }, + }, + val_dataset: { + class_path: 'HfClassificationDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'sib200_japanese_classification', + }, + }, + test_dataset: { + class_path: 'HfClassificationDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'sib200_japanese_classification', + }, + }, + }, + }, +} diff --git a/src/jmteb/configs/tasks/paws_x_ja.jsonnet b/src/jmteb/configs/tasks/sib200_japanese_clustering.jsonnet similarity index 53% rename from src/jmteb/configs/tasks/paws_x_ja.jsonnet rename to src/jmteb/configs/tasks/sib200_japanese_clustering.jsonnet index ee57b72..762d34a 100644 --- a/src/jmteb/configs/tasks/paws_x_ja.jsonnet +++ b/src/jmteb/configs/tasks/sib200_japanese_clustering.jsonnet @@ -1,21 +1,21 @@ { - paws_x_ja: { - class_path: 'PairClassificationEvaluator', + sib200_japanese_clustering: { + class_path: 'ClusteringEvaluator', init_args: { val_dataset: { - class_path: 'HfPairClassificationDataset', + class_path: 'HfClusteringDataset', init_args: { path: 'sbintuitions/JMTEB', split: 'validation', - name: 'paws_x_ja', + name: 'sib200_japanese_clustering', }, }, test_dataset: { - class_path: 'HfPairClassificationDataset', + class_path: 'HfClusteringDataset', init_args: { path: 'sbintuitions/JMTEB', split: 'test', - name: 'paws_x_ja', + name: 'sib200_japanese_clustering', }, }, }, diff --git a/src/jmteb/configs/tasks/wrime_classification.jsonnet b/src/jmteb/configs/tasks/wrime_classification.jsonnet new file mode 100644 index 0000000..7fb68b7 --- /dev/null +++ b/src/jmteb/configs/tasks/wrime_classification.jsonnet @@ -0,0 +1,31 @@ +{ + wrime_classification: { + class_path: 'ClassificationEvaluator', + init_args: { + train_dataset: { + class_path: 'HfClassificationDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'train', + name: 'wrime_classification', + }, + }, + val_dataset: { + class_path: 'HfClassificationDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'validation', + name: 'wrime_classification', + }, + }, + test_dataset: { + class_path: 'HfClassificationDataset', + init_args: { + path: 'sbintuitions/JMTEB', + split: 'test', + name: 'wrime_classification', + }, + }, + }, + }, +} diff --git a/src/jmteb/embedders/__init__.py b/src/jmteb/embedders/__init__.py index f28f038..5a3e19c 100644 --- a/src/jmteb/embedders/__init__.py +++ b/src/jmteb/embedders/__init__.py @@ -2,6 +2,8 @@ from jmteb.embedders.data_parallel_sbert_embedder import ( DataParallelSentenceBertEmbedder, ) +from jmteb.embedders.gemma_embedder import GemmaEmbedder from jmteb.embedders.openai_embedder import OpenAIEmbedder +from jmteb.embedders.plamo_embedder import PlamoEmbedder from jmteb.embedders.sbert_embedder import SentenceBertEmbedder from jmteb.embedders.transformers_embedder import 
TransformersEmbedder diff --git a/src/jmteb/embedders/base.py b/src/jmteb/embedders/base.py index ea078f1..42a5d54 100644 --- a/src/jmteb/embedders/base.py +++ b/src/jmteb/embedders/base.py @@ -144,3 +144,17 @@ def set_output_tensor(self): def set_output_numpy(self): self.convert_to_numpy = True self.convert_to_tensor = False + + def set_max_seq_length(self, max_seq_length: int | None = None) -> None: + if hasattr(self, "max_seq_length"): + self.max_seq_length = max_seq_length + else: + logger.warning("Embedder doesn't have a `max_seq_length` attribute!") + + def reset_max_seq_length(self): + orig_max_seq_length = getattr(self, "_orig_max_length", None) + if not orig_max_seq_length: + logger.warning("Failed to reset `max_seq_length`!") + else: + logger.info(f"Set `max_seq_length` to model default: {orig_max_seq_length}") + self.max_seq_length = orig_max_seq_length diff --git a/src/jmteb/embedders/data_parallel_sbert_embedder.py b/src/jmteb/embedders/data_parallel_sbert_embedder.py index 7416fe4..5d932e2 100644 --- a/src/jmteb/embedders/data_parallel_sbert_embedder.py +++ b/src/jmteb/embedders/data_parallel_sbert_embedder.py @@ -201,6 +201,7 @@ def __init__( ) self.dp_model = DPSentenceTransformer(sbert_model=model) self.model = self.dp_model.sbert + self._orig_max_length = self.model.max_seq_length if max_seq_length: self.model.max_seq_length = max_seq_length self.initital_batch_size = batch_size @@ -258,3 +259,7 @@ def _add_eos_func(self, text: str | list[str]) -> str | list[str]: def get_output_dim(self) -> int: return self.model.get_sentence_embedding_dimension() + + def reset_max_seq_length(self): + logger.info(f"Reset `max_seq_length` to {self._orig_max_length}") + self.model.max_seq_length = self._orig_max_length diff --git a/src/jmteb/embedders/gemma_embedder.py b/src/jmteb/embedders/gemma_embedder.py new file mode 100644 index 0000000..5949845 --- /dev/null +++ b/src/jmteb/embedders/gemma_embedder.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +import numpy as np +import torch +from loguru import logger +from sentence_transformers import SentenceTransformer + +from jmteb.embedders.base import TextEmbedder + + +class GemmaEmbedder(TextEmbedder): + """ + Google EmbeddingGemma model embedder using SentenceTransformers. + + This class supports the EmbeddingGemma models from Google (e.g., embeddinggemma-300m). + It uses SentenceTransformers to load the model and provides specialized encode_query + and encode_document methods for optimal performance in different use cases. + """ + + def __init__( + self, + model_name_or_path: str = "google/embeddinggemma-300m", + batch_size: int = 32, + device: str | None = None, + normalize_embeddings: bool = True, + max_seq_length: int | None = None, + query_mode: bool = False, + add_eos: bool = False, + truncate_dim: int | None = None, + model_kwargs: dict | None = None, + tokenizer_kwargs: dict | None = None, + ) -> None: + """ + Initialize the EmbeddingGemma embedder using SentenceTransformers. 
+ + Args: + model_name_or_path: Path or name of the EmbeddingGemma model + batch_size: Batch size for encoding + device: Device to use ('cuda', 'cpu', or None for auto) + normalize_embeddings: Whether to normalize embeddings (recommended for EmbeddingGemma) + max_seq_length: Maximum sequence length (default: model's max, typically 2048) + query_mode: Whether to use query encoding mode by default + add_eos: Whether to add EOS token to inputs + truncate_dim: Truncate embeddings to this dimension (supports 768, 512, 256, 128) + model_kwargs: Additional kwargs for model loading + tokenizer_kwargs: Additional kwargs for tokenizer loading + """ + model_kwargs = self._model_kwargs_parser(model_kwargs or {}) + + # Initialize SentenceTransformer + self.model = SentenceTransformer( + model_name_or_path, + trust_remote_code=True, + truncate_dim=truncate_dim, + model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs or {}, + ) + + # Store original max length and set new one if provided + self._orig_max_length = self.model.max_seq_length + if max_seq_length: + self.model.max_seq_length = max_seq_length + + self.batch_size = batch_size + self.device = device + self.normalize_embeddings = normalize_embeddings + self.max_seq_length = getattr(self.model, "max_seq_length", None) + self.add_eos = add_eos + self.query_mode = query_mode + + # Set output format based on model kwargs + if model_kwargs and "torch_dtype" in model_kwargs: + self.set_output_tensor() + else: + self.set_output_numpy() + + logger.info(f"Loaded EmbeddingGemma model: {model_name_or_path}") + logger.info(f"Model device: {self.model.device}, Max seq length: {self.max_seq_length}") + + def encode(self, text: str | list[str], prefix: str | None = None, **kwargs) -> np.ndarray | torch.Tensor: + """ + Encode text into embeddings using EmbeddingGemma's specialized methods. + + This method is compatible with the base TextEmbedder interface and works + seamlessly with batch_encode_with_cache. 
+ + Args: + text: Input text(s) to encode + prefix: Prefix to add to texts + **kwargs: Additional arguments (supports query_mode for specialized encoding) + + Returns: + Embeddings as numpy array or torch tensor + """ + if isinstance(text, str): + text = [text] + text_was_str = True + else: + text_was_str = False + + # Check for query_mode in kwargs, otherwise use instance default + use_query_mode = kwargs.get("query_mode", self.query_mode) + + # Apply prefix if provided + if prefix: + text = [prefix + t for t in text] + + if self.add_eos: + text = self._add_eos_func(text) + + # Use specialized encoding methods if available + if hasattr(self.model, "encode_query") and hasattr(self.model, "encode_document"): + if use_query_mode: + embeddings = self.model.encode_query(text) + else: + embeddings = self.model.encode_document(text) + + # Convert to appropriate format + if self.convert_to_numpy and isinstance(embeddings, torch.Tensor): + embeddings = embeddings.cpu().numpy() + elif not self.convert_to_numpy and isinstance(embeddings, np.ndarray): + embeddings = torch.from_numpy(embeddings) + else: + # Fallback to standard SentenceTransformer encode method + embeddings = self.model.encode( + text, + convert_to_numpy=self.convert_to_numpy, + convert_to_tensor=self.convert_to_tensor, + batch_size=self.batch_size, + device=self.device, + normalize_embeddings=self.normalize_embeddings, + **kwargs, + ) + + if text_was_str: + if isinstance(embeddings, np.ndarray) and embeddings.ndim > 1: + embeddings = embeddings[0] + elif isinstance(embeddings, torch.Tensor) and embeddings.ndim > 1: + embeddings = embeddings[0] + + return embeddings + + def encode_queries( + self, queries: str | list[str], prefix: str | None = None, **kwargs + ) -> np.ndarray | torch.Tensor: + """ + Convenience method to encode queries using query mode. + + Args: + queries: Query text(s) to encode + prefix: Prefix to add + **kwargs: Additional arguments + + Returns: + Query embeddings + """ + return self.encode(queries, prefix=prefix, query_mode=True, **kwargs) + + def encode_documents( + self, documents: str | list[str], prefix: str | None = None, **kwargs + ) -> np.ndarray | torch.Tensor: + """ + Convenience method to encode documents using document mode. + + Args: + documents: Document text(s) to encode + prefix: Prefix to add + **kwargs: Additional arguments + + Returns: + Document embeddings + """ + return self.encode(documents, prefix=prefix, query_mode=False, **kwargs) + + def set_query_mode(self, query_mode: bool = True) -> None: + """ + Set the default encoding mode. 
+ + Args: + query_mode: True for query mode, False for document mode + """ + self.query_mode = query_mode + logger.info(f"Set default encoding mode to {'query' if query_mode else 'document'}") + + def _add_eos_func(self, text: str | list[str]) -> str | list[str]: + """Add EOS token to text if available.""" + try: + eos_token = getattr(self.model.tokenizer, "eos_token") + except AttributeError: + return text + + if isinstance(text, str): + return text + eos_token + elif isinstance(text, list): + return [t + eos_token for t in text] + return text + + def get_output_dim(self) -> int: + """Get the dimensionality of output embeddings.""" + return self.model.get_sentence_embedding_dimension() + + def set_max_seq_length(self, max_seq_length: int | None = None) -> None: + """Set maximum sequence length.""" + if max_seq_length: + self.model.max_seq_length = max_seq_length + self.max_seq_length = max_seq_length + logger.info(f"Set max_seq_length to {max_seq_length}") + + def reset_max_seq_length(self) -> None: + """Reset max sequence length to model's original value.""" + try: + logger.info(f"Reset max_seq_length to {self._orig_max_length}") + self.model.max_seq_length = self._orig_max_length + self.max_seq_length = self._orig_max_length + except AttributeError: + logger.warning("Failed to reset max_seq_length - original value not available") + + def __repr__(self) -> str: + return f"GemmaEmbedder(model='{self.model.model_name}', device='{self.model.device}')" diff --git a/src/jmteb/embedders/openai_embedder.py b/src/jmteb/embedders/openai_embedder.py index 6ea8b8f..029b0a5 100644 --- a/src/jmteb/embedders/openai_embedder.py +++ b/src/jmteb/embedders/openai_embedder.py @@ -2,7 +2,11 @@ from dataclasses import dataclass +from os import PathLike +from pathlib import Path + import numpy as np +import tqdm import tiktoken from loguru import logger from openai import OpenAI @@ -14,7 +18,7 @@ class OpenAIEmbedderConfig: max_output_dim: int encoder_name: str - max_token_length: int + max_seq_length: int OPENAI_EMBEDDERS = { @@ -28,7 +32,12 @@ class OpenAIEmbedderConfig: class OpenAIEmbedder(TextEmbedder): """Embedder via OpenAI API.""" - def __init__(self, model: str = "text-embedding-3-small", dim: int | None = None) -> None: + def __init__( + self, + model: str = "text-embedding-3-small", + dim: int | None = None, + max_seq_length: int | None = None, + ) -> None: """Setup. model and dim: see https://platform.openai.com/docs/models/embeddings `text-embedding-3-large` model: max 3072 dim @@ -44,13 +53,19 @@ def __init__(self, model: str = "text-embedding-3-small", dim: int | None = None Args: model (str, optional): Name of an OpenAI embedding model. Defaults to "text-embedding-3-small". dim (int, optional): Output dimension. Defaults to 1536. + max_seq_length (int, optional): Maximum length of sequences. Default to None. """ self.client = OpenAI() # API key written in .env assert model in OPENAI_EMBEDDERS.keys(), f"`model` must be one of {list(OPENAI_EMBEDDERS.keys())}!" 
self.model = model model_config = OPENAI_EMBEDDERS[model] self.encoding = tiktoken.get_encoding(model_config.encoder_name) - self.max_token_length = model_config.max_token_length + self._orig_max_length = model_config.max_seq_length + if max_seq_length: + self.max_seq_length = max_seq_length + else: + self.max_seq_length = model_config.max_seq_length + if not dim or model == "text-embedding-ada-002": self.dim = model_config.max_output_dim else: @@ -70,16 +85,22 @@ def encode(self, text: str | list[str], prefix: str | None = None) -> np.ndarray token_ids: list[int] = self.encode_and_truncate_text(text, prefix) else: token_ids: list[list[int]] = [self.encode_and_truncate_text(t, prefix) for t in text] - result = np.asarray( - [ - data.embedding - for data in self.client.embeddings.create( - input=token_ids, - model=self.model, - **kwargs, - ).data - ] - ) + try: + result = np.asarray( + [ + data.embedding + for data in self.client.embeddings.create( + input=token_ids, + model=self.model, + **kwargs, + ).data + ] + ) + except Exception as e: + logger.error(f"{len(text)=}") + logger.error(f"{len(token_ids)=}") + raise e + if result.shape[0] == 1: return result.reshape(-1) return result @@ -94,4 +115,86 @@ def encode_and_truncate_text(self, text: str, prefix: str | None = None) -> list text = " " logger.warning("Found empty string!") # Ignore prefix in OpenAIEmbedder - return self.encoding.encode(text)[: self.max_token_length] + return self.encoding.encode(text)[: self.max_seq_length] + + def _batch_encode_and_save_on_disk( + self, + text_list: list[str], + save_path: str | PathLike[str], + prefix: str | None = None, + batch_size: int = 256, + dtype: str = "float32", + **kwargs, + ) -> np.memmap: + """ + Encode a list of texts and save the embeddings on disk using memmap. + + Args: + text_list (list[str]): list of texts + save_path (str): path to save the embeddings + prefix (str, optional): the prefix to use for encoding. Default to None. + dtype (str, optional): data type. Defaults to "float32". + batch_size (int): batch size. Defaults to 64. + """ + + batch_size = 512 + num_samples = len(text_list) + output_dim = self.get_output_dim() + embeddings = np.memmap(save_path, dtype=dtype, mode="w+", shape=(num_samples, output_dim)) + + with tqdm.tqdm(total=num_samples, desc="Encoding") as pbar: + for i in range(0, num_samples, batch_size): + batch = text_list[i : i + batch_size] + try: + batch_embeddings: np.ndarray = self.encode(batch, prefix=prefix, **kwargs) + except Exception: + logger.error(f"{batch_size=}, {len(batch)=}") + logger.warning("Batch too large, retrying with batch size 16") + # Retry with batch size 16 + small_batch_size = 16 + batch_embeddings_list = [] + for j in range(0, len(batch), small_batch_size): + small_batch = batch[j : j + small_batch_size] + small_batch_embeddings = self.encode(small_batch, prefix=prefix, **kwargs) + batch_embeddings_list.append(small_batch_embeddings) + batch_embeddings = np.vstack(batch_embeddings_list) + embeddings[i : i + batch_size] = batch_embeddings + pbar.update(len(batch)) + + embeddings.flush() + return np.memmap(save_path, dtype=dtype, mode="r", shape=(num_samples, output_dim)) + + def batch_encode_with_cache( + self, + text_list: list[str], + prefix: str | None = None, + cache_path: str | PathLike[str] | None = None, + overwrite_cache: bool = False, + dtype: str = "float32", + **kwargs, + ) -> np.ndarray: + """ + Encode a list of texts and save the embeddings on disk using memmap if cache_path is provided. 
+ + Args: + text_list (list[str]): list of texts + prefix (str, optional): the prefix to use for encoding. Default to None. + cache_path (str, optional): path to save the embeddings. Defaults to None. + overwrite_cache (bool, optional): whether to overwrite the cache. Defaults to False. + dtype (str, optional): data type. Defaults to "float32". + """ + + logger.warning(f"Encoding with OpenAI embedder. {kwargs=}") + if cache_path is None: + logger.info("Encoding embeddings") + return self.encode(text_list, prefix=prefix, **kwargs) + + if Path(cache_path).exists() and not overwrite_cache: + logger.info(f"Loading embeddings from {cache_path}") + return np.memmap(cache_path, dtype=dtype, mode="r", shape=(len(text_list), self.get_output_dim())) + + logger.info(f"Encoding and saving embeddings to {cache_path}") + embeddings = self._batch_encode_and_save_on_disk( + text_list, cache_path, prefix=prefix, batch_size=self._chunk_size, dtype=dtype, **kwargs + ) + return embeddings diff --git a/src/jmteb/embedders/plamo_embedder.py b/src/jmteb/embedders/plamo_embedder.py new file mode 100644 index 0000000..f2c6755 --- /dev/null +++ b/src/jmteb/embedders/plamo_embedder.py @@ -0,0 +1,251 @@ +import numpy as np +import torch +from loguru import logger +from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer + +from jmteb.embedders.base import TextEmbedder + + +class PlamoEmbedder(TextEmbedder): + """ + PLaMO embedding model embedder with multi-GPU support. + + This class supports the PLaMO-Embedding-1B model from Preferred Networks. + It uses the model's specialized encode_query and encode_document methods + for optimal performance in different use cases. + """ + + def __init__( + self, + model_name_or_path: str = "pfnet/plamo-embedding-1b", + batch_size: int = 2, + device: str | None = None, + normalize_embeddings: bool = False, + max_seq_length: int | None = None, + query_mode: bool = False, + model_kwargs: dict = {}, + tokenizer_kwargs: dict = {}, + ) -> None: + """ + Initialize the PLaMO embedder. 
+ + Args: + model_name_or_path: Path or name of the PLaMO model + batch_size: Batch size for encoding + device: Device to use ('cuda', 'cpu', or None for auto) + normalize_embeddings: Whether to normalize embeddings + max_seq_length: Maximum sequence length (default: model's max) + query_mode: Whether to use query encoding mode by default + model_kwargs: Additional kwargs for model loading + tokenizer_kwargs: Additional kwargs for tokenizer loading + """ + model_kwargs = self._model_kwargs_parser(model_kwargs) + + # Load model and tokenizer with trust_remote_code=True for PLaMO + self.model: PreTrainedModel = AutoModel.from_pretrained( + model_name_or_path, trust_remote_code=True, **model_kwargs + ) + self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, trust_remote_code=True, **tokenizer_kwargs + ) + + self.batch_size = batch_size + self.normalize_embeddings = normalize_embeddings + self.query_mode = query_mode + + # Set up device + if not device and torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = device or "cpu" + + # Move model to device + self.model.to(self.device) + + # Enable simple multi-GPU support with DataParallel if multiple GPUs available + if torch.cuda.device_count() > 1 and self.device == "cuda": + logger.info(f"Using {torch.cuda.device_count()} GPUs with DataParallel") + self.model = torch.nn.DataParallel(self.model) + self.is_data_parallel = True + self.distributed_state = True # For compatibility with tests + else: + self.is_data_parallel = False + self.distributed_state = None + + # Store the device for easy access + self.model_device = next(self.model.parameters()).device + logger.info(f"Model device: {self.model_device}, GPU count: {torch.cuda.device_count()}") + + # Set up sequence length + self._orig_max_length = getattr( + self.model.config if not self.is_data_parallel else self.model.module.config, + "max_position_embeddings", + 4096, + ) + self.max_seq_length = max_seq_length or self._orig_max_length + + # PLaMO-Embedding-1B has 2048 embedding dimensions + self.output_dim = getattr( + self.model.config if not self.is_data_parallel else self.model.module.config, "hidden_size", 2048 + ) + + # Set output format based on model kwargs + if "torch_dtype" in model_kwargs: + self.set_output_tensor() + else: + self.set_output_numpy() + + def get_output_dim(self) -> int: + """Get the dimensionality of output embeddings.""" + return self.output_dim + + def encode(self, text: str | list[str], prefix: str | None = None, **kwargs) -> np.ndarray | torch.Tensor: + """ + Encode text into embeddings using PLaMO's specialized methods. + + This method is compatible with the base TextEmbedder interface and works + seamlessly with batch_encode_with_cache. 
+ + Args: + text: Input text(s) to encode + prefix: Prefix to add to texts + **kwargs: Additional arguments (supports query_mode for specialized encoding) + + Returns: + Embeddings as numpy array or torch tensor + """ + if isinstance(text, str): + text = [text] + text_was_str = True + else: + text_was_str = False + + # Check for query_mode in kwargs, otherwise use instance default + use_query_mode = kwargs.get("query_mode", self.query_mode) + + # Apply prefix if provided + if prefix: + text = [prefix + t for t in text] + + # Encode using PLaMO's specialized methods + with torch.inference_mode(): + embeddings = self._encode_batch(text, use_query_mode) + + # Apply normalization if requested + if self.normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + if text_was_str: + res = embeddings.view(-1) + else: + res = embeddings + + if self.convert_to_numpy: + return res.cpu().numpy() if res.is_cuda else res.numpy() + else: + return res + + def _encode_batch(self, text: list[str], query_mode: bool = False) -> torch.Tensor: + """ + Encode a batch of texts using PLaMO's specialized methods with memory optimization. + + Args: + text: List of texts to encode + query_mode: Whether to use query or document encoding + + Returns: + Batch embeddings as torch tensor + """ + if len(text) == 0: + return torch.empty(0, self.output_dim, device=self.model_device) + + # Process in reasonable chunks for PLaMO + chunk_size = self.batch_size + all_embeddings = [] + + # Get the actual model (handle DataParallel wrapper) + actual_model = self.model.module if self.is_data_parallel else self.model + + with torch.inference_mode(): + for i in range(0, len(text), chunk_size): + chunk = text[i : i + chunk_size] + + try: + if query_mode: + # Use PLaMO's encode_query method for queries + chunk_embeddings = actual_model.encode_query(chunk, self.tokenizer) + else: + # Use PLaMO's encode_document method for documents + chunk_embeddings = actual_model.encode_document(chunk, self.tokenizer) + + # Keep embeddings on device + all_embeddings.append(chunk_embeddings) + + except torch.cuda.OutOfMemoryError: + # If still OOM, try processing one by one + logger.warning(f"OOM with chunk size {len(chunk)}, falling back to single item processing") + torch.cuda.empty_cache() + + for single_text in chunk: + if query_mode: + single_embedding = actual_model.encode_query([single_text], self.tokenizer) + else: + single_embedding = actual_model.encode_document([single_text], self.tokenizer) + all_embeddings.append(single_embedding) + torch.cuda.empty_cache() + + # Concatenate all embeddings + if all_embeddings: + return torch.cat(all_embeddings, dim=0) + else: + return torch.empty(0, self.output_dim, device=self.model_device) + + def encode_queries( + self, queries: str | list[str], prefix: str | None = None, **kwargs + ) -> np.ndarray | torch.Tensor: + """ + Convenience method to encode queries using query mode. + + Args: + queries: Query text(s) to encode + prefix: Prefix to add + **kwargs: Additional arguments + + Returns: + Query embeddings + """ + return self.encode(queries, prefix=prefix, query_mode=True, **kwargs) + + def encode_documents( + self, documents: str | list[str], prefix: str | None = None, **kwargs + ) -> np.ndarray | torch.Tensor: + """ + Convenience method to encode documents using document mode. 
+ + Args: + documents: Document text(s) to encode + prefix: Prefix to add + **kwargs: Additional arguments + + Returns: + Document embeddings + """ + return self.encode(documents, prefix=prefix, query_mode=False, **kwargs) + + def set_query_mode(self, query_mode: bool = True) -> None: + """ + Set the default encoding mode. + + Args: + query_mode: True for query mode, False for document mode + """ + self.query_mode = query_mode + logger.info(f"Set default encoding mode to {'query' if query_mode else 'document'}") + + def reset_max_seq_length(self) -> None: + """Reset max sequence length to model's original value.""" + if hasattr(self, "_orig_max_length") and self._orig_max_length: + self.max_seq_length = self._orig_max_length + logger.info(f"Reset max_seq_length to {self._orig_max_length}") + else: + logger.warning("Failed to reset max_seq_length - original value not available") diff --git a/src/jmteb/embedders/sbert_embedder.py b/src/jmteb/embedders/sbert_embedder.py index ba33a36..892f703 100644 --- a/src/jmteb/embedders/sbert_embedder.py +++ b/src/jmteb/embedders/sbert_embedder.py @@ -1,6 +1,7 @@ from __future__ import annotations import numpy as np +from loguru import logger from sentence_transformers import SentenceTransformer from jmteb.embedders.base import TextEmbedder @@ -29,6 +30,7 @@ def __init__( model_kwargs=model_kwargs, # https://github.com/UKPLab/sentence-transformers/blob/84f69fee6dcde023f46a8807e89bc99a7700ba82/sentence_transformers/SentenceTransformer.py#L81-L105 # noqa: E501 tokenizer_kwargs=tokenizer_kwargs, ) + self._orig_max_length = self.model.max_seq_length if max_seq_length: self.model.max_seq_length = max_seq_length @@ -70,3 +72,10 @@ def _add_eos_func(self, text: str | list[str]) -> str | list[str]: def get_output_dim(self) -> int: return self.model.get_sentence_embedding_dimension() + + def reset_max_seq_length(self): + try: + logger.info(f"Reset `max_seq_length` to {self._orig_max_length}") + self.model.max_seq_length = self._orig_max_length + except AttributeError: + pass diff --git a/src/jmteb/embedders/transformers_embedder.py b/src/jmteb/embedders/transformers_embedder.py index 0592061..721e0c9 100644 --- a/src/jmteb/embedders/transformers_embedder.py +++ b/src/jmteb/embedders/transformers_embedder.py @@ -48,6 +48,7 @@ def __init__( logger.info(f"{self.model.device=}, {torch.cuda.device_count()=}") self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **tokenizer_kwargs) + self._orig_max_length = getattr(self.model, "max_seq_length", None) self.max_seq_length = getattr(self.model, "max_seq_length", None) if max_seq_length: self.max_seq_length = max_seq_length @@ -135,7 +136,9 @@ def _encode_batch(self, text: list[str], prefix: str | None = None) -> torch.Ten if self.add_eos: text = self._add_eos_func(text) - encoded_input = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(self.model.device) + encoded_input = self.tokenizer( + text, padding=True, truncation=True, return_tensors="pt", max_length=self.max_seq_length + ).to(self.model.device) model_output = self.model(**encoded_input) last_hidden_states = model_output["last_hidden_state"] features = { diff --git a/src/jmteb/evaluators/classification/evaluator.py b/src/jmteb/evaluators/classification/evaluator.py index c2b8836..bb3a4ca 100644 --- a/src/jmteb/evaluators/classification/evaluator.py +++ b/src/jmteb/evaluators/classification/evaluator.py @@ -66,13 +66,22 @@ def __call__( if cache_dir is not None: Path(cache_dir).mkdir(parents=True, 
exist_ok=True) + # Auto-optimize for PlamoEmbedder if no explicit kwargs provided + encode_kwargs = self.encode_kwargs.copy() + + # Check if this is a PlamoEmbedder and set optimal encoding mode + if model.__class__.__name__ in ("PlamoEmbedder", "GemmaEmbedder"): + if "query_mode" not in encode_kwargs: + encode_kwargs["query_mode"] = False # Use document mode for classification texts + logger.info(f"Auto-optimized {model.__class__.__name__}: query_mode=False for classification texts") + logger.info("Encoding training and validation sentences...") X_train = model.batch_encode_with_cache( [item.text for item in self.train_dataset], prefix=self.prefix, cache_path=Path(cache_dir) / "train_embeddings.bin" if cache_dir is not None else None, overwrite_cache=overwrite_cache, - **self.encode_kwargs, + **encode_kwargs, ) y_train = [item.label for item in self.train_dataset] @@ -81,7 +90,7 @@ def __call__( prefix=self.prefix, cache_path=Path(cache_dir) / "val_embeddings.bin" if cache_dir is not None else None, overwrite_cache=overwrite_cache, - **self.encode_kwargs, + **encode_kwargs, ) y_val = [item.label for item in self.val_dataset] @@ -95,7 +104,7 @@ def __call__( prefix=self.prefix, cache_path=Path(cache_dir) / "test_embeddings.bin" if cache_dir is not None else None, overwrite_cache=overwrite_cache, - **self.encode_kwargs, + **encode_kwargs, ) y_test = [item.label for item in self.test_dataset] diff --git a/src/jmteb/evaluators/clustering/evaluator.py b/src/jmteb/evaluators/clustering/evaluator.py index 2b8cdf2..bbce269 100644 --- a/src/jmteb/evaluators/clustering/evaluator.py +++ b/src/jmteb/evaluators/clustering/evaluator.py @@ -14,6 +14,7 @@ MiniBatchKMeans, ) from sklearn.metrics import homogeneity_completeness_v_measure +from sklearn.preprocessing import normalize from jmteb.embedders.base import TextEmbedder from jmteb.evaluators.base import EmbeddingEvaluator, EvaluationResults @@ -57,13 +58,22 @@ def __call__( if cache_dir is not None: Path(cache_dir).mkdir(parents=True, exist_ok=True) + # Auto-optimize for PlamoEmbedder if no explicit kwargs provided + encode_kwargs = self.encode_kwargs.copy() + + # Check if this is a PlamoEmbedder and set optimal encoding mode + if model.__class__.__name__ in ("PlamoEmbedder", "GemmaEmbedder"): + if "query_mode" not in encode_kwargs: + encode_kwargs["query_mode"] = False # Use document mode for clustering texts + logger.info(f"Auto-optimized {model.__class__.__name__}: query_mode=False for clustering texts") + logger.info("Converting validation data to embeddings...") val_embeddings = model.batch_encode_with_cache( [item.text for item in self.val_dataset], prefix=self.prefix, cache_path=Path(cache_dir) / "val_embeddings.bin" if cache_dir is not None else None, overwrite_cache=overwrite_cache, - **self.encode_kwargs, + **encode_kwargs, ) val_labels = [item.label for item in self.val_dataset] @@ -77,7 +87,7 @@ def __call__( prefix=self.prefix, cache_path=Path(cache_dir) / "test_embeddings.bin" if cache_dir is not None else None, overwrite_cache=overwrite_cache, - **self.encode_kwargs, + **encode_kwargs, ) test_labels = [item.label for item in self.test_dataset] @@ -127,7 +137,19 @@ def __call__( def _evaluate_clustering_model( embeddings: np.ndarray, y_true: list[int], clustering_model: ClusterMixin ) -> tuple[dict[str, float], list[int]]: - y_pred = clustering_model.fit_predict(embeddings) + try: + # First try without normalization to preserve original behavior when possible + y_pred = clustering_model.fit_predict(embeddings) + except ValueError as e: 
+ # If overflow error occurs, apply normalization and retry + if "infinity" in str(e).lower() or "too large" in str(e).lower(): + logger.warning(f"Overflow detected in clustering, applying L2 normalization: {e}") + embeddings_normalized = normalize(embeddings, norm="l2") + y_pred = clustering_model.fit_predict(embeddings_normalized) + else: + # Re-raise if it's a different ValueError + raise e + h_score, c_score, v_score = homogeneity_completeness_v_measure( labels_pred=y_pred, labels_true=np.array(y_true) ) diff --git a/src/jmteb/evaluators/pair_classification/evaluator.py b/src/jmteb/evaluators/pair_classification/evaluator.py index ef466bf..8fba017 100644 --- a/src/jmteb/evaluators/pair_classification/evaluator.py +++ b/src/jmteb/evaluators/pair_classification/evaluator.py @@ -49,8 +49,19 @@ def __call__( if cache_dir is not None: Path(cache_dir).mkdir(parents=True, exist_ok=True) + # Auto-optimize for PlamoEmbedder if no explicit kwargs provided + encode_kwargs = self.encode_kwargs.copy() + + # Check if this is a PlamoEmbedder and set optimal encoding mode + if model.__class__.__name__ in ("PlamoEmbedder", "GemmaEmbedder"): + if "query_mode" not in encode_kwargs: + encode_kwargs["query_mode"] = False # Use document mode for pair classification texts + from loguru import logger + + logger.info(f"Auto-optimized {model.__class__.__name__}: query_mode=False for pair classification texts") + val_embeddings1, val_embeddings2, val_golden_labels = self._convert_to_embeddings( - model, self.val_dataset, "dev", overwrite_cache, cache_dir + model, self.val_dataset, "dev", overwrite_cache, cache_dir, encode_kwargs ) if self.val_dataset == self.test_dataset: test_embeddings1, test_embeddings2, test_golden_labels = ( @@ -60,7 +71,7 @@ def __call__( ) else: test_embeddings1, test_embeddings2, test_golden_labels = self._convert_to_embeddings( - model, self.test_dataset, "test", overwrite_cache, cache_dir + model, self.test_dataset, "test", overwrite_cache, cache_dir, encode_kwargs ) val_results = {} @@ -119,20 +130,24 @@ def _convert_to_embeddings( split: str = "test", overwrite_cache: bool = False, cache_dir: str | None = None, + encode_kwargs: dict | None = None, ) -> tuple[np.ndarray, np.ndarray, list[float]]: + if encode_kwargs is None: + encode_kwargs = self.encode_kwargs + embeddings1 = model.batch_encode_with_cache( [item.sentence1 for item in dataset], prefix=self.sentence1_prefix, cache_path=Path(cache_dir) / f"{split}_embeddings1.bin" if cache_dir is not None else None, overwrite_cache=overwrite_cache, - **self.encode_kwargs, + **encode_kwargs, ) embeddings2 = model.batch_encode_with_cache( [item.sentence2 for item in dataset], prefix=self.sentence2_prefix, cache_path=Path(cache_dir) / f"{split}_embeddings2.bin" if cache_dir is not None else None, overwrite_cache=overwrite_cache, - **self.encode_kwargs, + **encode_kwargs, ) golden_labels = [item.label for item in dataset] return embeddings1, embeddings2, golden_labels diff --git a/src/jmteb/evaluators/reranking/evaluator.py b/src/jmteb/evaluators/reranking/evaluator.py index 144ed36..0d1be95 100644 --- a/src/jmteb/evaluators/reranking/evaluator.py +++ b/src/jmteb/evaluators/reranking/evaluator.py @@ -38,6 +38,8 @@ class RerankingEvaluator(EmbeddingEvaluator): query_prefix (str | None): prefix for queries. Defaults to None. doc_prefix (str | None): prefix for documents. Defaults to None. log_predictions (bool): whether to log predictions of each datapoint. Defaults to False. 
+        force_max_length (bool): whether to overwrite the global max_length with model's maximum token length.
+            Defaults to False.
         top_n_docs_to_log (int): log only top n documents. Defaults to 5.
         query_encode_kwargs (dict): kwargs passed to embedder's encode function when encoding queries. Defaults to {}.
         doc_encode_kwargs (dict): kwargs passed to embedder's encode function when encoding documents. Defaults to {}.
@@ -53,6 +55,7 @@ def __init__(
         doc_prefix: str | None = None,
         log_predictions: bool = False,
         top_n_docs_to_log: int = 5,
+        force_max_length: bool = False,
         query_encode_kwargs: dict = {},
         doc_encode_kwargs: dict = {},
     ) -> None:
@@ -65,6 +68,7 @@
         self.doc_prefix = doc_prefix
         self.log_predictions = log_predictions
         self.top_n_docs_to_log = top_n_docs_to_log
+        self.force_max_length = force_max_length
         self.query_encode_kwargs = query_encode_kwargs
         self.doc_encode_kwargs = doc_encode_kwargs

@@ -75,15 +79,33 @@ def __call__(
         overwrite_cache: bool = False,
     ) -> EvaluationResults:
         model.set_output_tensor()
+        if self.force_max_length:
+            model.reset_max_seq_length()
+
         if cache_dir is not None:
             Path(cache_dir).mkdir(parents=True, exist_ok=True)

+        # Auto-optimize for PlamoEmbedder/GemmaEmbedder if no explicit kwargs are provided
+        query_kwargs = self.query_encode_kwargs.copy()
+        doc_kwargs = self.doc_encode_kwargs.copy()
+
+        # Check if this is a PlamoEmbedder or GemmaEmbedder and set the optimal encoding modes
+        if model.__class__.__name__ in ("PlamoEmbedder", "GemmaEmbedder"):
+            if "query_mode" not in query_kwargs:
+                query_kwargs["query_mode"] = True  # Use query mode for queries
+            if "query_mode" not in doc_kwargs:
+                doc_kwargs["query_mode"] = False  # Use document mode for docs
+            logger.info(
+                f"Auto-optimized {model.__class__.__name__}: query_mode=True for queries, "
+                "query_mode=False for documents"
+            )
+
         val_query_embeddings = model.batch_encode_with_cache(
             text_list=[item.query for item in self.val_query_dataset],
             prefix=self.query_prefix,
             cache_path=Path(cache_dir) / "val_query.bin" if cache_dir is not None else None,
             overwrite_cache=overwrite_cache,
-            **self.query_encode_kwargs,
+            **query_kwargs,
         )
         if self.val_query_dataset == self.test_query_dataset:
             test_query_embeddings = val_query_embeddings
@@ -93,14 +115,14 @@
             prefix=self.query_prefix,
             cache_path=Path(cache_dir) / "test_query.bin" if cache_dir is not None else None,
             overwrite_cache=overwrite_cache,
-            **self.query_encode_kwargs,
+            **query_kwargs,
         )
         doc_embeddings = model.batch_encode_with_cache(
             text_list=[item.text for item in self.doc_dataset],
             prefix=self.doc_prefix,
             cache_path=Path(cache_dir) / "corpus.bin" if cache_dir is not None else None,
             overwrite_cache=overwrite_cache,
-            **self.doc_encode_kwargs,
+            **doc_kwargs,
         )

         logger.info("Start reranking")
@@ -211,8 +233,6 @@ def _format_predictions(
             pred_docs: list[RerankingDoc] = [
                 doc_dataset[doc_dataset.docid_to_idx[pred_docid]] for pred_docid in pred_docids
             ]
-            logger.info(f"{golden_docs=}")
-            logger.info(f"{pred_docs=}")
             prediction = RerankingPrediction(
                 query=q.query,
                 relevant_docs=golden_docs,
diff --git a/src/jmteb/evaluators/retrieval/evaluator.py b/src/jmteb/evaluators/retrieval/evaluator.py
index 2fd6a21..fc7476e 100644
--- a/src/jmteb/evaluators/retrieval/evaluator.py
+++ b/src/jmteb/evaluators/retrieval/evaluator.py
@@ -41,6 +43,8 @@ class RetrievalEvaluator(EmbeddingEvaluator):
         query_prefix (str | None): prefix for queries. Defaults to None.
         doc_prefix (str | None): prefix for documents. Defaults to None.
         log_predictions (bool): whether to log predictions of each datapoint. Defaults to False.
+        force_max_length (bool): whether to overwrite the global max_length with model's maximum token length.
+            Defaults to False.
         top_n_docs_to_log (int): log only top n documents that are predicted as relevant. Defaults to 5.
         query_encode_kwargs (dict): kwargs passed to embedder's encode function when encoding queries. Defaults to {}.
         doc_encode_kwargs (dict): kwargs passed to embedder's encode function when encoding documents. Defaults to {}.
@@ -58,6 +60,7 @@ def __init__(
         doc_prefix: str | None = None,
         log_predictions: bool = False,
         top_n_docs_to_log: int = 5,
+        force_max_length: bool = False,
         query_encode_kwargs: dict = {},
         doc_encode_kwargs: dict = {},
     ) -> None:
@@ -67,7 +70,7 @@

         self.doc_chunk_size = doc_chunk_size

-        self.accuracy_at_k = accuracy_at_k or [1, 3, 5, 10]
+        self.accuracy_at_k = accuracy_at_k or [1, 3, 5, 10, 20, 30, 50]
         self.ndcg_at_k = ndcg_at_k or [10]
         self.max_top_k = max(sum([self.accuracy_at_k, self.ndcg_at_k], []))
         self.main_metric = f"ndcg@{self.ndcg_at_k[0]}"
@@ -76,6 +79,7 @@
         self.doc_prefix = doc_prefix
         self.log_predictions = log_predictions
         self.top_n_docs_to_log = top_n_docs_to_log
+        self.force_max_length = force_max_length
         self.query_encode_kwargs = query_encode_kwargs
         self.doc_encode_kwargs = doc_encode_kwargs

@@ -86,15 +90,32 @@ def __call__(
         overwrite_cache: bool = False,
     ) -> EvaluationResults:
         model.set_output_tensor()
+        if self.force_max_length:
+            model.reset_max_seq_length()
         if cache_dir is not None:
             Path(cache_dir).mkdir(parents=True, exist_ok=True)

+        # Auto-optimize for PlamoEmbedder/GemmaEmbedder if no explicit kwargs are provided
+        query_kwargs = self.query_encode_kwargs.copy()
+        doc_kwargs = self.doc_encode_kwargs.copy()
+
+        # Check if this is a PlamoEmbedder or GemmaEmbedder and set the optimal encoding modes
+        if model.__class__.__name__ in ("PlamoEmbedder", "GemmaEmbedder"):
+            if "query_mode" not in query_kwargs:
+                query_kwargs["query_mode"] = True  # Use query mode for queries
+            if "query_mode" not in doc_kwargs:
+                doc_kwargs["query_mode"] = False  # Use document mode for docs
+            logger.info(
+                f"Auto-optimized {model.__class__.__name__}: query_mode=True for queries, "
+                "query_mode=False for documents"
+            )
+
         val_query_embeddings = model.batch_encode_with_cache(
             text_list=[item.query for item in self.val_query_dataset],
             prefix=self.query_prefix,
             cache_path=Path(cache_dir) / "val_query.bin" if cache_dir is not None else None,
             overwrite_cache=overwrite_cache,
-            **self.query_encode_kwargs,
+            **query_kwargs,
         )
         if self.val_query_dataset == self.test_query_dataset:
             test_query_embeddings = val_query_embeddings
@@ -104,7 +125,7 @@
             prefix=self.query_prefix,
             cache_path=Path(cache_dir) / "test_query.bin" if cache_dir is not None else None,
             overwrite_cache=overwrite_cache,
-            **self.query_encode_kwargs,
+            **query_kwargs,
         )

         doc_embeddings = model.batch_encode_with_cache(
@@ -112,7 +133,7 @@
             prefix=self.doc_prefix,
             cache_path=Path(cache_dir) / "corpus.bin" if cache_dir is not None else None,
             overwrite_cache=overwrite_cache,
-            **self.doc_encode_kwargs,
+            **doc_kwargs,
         )

         logger.info("Start retrieval")
diff --git a/src/jmteb/evaluators/sts/evaluator.py b/src/jmteb/evaluators/sts/evaluator.py
index 380ceea..f2cbd0c 100644
--- a/src/jmteb/evaluators/sts/evaluator.py
+++ b/src/jmteb/evaluators/sts/evaluator.py
@@ -8,6 +8,7 @@

 import numpy as np
 import torch
+from loguru import logger
 from scipy.stats import pearsonr, spearmanr
 from torch import Tensor

@@ -52,8 +53,17 @@ def __call__(
         if cache_dir is not None:
             Path(cache_dir).mkdir(parents=True, exist_ok=True)

+        # Auto-optimize for PlamoEmbedder if no explicit kwargs provided
+        encode_kwargs = self.encode_kwargs.copy()
+
+        # # Check if this is a PlamoEmbedder and set optimal encoding mode
+        # if model.__class__.__name__ == "PlamoEmbedder":
+        #     if "query_mode" not in encode_kwargs:
+        #         encode_kwargs["query_mode"] = False  # Use document mode for STS texts
+        #         logger.info("Auto-optimized PlamoEmbedder: query_mode=False for STS texts")
+
         val_embeddings1, val_embeddings2, val_golden_scores = self._convert_to_embeddings(
-            model, self.val_dataset, "dev", overwrite_cache, cache_dir
+            model, self.val_dataset, "dev", overwrite_cache, cache_dir, encode_kwargs
         )
         if self.val_dataset == self.test_dataset:
             test_embeddings1, test_embeddings2, test_golden_scores = (
@@ -62,7 +72,7 @@ def __call__(
                 val_golden_scores,
             )
         test_embeddings1, test_embeddings2, test_golden_scores = self._convert_to_embeddings(
-            model, self.test_dataset, "test", overwrite_cache, cache_dir
+            model, self.test_dataset, "test", overwrite_cache, cache_dir, encode_kwargs
         )

         similarity_functions = {
@@ -146,20 +156,24 @@ def _convert_to_embeddings(
         split: str = "test",
         overwrite_cache: bool = False,
         cache_dir: str | None = None,
+        encode_kwargs: dict | None = None,
     ) -> tuple[Tensor, Tensor, list[float]]:
+        if encode_kwargs is None:
+            encode_kwargs = self.encode_kwargs
+
         embeddings1 = model.batch_encode_with_cache(
             [item.sentence1 for item in dataset],
             prefix=self.sentence1_prefix,
             cache_path=Path(cache_dir) / f"{split}_embeddings1.bin" if cache_dir is not None else None,
             overwrite_cache=overwrite_cache,
-            **self.encode_kwargs,
+            **encode_kwargs,
         )
         embeddings2 = model.batch_encode_with_cache(
             [item.sentence2 for item in dataset],
             prefix=self.sentence2_prefix,
             cache_path=Path(cache_dir) / f"{split}_embeddings2.bin" if cache_dir is not None else None,
             overwrite_cache=overwrite_cache,
-            **self.encode_kwargs,
+            **encode_kwargs,
         )
         device = "cuda" if torch.cuda.is_available() else "cpu"
         embeddings1 = convert_to_tensor(embeddings1, device)
diff --git a/src/jmteb/utils/score_recorder.py b/src/jmteb/utils/score_recorder.py
index afbf22c..361c809 100644
--- a/src/jmteb/utils/score_recorder.py
+++ b/src/jmteb/utils/score_recorder.py
@@ -56,8 +56,21 @@ def record_predictions(self, results: EvaluationResults, dataset_name: str, task
     def record_summary(self):
        if not self.save_dir:
            return
-        summary: dict[str, dict[str, dict[str, float]]] = defaultdict(dict)
+
+        summary_path = Path(self.save_dir) / "summary.json"
+
+        # Load existing summary if it exists
+        if summary_path.exists():
+            with open(summary_path, "r") as fin:
+                summary = json.load(fin)
+        else:
+            summary = {}
+
+        # Merge new results into existing summary
         for task_name, task_scores in self.scores.items():
+            if task_name not in summary:
+                summary[task_name] = {}
             for dataset_name, results in self.scores[task_name].items():
                 summary[task_name][dataset_name] = {results.metric_name: results.metric_value}
-        self.save_to_json(summary, Path(self.save_dir) / "summary.json")
+
+        self.save_to_json(summary, summary_path)
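With the score recorder change above, record_summary now merges freshly computed scores into any existing summary.json instead of rebuilding the file from scratch, so results from separate runs (for example, one task at a time) accumulate in a single summary. A minimal standalone sketch of that merge behaviour follows; merge_summary and the sample scores are illustrative stand-ins, not part of the JMTEB API.

import json
from pathlib import Path


def merge_summary(save_dir: str, new_scores: dict) -> None:
    # Merge newly computed scores into summary.json, keeping results from earlier runs.
    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    summary_path = out_dir / "summary.json"

    # Start from the existing summary if one is present, otherwise from an empty dict.
    summary = json.loads(summary_path.read_text()) if summary_path.exists() else {}

    # Only the (task, dataset) pairs covered by this run are overwritten.
    for task_name, datasets in new_scores.items():
        summary.setdefault(task_name, {}).update(datasets)

    summary_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False))


# e.g. an STS-only run adds its scores to a summary written by an earlier retrieval-only run
merge_summary("results", {"STS": {"jsts": {"spearman": 0.76}}})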