diff --git a/huggingface_data/huggingface_datasets/updated_dataset_index_file.json b/huggingface_data/huggingface_datasets/updated_dataset_index_file.json new file mode 100644 index 000000000..d14ce0541 --- /dev/null +++ b/huggingface_data/huggingface_datasets/updated_dataset_index_file.json @@ -0,0 +1 @@ +{"lmqg/qg_itquad": {"dataset_name": "lmqg/qg_itquad", "description": "[SQuAD-it](https://huggingface.co/datasets/squad_it) dataset for question generation (QG) task.", "downloads": 66, "configs": {"qg_itquad": {"config_name": "qg_itquad", "sample_row": "{\"answer\": \"\\\"Carlo III\\\"\", \"paragraph_question\": \"\\\"question: Il figlio di chi \\\\u00e8 morto sulla str...\", \"question\": \"\\\"Il figlio di chi \\\\u00e8 morto sulla strada per Pa...\", \"sentence\": \"\\\"Carlo III scelse Palermo per la sua incoronazione...\", \"paragraph\": \"\\\"Dopo il trattato di Utrecht (1713), la Sicilia fu...\", \"sentence_answer\": \"\\\" Carlo III scelse Palermo per la sua inc...\", \"paragraph_answer\": \"\\\"Dopo il trattato di Utrecht (1713), la Sicilia fu...\", \"paragraph_sentence\": \"\\\"Dopo il trattato di Utrecht (1713), la Sicilia fu...\", \"paragraph_id\": \"\\\"572963fb3f37b3190047831b\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD-it](https://huggingface.co/datasets/squad_it) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_itquad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:squad_es", "language:it", 
"question-generation"], "is_gated": false}, "lmqg/qg_dequad": {"dataset_name": "lmqg/qg_dequad", "description": "[GermanSQuAD](https://huggingface.co/datasets/deepset/germanquad) dataset for question generation (QG) task.", "downloads": 96, "configs": {"qg_dequad": {"config_name": "qg_dequad", "sample_row": "{\"answer\": \"\\\"UNESCO-Welterbe\\\"\", \"paragraph_question\": \"\\\"question: Welche Auszeichnung hat die Wartburg 19...\", \"question\": \"\\\"Welche Auszeichnung hat die Wartburg 1999 erhalte...\", \"sentence\": \"\\\"Zum UNESCO-Welterbe in Th\\\\u00fcringen geh\\\\u00f6re...\", \"paragraph\": \"\\\"Th\\\\u00fcringen\\\\n\\\\n== Kultur ==\\\\nDie Kulturlandsch...\", \"sentence_answer\": \"\\\"Zum UNESCO-Welterbe in Th\\\\u00fcringen g...\", \"paragraph_answer\": \"\\\"Th\\\\u00fcringen == Kultur == Die Kulturlandschaft ...\", \"paragraph_sentence\": \"\\\"Th\\\\u00fcringen == Kultur = = Die Kulturlandschaft...\", \"paragraph_id\": \"\\\"47512\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[GermanSQuAD](https://huggingface.co/datasets/deepset/germanquad) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_dequad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:deepset/germanquad", "language:de", "question-generation"], "is_gated": false}, "JeremyAlain/123_test": {"dataset_name": "JeremyAlain/123_test", "description": "The Fewshot Table dataset consists of tables that naturally occur on the web, that are formatted 
as few-shot tasks for fine-tuning language models to improve their few-shot performance. The dataset consists of approximately 413K tables that are extracted from the WDC Web Table Corpora 2015, which is released under the Apache-2.0 license. The WDC Web Table Corpora \"contains vast amounts of HTML tables. [...] The Web Data Commons project extracts relational Web tables from the Common Crawl, the largest and most up-to-date Web corpus that is currently available to the public.\"", "downloads": 716, "configs": {"data_0": {"config_name": "data_0", "sample_row": "{\"task\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"input\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"output\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"options\": \"[[\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [...\", \"pageTitle\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"outputColName\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"url\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"wdcFile\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\"}", "columns": ["task", "input", "output", "options", "pageTitle", "outputColName", "url", "wdcFile"], "columns_mapping": {"task": "task", "input": "input", "output": "output", "options": "options", "pageTitle": "pageTitle", "outputColName": "outputColName", "url": "url", "wdcFile": "wdcFile"}, "dataset_description": "The Fewshot Table dataset consists of tables that naturally occur on the web, that are formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. The dataset consists of approximately 413K tables that are extracted from the WDC Web Table Corpora 2015, which is released under the Apache-2.0 license. The WDC Web Table Corpora \"contains vast amounts of HTML tables. [...] 
The Web Data Commons project extracts relational Web tables from the Common Crawl, the largest and most up-to-date Web corpus that is currently available to the public.\"\n", "dataset_name": "JeremyAlain/123_test"}, "data_1": {"config_name": "data_1", "sample_row": "{\"task\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"input\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"output\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"options\": \"[[\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [...\", \"pageTitle\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"outputColName\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"url\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"wdcFile\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\"}", "columns": ["task", "input", "output", "options", "pageTitle", "outputColName", "url", "wdcFile"], "columns_mapping": {"task": "task", "input": "input", "output": "output", "options": "options", "pageTitle": "pageTitle", "outputColName": "outputColName", "url": "url", "wdcFile": "wdcFile"}, "dataset_description": "The Fewshot Table dataset consists of tables that naturally occur on the web, that are formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. The dataset consists of approximately 413K tables that are extracted from the WDC Web Table Corpora 2015, which is released under the Apache-2.0 license. The WDC Web Table Corpora \"contains vast amounts of HTML tables. [...] 
The Web Data Commons project extracts relational Web tables from the Common Crawl, the largest and most up-to-date Web corpus that is currently available to the public.\"\n", "dataset_name": "JeremyAlain/123_test"}, "data_2": {"config_name": "data_2", "sample_row": "{\"task\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"input\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"output\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"options\": \"[[\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [...\", \"pageTitle\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"outputColName\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"url\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"wdcFile\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\"}", "columns": ["task", "input", "output", "options", "pageTitle", "outputColName", "url", "wdcFile"], "columns_mapping": {"task": "task", "input": "input", "output": "output", "options": "options", "pageTitle": "pageTitle", "outputColName": "outputColName", "url": "url", "wdcFile": "wdcFile"}, "dataset_description": "The Fewshot Table dataset consists of tables that naturally occur on the web, that are formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. The dataset consists of approximately 413K tables that are extracted from the WDC Web Table Corpora 2015, which is released under the Apache-2.0 license. The WDC Web Table Corpora \"contains vast amounts of HTML tables. [...] 
The Web Data Commons project extracts relational Web tables from the Common Crawl, the largest and most up-to-date Web corpus that is currently available to the public.\"\n", "dataset_name": "JeremyAlain/123_test"}}, "tags": ["task_categories:multiple-choice", "task_categories:question-answering", "task_categories:zero-shot-classification", "task_categories:text2text-generation", "task_categories:table-question-answering", "task_categories:text-generation", "task_categories:text-classification", "task_categories:tabular-classification", "task_ids:multiple-choice-qa", "task_ids:extractive-qa", "task_ids:open-domain-qa", "task_ids:closed-domain-qa", "task_ids:closed-book-qa", "task_ids:open-book-qa", "task_ids:language-modeling", "task_ids:multi-class-classification", "task_ids:natural-language-inference", "task_ids:topic-classification", "task_ids:multi-label-classification", "task_ids:tabular-multi-class-classification", "task_ids:tabular-multi-label-classification", "annotations_creators:no-annotation", "multilinguality:monolingual", "language:en"], "is_gated": false}, "sst2": {"dataset_name": "sst2", "description": "The Stanford Sentiment Treebank consists of sentences from movie reviews and\nhuman annotations of their sentiment. The task is to predict the sentiment of a\ngiven sentence. We use the two-way (positive/negative) class split, and use only\nsentence-level labels.", "downloads": 59683, "configs": {"default": {"config_name": "default", "sample_row": "{\"idx\": \"0\", \"sentence\": \"\\\"hide new secretions from the parental units \\\"\", \"label\": \"0\"}", "columns": ["idx", "sentence", "label"], "columns_mapping": {"idx": "idx", "sentence": "sentence", "label": "label"}, "dataset_description": "The Stanford Sentiment Treebank consists of sentences from movie reviews and\nhuman annotations of their sentiment. The task is to predict the sentiment of a\ngiven sentence. 
We use the two-way (positive/negative) class split, and use only\nsentence-level labels.\n", "dataset_name": "sst2"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "taskydata/tasky_or_not": {"dataset_name": "taskydata/tasky_or_not", "description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. 
\nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.", "downloads": 92, "configs": {"10xp3_10xc4": {"config_name": "10xp3_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. 
\nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3ni_10xc4": {"config_name": "10xp3ni_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. 
\nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirst_10xc4": {"config_name": "10xp3nirst_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. 
\nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirstbb_10xc4": {"config_name": "10xp3nirstbb_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. 
\nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirstbbflan_10xc4": {"config_name": "10xp3nirstbbflan_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. 
\nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirstbbflanse_10xc4": {"config_name": "10xp3nirstbbflanse_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. 
\nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirstbbflanseuni_10xc4": {"config_name": "10xp3nirstbbflanseuni_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. 
\nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirstbbflanse_5xc4": {"config_name": "10xp3nirstbbflanse_5xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. 
\nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "v_1": {"config_name": "v_1", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. 
\nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}}, "tags": ["task_categories:text-classification", "language:en"], "is_gated": false}, "codeparrot/apps": {"dataset_name": "codeparrot/apps", "description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.", "downloads": 23087, "configs": {"all": {"config_name": "all", "sample_row": "{\"problem_id\": \"0\", \"question\": \"\\\"Polycarp has $n$ different binary words. A word c...\", \"solutions\": \"\\\"[\\\\\\\"for _ in range(int(input())):\\\\\\\\n n = int(in...\", \"input_output\": \"\\\"{\\\\n \\\\\\\"inputs\\\\\\\": [\\\\n \\\\\\\"4\\\\\\\\n4\\\\\\\\n0001\\\\\\\\n1000\\\\\\\\n0...\", \"difficulty\": \"\\\"interview\\\"\", \"url\": \"\\\"https://codeforces.com/problemset/problem/1259/D\\\"...\", \"starter_code\": \"\\\"\\\"\"}", "columns": ["problem_id", "question", "solutions", "input_output", "difficulty", "url", "starter_code"], "columns_mapping": {"problem_id": "problem_id", "question": "question", "solutions": "solutions", "input_output": "input_output", "difficulty": "difficulty", "url": "url", "starter_code": "starter_code"}, "dataset_description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.\n", "dataset_name": "codeparrot/apps"}, "introductory": {"config_name": "introductory", "sample_row": "{\"problem_id\": \"2361\", \"question\": \"\\\"You are given an array $a$ of length $n$ consisti...\", \"solutions\": \"\\\"[\\\\\\\"from collections 
import defaultdict as dd\\\\\\\\nfr...\", \"input_output\": \"\\\"{\\\\\\\"inputs\\\\\\\": [\\\\\\\"6\\\\\\\\n1\\\\\\\\n2\\\\\\\\n3\\\\\\\\n4\\\\\\\\n5\\\\\\\\n6\\\\\\\\n\\\\\\\"], ...\", \"difficulty\": \"\\\"introductory\\\"\", \"url\": \"\\\"https://codeforces.com/problemset/problem/1353/D\\\"...\", \"starter_code\": \"\\\"\\\"\"}", "columns": ["problem_id", "question", "solutions", "input_output", "difficulty", "url", "starter_code"], "columns_mapping": {"problem_id": "problem_id", "question": "question", "solutions": "solutions", "input_output": "input_output", "difficulty": "difficulty", "url": "url", "starter_code": "starter_code"}, "dataset_description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.\n", "dataset_name": "codeparrot/apps"}, "interview": {"config_name": "interview", "sample_row": "{\"problem_id\": \"0\", \"question\": \"\\\"Polycarp has $n$ different binary words. 
A word c...\", \"solutions\": \"\\\"[\\\\\\\"for _ in range(int(input())):\\\\\\\\n n = int(in...\", \"input_output\": \"\\\"{\\\\n \\\\\\\"inputs\\\\\\\": [\\\\n \\\\\\\"4\\\\\\\\n4\\\\\\\\n0001\\\\\\\\n1000\\\\\\\\n0...\", \"difficulty\": \"\\\"interview\\\"\", \"url\": \"\\\"https://codeforces.com/problemset/problem/1259/D\\\"...\", \"starter_code\": \"\\\"\\\"\"}", "columns": ["problem_id", "question", "solutions", "input_output", "difficulty", "url", "starter_code"], "columns_mapping": {"problem_id": "problem_id", "question": "question", "solutions": "solutions", "input_output": "input_output", "difficulty": "difficulty", "url": "url", "starter_code": "starter_code"}, "dataset_description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.\n", "dataset_name": "codeparrot/apps"}, "competition": {"config_name": "competition", "sample_row": "{\"problem_id\": \"2000\", \"question\": \"\\\"Codefortia is a small island country located some...\", \"solutions\": \"\\\"[\\\\\\\"import heapq\\\\\\\\nn,m,a,b=map(int,input().split()...\", \"input_output\": \"\\\"{\\\\n \\\\\\\"inputs\\\\\\\": [\\\\n \\\\\\\"5 5 20 25\\\\\\\\n1 2 25\\\\\\\\n2 ...\", \"difficulty\": \"\\\"competition\\\"\", \"url\": \"\\\"https://codeforces.com/problemset/problem/1149/D\\\"...\", \"starter_code\": \"\\\"\\\"\"}", "columns": ["problem_id", "question", "solutions", "input_output", "difficulty", "url", "starter_code"], "columns_mapping": {"problem_id": "problem_id", "question": "question", "solutions": "solutions", "input_output": "input_output", "difficulty": "difficulty", "url": "url", "starter_code": "starter_code"}, "dataset_description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial 
algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.\n", "dataset_name": "codeparrot/apps"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "language:code"], "is_gated": false}, "vesteinn/sosialurin-faroese-pos": {"dataset_name": "vesteinn/sosialurin-faroese-pos", "description": "The corpus that has been created consists of ca. 100.000 words of text from the [Faroese] newspaper Sosialurin. Each word is tagged with grammatical information (word class, gender, number etc.)", "downloads": 44, "configs": {"sosialurin-faroese-pos": {"config_name": "sosialurin-faroese-pos", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Gu\\\\u00f0ri\\\\u00f0\\\", \\\"Poulsen\\\", \\\"\\\\u00ed\\\", \\\"Riberh\\\\...\", \"pos_tags\": \"[277, 327, 111, 318]\"}", "columns": ["id", "tokens", "pos_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags"}, "dataset_description": "The corpus that has been created consists of ca. 100.000 words of text from the [Faroese] newspaper Sosialurin. 
Each word is tagged with grammatical information (word class, gender, number etc.)\n", "dataset_name": "vesteinn/sosialurin-faroese-pos"}}, "tags": [], "is_gated": false}, "vadis/sv-ident": {"dataset_name": "vadis/sv-ident", "description": "The SV-Ident corpus (version 0.3) is a collection of 4,248 expert-annotated English\nand German sentences from social science publications, supporting the task of\nmulti-label text classification.", "downloads": 15, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence\": \"\\\"After the Fukushima nuclear power plant accident,...\", \"is_variable\": \"0\", \"variable\": \"[]\", \"research_data\": \"[]\", \"doc_id\": \"\\\"61806\\\"\", \"uuid\": \"\\\"a08ee188-e5d0-491b-861d-17d3ee5990fd\\\"\", \"lang\": \"\\\"en\\\"\"}", "columns": ["sentence", "is_variable", "variable", "research_data", "doc_id", "uuid", "lang"], "columns_mapping": {"sentence": "sentence", "is_variable": "is_variable", "variable": "variable", "research_data": "research_data", "doc_id": "doc_id", "uuid": "uuid", "lang": "lang"}, "dataset_description": "The SV-Ident corpus (version 0.3) is a collection of 4,248 expert-annotated English\nand German sentences from social science publications, supporting the task of\nmulti-label text classification.\n", "dataset_name": "vadis/sv-ident"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "task_ids:semantic-similarity-classification", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:de"], "is_gated": false}, "launch/open_question_type": {"dataset_name": "launch/open_question_type", "description": "Open-ended question type annotated dataset.", "downloads": 46, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"1000491\\\"\", \"question\": \"\\\"When two bacteria exchange genetic information, w...\", \"annotator1\": \"[\\\"concept\\\", null]\", 
\"annotator2\": \"[\\\"concept\\\", null]\", \"resolve_type\": \"\\\"concept\\\"\"}", "columns": ["id", "question", "annotator1", "annotator2", "resolve_type"], "columns_mapping": {"id": "id", "question": "question", "annotator1": "annotator1", "annotator2": "annotator2", "resolve_type": "resolve_type"}, "dataset_description": "Open-ended question type annotated dataset.\n", "dataset_name": "launch/open_question_type"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/scitail": {"dataset_name": "bigbio/scitail", "description": "The SciTail dataset is an entailment dataset created from multiple-choice science exams and\nweb sentences. Each question and the correct answer choice are converted into an assertive\nstatement to form the hypothesis. We use information retrieval to obtain relevant text from\na large text corpus of web sentences, and use these sentences as a premise P. We crowdsource\nthe annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order\nto create the SciTail dataset. The dataset contains 27,026 examples with 10,101 examples with\nentails label and 16,925 examples with neutral label.", "downloads": 74, "configs": {"scitail_source": {"config_name": "scitail_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"premise\": \"\\\"Pluto rotates once on its axis every 6.39 Earth d...\", \"hypothesis\": \"\\\"Earth rotates on its axis once times in one day.\\\"...\", \"label\": \"\\\"neutral\\\"\"}", "columns": ["id", "premise", "hypothesis", "label"], "columns_mapping": {"id": "id", "premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The SciTail dataset is an entailment dataset created from multiple-choice science exams and\nweb sentences. Each question and the correct answer choice are converted into an assertive\nstatement to form the hypothesis. 
We use information retrieval to obtain relevant text from\na large text corpus of web sentences, and use these sentences as a premise P. We crowdsource\nthe annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order\nto create the SciTail dataset. The dataset contains 27,026 examples with 10,101 examples with\nentails label and 16,925 examples with neutral label.\n", "dataset_name": "bigbio/scitail"}, "scitail_bigbio_te": {"config_name": "scitail_bigbio_te", "sample_row": "{\"id\": \"\\\"0\\\"\", \"premise\": \"\\\"Pluto rotates once on its axis every 6.39 Earth d...\", \"hypothesis\": \"\\\"Earth rotates on its axis once times in one day.\\\"...\", \"label\": \"\\\"neutral\\\"\"}", "columns": ["id", "premise", "hypothesis", "label"], "columns_mapping": {"id": "id", "premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The SciTail dataset is an entailment dataset created from multiple-choice science exams and\nweb sentences. Each question and the correct answer choice are converted into an assertive\nstatement to form the hypothesis. We use information retrieval to obtain relevant text from\na large text corpus of web sentences, and use these sentences as a premise P. We crowdsource\nthe annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order\nto create the SciTail dataset. The dataset contains 27,026 examples with 10,101 examples with\nentails label and 16,925 examples with neutral label.\n", "dataset_name": "bigbio/scitail"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "MicPie/unpredictable_mmo-champion-com": {"dataset_name": "MicPie/unpredictable_mmo-champion-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. 
For more details please see the accompanying dataset card.", "downloads": 16, "configs": {"default": {"config_name": "default", "sample_row": "{\"task\": \"\\\"de717468_on_Strategy__Loot__Discussions__Type\\\"\", \"input\": \"\\\"[Level] 397 [Spec] Tank [Slot] Finger [Name] Hard...\", \"output\": \"\\\"Finger\\\"\", \"options\": \"[[\\\"F\\\", \\\"i\\\", \\\"n\\\", \\\"g\\\", \\\"e\\\", \\\"r\\\"], [\\\"T\\\", \\\"r\\\", \\\"i\\\", \\\"...\", \"pageTitle\": \"\\\"Ultraxion Strategy, Loot, Discussions\\\"\", \"outputColName\": \"\\\"Type\\\"\", \"url\": \"\\\"http://www.mmo-champion.com/threads/1026785-Ultra...\", \"wdcFile\": \"\\\"36/1438042989443.69_20150728002309-00296-ip-10-23...\"}", "columns": ["task", "input", "output", "options", "pageTitle", "outputColName", "url", "wdcFile"], "columns_mapping": {"task": "task", "input": "input", "output": "output", "options": "options", "pageTitle": "pageTitle", "outputColName": "outputColName", "url": "url", "wdcFile": "wdcFile"}, "dataset_description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. 
For more details please see the accompanying dataset card.\n", "dataset_name": "MicPie/unpredictable_mmo-champion-com"}}, "tags": ["task_categories:multiple-choice", "task_categories:question-answering", "task_categories:zero-shot-classification", "task_categories:text2text-generation", "task_categories:table-question-answering", "task_categories:text-generation", "task_categories:text-classification", "task_categories:tabular-classification", "task_ids:multiple-choice-qa", "task_ids:extractive-qa", "task_ids:open-domain-qa", "task_ids:closed-domain-qa", "task_ids:closed-book-qa", "task_ids:open-book-qa", "task_ids:language-modeling", "task_ids:multi-class-classification", "task_ids:natural-language-inference", "task_ids:topic-classification", "task_ids:multi-label-classification", "task_ids:tabular-multi-class-classification", "task_ids:tabular-multi-label-classification", "annotations_creators:no-annotation", "multilinguality:monolingual", "language:en"], "is_gated": false}, "BDas/Turkish-Dataset": {"dataset_name": "BDas/Turkish-Dataset", "description": "The dataset, prepared in Turkish, includes 53.000 tests, 53.000 validations and 160600 train data.\nThe data is composed of customer comments and created from e-commerce sites.", "downloads": 18, "configs": {"TurkishData": {"config_name": "TurkishData", "sample_row": "{\"text\": \"\\\"\\\\ufeffevimizde bulunan beko marka klima sogutma v...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The dataset, prepared in Turkish, includes 53.000 tests, 53.000 validations and 160600 train data.\nThe data is composed of customer comments and created from e-commerce sites.\n", "dataset_name": "BDas/Turkish-Dataset"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", 
"language:tr"], "is_gated": false}, "SocialGrep/one-year-of-tsla-on-reddit": {"dataset_name": "SocialGrep/one-year-of-tsla-on-reddit", "description": "This dataset contains all the posts and comments mentioning the term \"TSLA\", spanning from July 5th, 2021 to July 4th, 2022.", "downloads": 10, "configs": {"posts": {"config_name": "posts", "sample_row": "{\"type\": \"\\\"post\\\"\", \"id\": \"\\\"vrkdvj\\\"\", \"subreddit.id\": \"\\\"4430vb\\\"\", \"subreddit.name\": \"\\\"ultraalgo\\\"\", \"subreddit.nsfw\": \"false\", \"created_utc\": \"1656977609\", \"permalink\": \"\\\"https://old.reddit.com/r/UltraAlgo/comments/vrkdv...\", \"domain\": \"\\\"pbs.twimg.com\\\"\", \"url\": \"\\\"http://pbs.twimg.com/media/FW2_yU1WQAEiRgC.jpg\\\"...\", \"selftext\": \"\\\"\\\"\", \"title\": \"\\\"$TSLA $1038 net profit across 11 trades. 90% Accu...\", \"score\": \"2\"}", "columns": ["type", "id", "subreddit_id", "subreddit_name", "subreddit_nsfw", "created_utc", "permalink", "domain", "url", "selftext", "title", "score"], "columns_mapping": {"type": "type", "id": "id", "subreddit.id": "subreddit_id", "subreddit.name": "subreddit_name", "subreddit.nsfw": "subreddit_nsfw", "created_utc": "created_utc", "permalink": "permalink", "domain": "domain", "url": "url", "selftext": "selftext", "title": "title", "score": "score"}, "dataset_description": "This dataset contains all the posts and comments mentioning the term \"TSLA\", spanning from July 5th, 2021 to July 4th, 2022.\n", "dataset_name": "SocialGrep/one-year-of-tsla-on-reddit"}, "comments": {"config_name": "comments", "sample_row": "{\"type\": \"1\", \"id\": \"\\\"ievql0n\\\"\", \"subreddit.id\": \"\\\"2rndg\\\"\", \"subreddit.name\": \"\\\"valueinvesting\\\"\", \"subreddit.nsfw\": \"false\", \"created_utc\": \"1656978625\", \"permalink\": \"\\\"https://old.reddit.com/r/ValueInvesting/comments/...\", \"body\": \"\\\"When TSLA was at 1K share price :\\\\n\\\\n8B sales/1B ...\", \"sentiment\": \"0.296\", \"score\": \"1\"}", 
"columns": ["type", "id", "subreddit_id", "subreddit_name", "subreddit_nsfw", "created_utc", "permalink", "body", "sentiment", "score"], "columns_mapping": {"type": "type", "id": "id", "subreddit.id": "subreddit_id", "subreddit.name": "subreddit_name", "subreddit.nsfw": "subreddit_nsfw", "created_utc": "created_utc", "permalink": "permalink", "body": "body", "sentiment": "sentiment", "score": "score"}, "dataset_description": "This dataset contains all the posts and comments mentioning the term \"TSLA\", spanning from July 5th, 2021 to July 4th, 2022.\n", "dataset_name": "SocialGrep/one-year-of-tsla-on-reddit"}}, "tags": ["annotations_creators:lexyr", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "Heriot-WattUniversity/dialog_babi": {"dataset_name": "Heriot-WattUniversity/dialog_babi", "description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. 
For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.", "downloads": 13, "configs": {"task1-API-calls": {"config_name": "task1-API-calls", "sample_row": "{\"user_turns\": \"[\\\"hi\\\", \\\"can you book a table\\\", \\\"\\\", \\\"i lov...\", \"system_turns\": \"[\\\"hello what can i help you with today\\\", \\\"i'm on i...\"}", "columns": ["user_turns", "system_turns"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}, "task2-API-refine": {"config_name": "task2-API-refine", "sample_row": "{\"user_turns\": \"[\\\"hello\\\", \\\"can you make a restaurant reservation w...\", \"system_turns\": \"[\\\"hello what can i help you with today\\\", \\\"i'm on i...\"}", "columns": ["user_turns", "system_turns"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. 
Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}, "task3-options": {"config_name": "task3-options", "sample_row": "{\"user_turns\": \"[\\\"good morning\\\", \\\"may i have a table\\\", \\\"\\\"...\", \"system_turns\": \"[\\\"hello what can i help you with today\\\", \\\"i'm on i...\", \"kb_facts.turn_id\": \"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...\", \"kb_facts.fact\": \"[\\\"1 resto_rome_cheap_indian_6stars R_phone resto_r...\"}", "columns": ["user_turns", "system_turns", "kb_facts_turn_id", "kb_facts_fact"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns", "kb_facts.turn_id": "kb_facts_turn_id", "kb_facts.fact": "kb_facts_fact"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. 
For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}, "task4-phone-address": {"config_name": "task4-phone-address", "sample_row": "{\"user_turns\": \"[\\\"hi\\\", \\\"can you make a restaurant reservation at r...\", \"system_turns\": \"[\\\"hello what can i help you with today\\\", \\\"great le...\", \"kb_facts.turn_id\": \"[0, 1, 2, 3, 4, 5, 6]\", \"kb_facts.fact\": \"[\\\"1 resto_rome_moderate_spanish_1stars R_phone res...\"}", "columns": ["user_turns", "system_turns", "kb_facts_turn_id", "kb_facts_fact"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns", "kb_facts.turn_id": "kb_facts_turn_id", "kb_facts.fact": "kb_facts_fact"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. 
For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}, "task5-full-dialogs": {"config_name": "task5-full-dialogs", "sample_row": "{\"user_turns\": \"[\\\"good morning\\\", \\\"i'd like to book a table with it...\", \"system_turns\": \"[\\\"hello what can i help you with today\\\", \\\"i'm on i...\", \"kb_facts.turn_id\": \"[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 2...\", \"kb_facts.fact\": \"[\\\"13 resto_madrid_cheap_spanish_1stars R_phone res...\"}", "columns": ["user_turns", "system_turns", "kb_facts_turn_id", "kb_facts_fact"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns", "kb_facts.turn_id": "kb_facts_turn_id", "kb_facts.fact": "kb_facts_fact"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. 
For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}, "task6-dstc2": {"config_name": "task6-dstc2", "sample_row": "{\"user_turns\": \"[\\\"\\\", \\\"i want a moderately priced restaura...\", \"system_turns\": \"[\\\"Hello , welcome to the Cambridge restaurant syst...\"}", "columns": ["user_turns", "system_turns"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}}, "tags": [], "is_gated": false}, "chenz16/curriculum_benchmark": {"dataset_name": "chenz16/curriculum_benchmark", "description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.", "downloads": 63, "configs": {"analytic": {"config_name": "analytic", "sample_row": "{\"premise\": \"\\\"Exactly six trade representatives negotiate a tre...\", \"hypothesis\": \"\\\"Klosnik, Londi, Manley, Poirier, Neri, Osata\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "defeasible": {"config_name": "defeasible", "sample_row": "{\"premise\": \"\\\"PersonX finds a kitten ; PersonX works for a cat ...\", \"hypothesis\": \"\\\"As a result, PersonX feels compassionate\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"not-entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. 
\nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "boolean": {"config_name": "boolean", "sample_row": "{\"premise\": \"\\\"Dustin, Milton, Louis, Bill, Roland, Dean, Tim, M...\", \"hypothesis\": \"\\\"Bill didn't visit Ecuador\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"contradiction\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "comparative": {"config_name": "comparative", "sample_row": "{\"premise\": \"\\\"Morris is as tall as Derek , Derek is as tall as ...\", \"hypothesis\": \"\\\"Angel is taller than Morris\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"contradiction\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "conditional": {"config_name": "conditional", "sample_row": "{\"premise\": \"\\\"Raul has not visited Moline, Anthony has not visi...\", \"hypothesis\": \"\\\"Louis has not visited Mundelein\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailment\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. 
\nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "context_align": {"config_name": "context_align", "sample_row": "{\"premise\": \"\\\"the nails were something\\\"\", \"hypothesis\": \"\\\"'something' here should be 'flatten the ends (of ...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "control": {"config_name": "control", "sample_row": "{\"premise\": \"\\\"100 Years of the Western Workplace Conditions in ...\", \"hypothesis\": \"\\\"Improvements in medicine led to workers earning m...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"neutral\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "coreference": {"config_name": "coreference", "sample_row": "{\"premise\": \"\\\"Ian volunteered to eat Dennis's menudo after alre...\", \"hypothesis\": \"\\\"Ian despised eating intestine.\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"not-entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. 
\nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "cosmoqa": {"config_name": "cosmoqa", "sample_row": "{\"premise\": \"\\\"Good Old War and person L : I saw both of these b...\", \"hypothesis\": \"\\\"This person likes music and likes to see the show...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "counterfactual": {"config_name": "counterfactual", "sample_row": "{\"premise\": \"\\\" if the stimulus bill had become hamstrung ...\", \"hypothesis\": \"\\\" should be \\\\\\\"I don't think any of us---even...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "counting": {"config_name": "counting", "sample_row": "{\"premise\": \"\\\"Troy has visited Djibouti, France, Senegal, Argen...\", \"hypothesis\": \"\\\"Troy has visited less than thirty-five places\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailment\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. 
\nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "drop": {"config_name": "drop", "sample_row": "{\"premise\": \"\\\"To start the season, the Lions traveled south to ...\", \"hypothesis\": \"\\\"3 points did the buccaneers need to tie in the fi...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "entailment_tree": {"config_name": "entailment_tree", "sample_row": "{\"premise\": \"\\\"leo is a kind of constellation sent2: the earth r...\", \"hypothesis\": \"\\\"the earth revolving around the sun causes leo to ...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "ester": {"config_name": "ester", "sample_row": "{\"premise\": \"\\\"A senior researcher with the State Council, or th...\", \"hypothesis\": \"\\\"promote urbanization\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. 
\nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "hellaswag": {"config_name": "hellaswag", "sample_row": "{\"premise\": \"\\\"[header] How to treat your girlfriend like a prin...\", \"hypothesis\": \"\\\"[substeps] Your girlfriend should be more than an...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "hypernymy": {"config_name": "hypernymy", "sample_row": "{\"premise\": \"\\\"he disliked his neighbors\\\"\", \"hypothesis\": \"\\\", the word or phrase is best characterized as a ...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "hyponymy": {"config_name": "hyponymy", "sample_row": "{\"premise\": \"\\\"crochet a bedspread\\\"\", \"hypothesis\": \"\\\"a specific type of crochet is double crochet (or ...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. 
\nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "kg_relations": {"config_name": "kg_relations", "sample_row": "{\"premise\": \"\\\"Diplomats say Assad 's absence from the meeting a...\", \"hypothesis\": \"\\\"Assad was buried in Syria .\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"not-entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "lexical": {"config_name": "lexical", "sample_row": "{\"premise\": \"\\\"Gonorrhea means the presence of bacteria.\\\"\", \"hypothesis\": \"\\\"Gonorrhea is caused by bacteria.\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "logiqa": {"config_name": "logiqa", "sample_row": "{\"premise\": \"\\\"Some Cantonese don't like chili, so some southern...\", \"hypothesis\": \"\\\"All Cantonese are southerners\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. 
\nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "monotonicity_infer": {"config_name": "monotonicity_infer", "sample_row": "{\"premise\": \"\\\"Tom said that neither parents had ever been to Bo...\", \"hypothesis\": \"\\\"Tom said that neither one of his parents had ever...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailment\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "negation": {"config_name": "negation", "sample_row": "{\"premise\": \"\\\"Ted has only visited Bahrain, Terrence has only v...\", \"hypothesis\": \"\\\"Jessie didn't visit Rwanda\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailment\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "ner": {"config_name": "ner", "sample_row": "{\"premise\": \"\\\"The government urged Western and Arab nations to ...\", \"hypothesis\": \"\\\"Western is a person\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. 
\nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "physicalqa": {"config_name": "physicalqa", "sample_row": "{\"premise\": \"\\\"When boiling butter, when it's ready, you can\\\"\", \"hypothesis\": \"\\\"Pour it onto a plate\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"not-entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "puns": {"config_name": "puns", "sample_row": "{\"premise\": \"\\\"Michaela heard that the agreeable tennis umpire w...\", \"hypothesis\": \"\\\"Michaela heard a pun\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "quantifier": {"config_name": "quantifier", "sample_row": "{\"premise\": \"\\\"Everyone has visited every place\\\"\", \"hypothesis\": \"\\\"Floyd didn't visit Johnny\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"neutral\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "sentiment": {"config_name": "sentiment", "sample_row": "{\"premise\": \"\\\"When asked about the product, Eniyah said, 'I had...\", \"hypothesis\": \"\\\"Eniyah liked the product . \\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "socialqa": {"config_name": "socialqa", "sample_row": "{\"premise\": \"\\\"Cameron decided to have a barbecue and gathered h...\", \"hypothesis\": \"\\\"Others would feel like attending\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. 
\nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "spatial": {"config_name": "spatial", "sample_row": "{\"premise\": \"\\\"The triangle is above the pink rectangle. The blu...\", \"hypothesis\": \"\\\"The pink rectangle is to the right of the blue sq...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "sprl": {"config_name": "sprl", "sample_row": "{\"premise\": \"\\\"( Both took further hits yesterday . 
)\\\"\", \"hypothesis\": \"\\\"Further hits existed during the taking.\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "syntactic_alternation": {"config_name": "syntactic_alternation", "sample_row": "{\"premise\": \"\\\"michael passed the salt to the person across the ...\", \"hypothesis\": \"\\\"michael passed the person across the table the sa...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "syntactic_variation": {"config_name": "syntactic_variation", "sample_row": "{\"premise\": \"\\\"Amrozi accused his brother , whom he called \\\\\\\" th...\", \"hypothesis\": \"\\\"Referring to him as only \\\\\\\" the witness \\\\\\\" , Amro...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "temporal": {"config_name": "temporal", "sample_row": "{\"premise\": \"\\\" I was so nervous for my first day of school. \\\\\\\"W...\", \"hypothesis\": \"\\\" The teacher asked us to stop talking starts afte...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. 
\nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "transitive": {"config_name": "transitive", "sample_row": "{\"premise\": \"\\\"a particular person was n't blessed to have a par...\", \"hypothesis\": \"\\\"that person might or might not have had that thin...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "verbcorner": {"config_name": "verbcorner", "sample_row": "{\"premise\": \"\\\"Samantha enjoyed the blinch.\\\"\", \"hypothesis\": \"\\\"Something good happened .\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "verbnet": {"config_name": "verbnet", "sample_row": "{\"premise\": \"\\\"David constructed a house .\\\"\", \"hypothesis\": \"\\\"David caused the constructing .\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. 
\nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}}, "tags": [], "is_gated": false}, "biglam/atypical_animacy": {"dataset_name": "biglam/atypical_animacy", "description": "Atypical animacy detection dataset, based on nineteenth-century sentences in English extracted from an open dataset of nineteenth-century books digitized by the British Library (available via https://doi.org/10.21250/db14, British Library Labs, 2014). \nThis dataset contains 598 sentences containing mentions of machines. Each sentence has been annotated according to the animacy and humanness of the machine in the sentence.", "downloads": 20, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"002732647_02_180_7\\\"\", \"sentence\": \"\\\"Poetic RMS OF THE CITY OF MANCHESTEI legends migh...\", \"context\": \"\\\"That there was a Roman camp on Castlefield, with-...\", \"target\": \"\\\"engine\\\"\", \"animacy\": \"0.0\", \"humanness\": \"0.0\", \"offsets\": \"[134, 140]\", \"date\": \"\\\"1891\\\"\"}", "columns": ["id", "sentence", "context", "target", "animacy", "humanness", "offsets", "date"], "columns_mapping": {"id": "id", "sentence": "sentence", "context": "context", "target": "target", "animacy": "animacy", "humanness": "humanness", "offsets": "offsets", "date": "date"}, "dataset_description": "Atypical animacy detection dataset, based on nineteenth-century sentences in English extracted from an open dataset of nineteenth-century books digitized by the British Library (available via https://doi.org/10.21250/db14, British Library Labs, 2014). \nThis dataset contains 598 sentences containing mentions of machines. Each sentence has been annotated according to the animacy and humanness of the machine in the sentence. 
\n", "dataset_name": "biglam/atypical_animacy"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "task_ids:intent-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "codeparrot/xlcost-text-to-code": {"dataset_name": "codeparrot/xlcost-text-to-code", "description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "downloads": 634, "configs": {"Python-snippet-level": {"config_name": "Python-snippet-level", "sample_row": "{\"text\": \"\\\"Python3 implementation of the above approach\\\"\", \"code\": \"\\\"def maxPresum ( a , b ) : NEW_LINE\\\"\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Python-program-level": {"config_name": "Python-program-level", "sample_row": "{\"text\": \"\\\"Maximum Prefix Sum possible by merging two given ...\", \"code\": \"\\\"def maxPresum ( a , b ) : NEW_LINE INDENT X = max...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "C-snippet-level": {"config_name": "C-snippet-level", "sample_row": "{\"text\": \"\\\"C program for above approach\\\"\", \"code\": \"\\\"#include \\\"\"}", "columns": ["text", "code"], 
"columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "C-program-level": {"config_name": "C-program-level", "sample_row": "{\"text\": \"\\\"Minimum number of coins having value equal to pow...\", \"code\": \"\\\"#include NEW_LINE void count_setbit ( i...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Java-snippet-level": {"config_name": "Java-snippet-level", "sample_row": "{\"text\": \"\\\"Java Program to implement the above approach\\\"\", \"code\": \"\\\"import java . util . * ; class GFG { static int m...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Java-program-level": {"config_name": "Java-program-level", "sample_row": "{\"text\": \"\\\"Maximum Prefix Sum possible by merging two given ...\", \"code\": \"\\\"import java . util . 
* ; class GFG { static int m...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Javascript-snippet-level": {"config_name": "Javascript-snippet-level", "sample_row": "{\"text\": \"\\\"Javascript Program to implement the above approac...\", \"code\": \"\\\"function maxPresum ( a , b ) {\\\"\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Javascript-program-level": {"config_name": "Javascript-program-level", "sample_row": "{\"text\": \"\\\"Maximum Prefix Sum possible by merging two given ...\", \"code\": \"\\\"function maxPresum ( a , b ) { let X = Math . max...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Csharp-snippet-level": {"config_name": "Csharp-snippet-level", "sample_row": "{\"text\": \"\\\"C # Program to implement the above approach\\\"\", \"code\": \"\\\"using System ; using System . Collections . 
Gener...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Csharp-program-level": {"config_name": "Csharp-program-level", "sample_row": "{\"text\": \"\\\"Maximum Prefix Sum possible by merging two given ...\", \"code\": \"\\\"using System ; using System . Collections . Gener...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "C++-snippet-level": {"config_name": "C++-snippet-level", "sample_row": "{\"text\": \"\\\"C ++ Program to implement the above approach\\\"\", \"code\": \"\\\"#include NEW_LINE using namespace...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "C++-program-level": {"config_name": "C++-program-level", "sample_row": "{\"text\": \"\\\"Maximum Prefix Sum possible by merging two given ...\", \"code\": \"\\\"#include NEW_LINE using namespace...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, 
C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "PHP-snippet-level": {"config_name": "PHP-snippet-level", "sample_row": "{\"text\": \"\\\"Function that returns true if the number represen...\", \"code\": \"\\\"< ? php function isEven ( $ arr , $ n , $ r ) {\\\"...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "PHP-program-level": {"config_name": "PHP-program-level", "sample_row": "{\"text\": \"\\\"Check if the number is even or odd whose digits a...\", \"code\": \"\\\"< ? php function isEven ( $ arr , $ n , $ r ) { i...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:multilingual", "language:code"], "is_gated": false}, "demelin/moral_stories": {"dataset_name": "demelin/moral_stories", "description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.", "downloads": 299, "configs": {"full": {"config_name": "full", "sample_row": "{\"ID\": \"\\\"37TD41K0AI7TYQGNUFTSCYCNT25SCN\\\"\", \"norm\": \"\\\"It's responsible to keep children safe.\\\"\", \"situation\": \"\\\"Kent was watching his kids playing in the backyar...\", \"intention\": \"\\\"Kent wants to add security to his back yard.\\\"\", \"moral_action\": \"\\\"Kent installs cameras around his yard to look for...\", \"moral_consequence\": \"\\\"Kent's kids feel much safer with the camera's wat...\", \"immoral_action\": \"\\\"Kent installs an electric fence around his yard t...\", \"immoral_consequence\": \"\\\"One of Kent's kids gets shocked by the fence when...\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action-lexical_bias": {"config_name": "cls-action-lexical_bias", "sample_row": "{\"ID\": \"\\\"3K4J6M3CXFR2F6AYF13KHP3W45NAGD1\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Joan ignores the text and focuses on enjoying her...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action-minimal_pairs": {"config_name": "cls-action-minimal_pairs", "sample_row": "{\"ID\": \"\\\"37XITHEISX7HXBGAFQTNMO7R7TSRCQ1\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Jenny agrees to go hiking with her husband and se...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action-norm_distance": {"config_name": "cls-action-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context-lexical_bias": {"config_name": "cls-action+context-lexical_bias", "sample_row": "{\"ID\": \"\\\"3K4J6M3CXFR2F6AYF13KHP3W45NAGD1\\\"\", \"norm\": \"\\\"It's rude to ditch a date for someone else.\\\"\", \"situation\": \"\\\"Joan is on a first date with Mitch when she gets ...\", \"intention\": \"\\\"Joan wants to have a fun night.\\\"\", \"moral_action\": \"\\\"Joan ignores the text and focuses on enjoying her...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context-minimal_pairs": {"config_name": "cls-action+context-minimal_pairs", "sample_row": "{\"ID\": \"\\\"37XITHEISX7HXBGAFQTNMO7R7TSRCQ1\\\"\", \"norm\": \"\\\"It's good to do activities together with your spo...\", \"situation\": \"\\\"Jenny's husband asks her to go hiking with him li...\", \"intention\": \"\\\"Jenny wants to tell her husband if she'll go hiki...\", \"moral_action\": \"\\\"Jenny agrees to go hiking with her husband and se...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context-norm_distance": {"config_name": "cls-action+context-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context+consequence-lexical_bias": {"config_name": "cls-action+context+consequence-lexical_bias", "sample_row": "{\"ID\": \"\\\"3K4J6M3CXFR2F6AYF13KHP3W45NAGD1\\\"\", \"norm\": \"\\\"It's rude to ditch a date for someone else.\\\"\", \"situation\": \"\\\"Joan is on a first date with Mitch when she gets ...\", \"intention\": \"\\\"Joan wants to have a fun night.\\\"\", \"moral_action\": \"\\\"Joan ignores the text and focuses on enjoying her...\", \"moral_consequence\": \"\\\"Mitch has a great time on his date with Joan and ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context+consequence-minimal_pairs": {"config_name": "cls-action+context+consequence-minimal_pairs", "sample_row": "{\"ID\": \"\\\"37XITHEISX7HXBGAFQTNMO7R7TSRCQ1\\\"\", \"norm\": \"\\\"It's good to do activities together with your spo...\", \"situation\": \"\\\"Jenny's husband asks her to go hiking with him li...\", \"intention\": \"\\\"Jenny wants to tell her husband if she'll go hiki...\", \"moral_action\": \"\\\"Jenny agrees to go hiking with her husband and se...\", \"moral_consequence\": \"\\\"Jenny's husband is happy that they still share ac...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context+consequence-norm_distance": {"config_name": "cls-action+context+consequence-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+norm-lexical_bias": {"config_name": "cls-action+norm-lexical_bias", "sample_row": "{\"ID\": \"\\\"3K4J6M3CXFR2F6AYF13KHP3W45NAGD1\\\"\", \"norm\": \"\\\"It's rude to ditch a date for someone else.\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Joan ignores the text and focuses on enjoying her...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+norm-minimal_pairs": {"config_name": "cls-action+norm-minimal_pairs", "sample_row": "{\"ID\": \"\\\"37XITHEISX7HXBGAFQTNMO7R7TSRCQ1\\\"\", \"norm\": \"\\\"It's good to do activities together with your spo...\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Jenny agrees to go hiking with her husband and se...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+norm-norm_distance": {"config_name": "cls-action+norm-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action-lexical_bias": {"config_name": "cls-consequence+action-lexical_bias", "sample_row": "{\"ID\": \"\\\"39L1G8WVWRP5R6LAO337NULKXIJ1341\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Jacob invites Martin to join him and his friends ...\", \"moral_consequence\": \"\\\"Martin feels included and welcome.\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action-minimal_pairs": {"config_name": "cls-consequence+action-minimal_pairs", "sample_row": "{\"ID\": \"\\\"3AAPLD8UCDFLUY4I57CJXU5P86UTHE1\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Keisha decides to grill dinner so her kids can ru...\", \"moral_consequence\": \"\\\"Keisha's kids get exercise and stay healthy.\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action-norm_distance": {"config_name": "cls-consequence+action-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action+context-lexical_bias": {"config_name": "cls-consequence+action+context-lexical_bias", "sample_row": "{\"ID\": \"\\\"39L1G8WVWRP5R6LAO337NULKXIJ1341\\\"\", \"norm\": \"\\\"It is kind to be welcoming to a new student\\\"\", \"situation\": \"\\\"It's lunchtime and Jacob is entering the cafeteri...\", \"intention\": \"\\\"Jacob wants to find a table in the cafeteria.\\\"\", \"moral_action\": \"\\\"Jacob invites Martin to join him and his friends ...\", \"moral_consequence\": \"\\\"Martin feels included and welcome.\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action+context-minimal_pairs": {"config_name": "cls-consequence+action+context-minimal_pairs", "sample_row": "{\"ID\": \"\\\"3AAPLD8UCDFLUY4I57CJXU5P86UTHE1\\\"\", \"norm\": \"\\\"It's good to have your kids get some fresh air.\\\"...\", \"situation\": \"\\\"Keisha is at home with her two children and is co...\", \"intention\": \"\\\"Keisha wants to keep her kids occupied.\\\"\", \"moral_action\": \"\\\"Keisha decides to grill dinner so her kids can ru...\", \"moral_consequence\": \"\\\"Keisha's kids get exercise and stay healthy.\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action+context-norm_distance": {"config_name": "cls-consequence+action+context-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-action$context-norm_distance": {"config_name": "gen-action$context-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-action$context+consequence-norm_distance": {"config_name": "gen-action$context+consequence-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-consequence$action-norm_distance": {"config_name": "gen-consequence$action-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-consequence$action+context-norm_distance": {"config_name": "gen-consequence$action+context-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-norm$actions-norm_distance": {"config_name": "gen-norm$actions-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"Phil decides he and his family need to spend less...\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-norm$actions+context-norm_distance": {"config_name": "gen-norm$actions+context-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"Phil decides he and his family need to spend less...\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-norm$actions+context+consequences-norm_distance": {"config_name": "gen-norm$actions+context+consequences-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"Phil decides he and his family need to spend less...\", \"immoral_consequence\": \"\\\"Phil manages to cut the water bill in half before...\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. 
For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}}, "tags": ["task_categories:multiple-choice", "task_categories:text-generation", "task_categories:text-classification", "task_ids:multiple-choice-qa", "task_ids:language-modeling", "task_ids:text-scoring", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ArthurBaia/squad_v1_pt_br": {"dataset_name": "ArthurBaia/squad_v1_pt_br", "description": "This dataset was translated by Deep Learning Brazil", "downloads": 101, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"5733be284776f41900661182\\\"\", \"title\": \"\\\"University_of_Notre_Dame\\\"\", \"context\": \"\\\"Arquitetonicamente, a escola tem um car\\\\u00e1ter ...\", \"question\": \"\\\"A quem a Virgem Maria supostamente apareceu em 18...\", \"answers.text\": \"[\\\"Santa Bernadette Soubirous\\\"]\", \"answers.answer_start\": \"[533]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "This dataset was translated by Deep Learning Brazil\n", "dataset_name": "ArthurBaia/squad_v1_pt_br"}}, "tags": [], "is_gated": false}, "nbroad/mediasum": {"dataset_name": "nbroad/mediasum", "description": "This large-scale media interview dataset contains 463.6K transcripts with abstractive summaries, \ncollected from interview transcripts and overview / topic descriptions from NPR and CNN.", "downloads": 16, "configs": {"mediasum": {"config_name": "mediasum", "sample_row": "{\"id\": \"\\\"NPR-1\\\"\", \"program\": \"\\\"News & Notes\\\"\", \"date\": \"\\\"2007-11-28\\\"\", \"url\": \"\\\"https://www.npr.org/templates/story/story.php?sto...\", \"title\": 
\"\\\"Black Actors Give Bible Star Appeal\\\"\", \"summary\": \"\\\"More than 400 black actors, artists and ministers...\", \"utt\": \"[\\\"Now, moving on, Forest Whitaker as Moses, Tisha ...\", \"speaker\": \"[\\\"FARAI CHIDEYA, host\\\", \\\"FARAI CHIDEYA, host\\\", \\\"Mr...\"}", "columns": ["id", "program", "date", "url", "title", "summary", "utt", "speaker"], "columns_mapping": {"id": "id", "program": "program", "date": "date", "url": "url", "title": "title", "summary": "summary", "utt": "utt", "speaker": "speaker"}, "dataset_description": "This large-scale media interview dataset contains 463.6K transcripts with abstractive summaries, \ncollected from interview transcripts and overview / topic descriptions from NPR and CNN.\n", "dataset_name": "nbroad/mediasum"}}, "tags": ["task_categories:summarization", "multilinguality:monolingual", "language:en"], "is_gated": false}, "tner/conll2003": {"dataset_name": "tner/conll2003", "description": "[CoNLL 2003 NER dataset](https://aclanthology.org/W03-0419/)", "downloads": 21, "configs": {"conll2003": {"config_name": "conll2003", "sample_row": "{\"tokens\": \"[\\\"EU\\\", \\\"rejects\\\", \\\"German\\\", \\\"call\\\", \\\"to\\\", \\\"boycott...\", \"tags\": \"[1, 0, 2, 0, 0, 0, 2, 0, 0]\"}", "columns": ["tokens", "tags"], "columns_mapping": {"tokens": "tokens", "tags": "tags"}, "dataset_description": "[CoNLL 2003 NER dataset](https://aclanthology.org/W03-0419/)", "dataset_name": "tner/conll2003"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "multilinguality:monolingual", "language:en"], "is_gated": false}, "tner/wnut2017": {"dataset_name": "tner/wnut2017", "description": "[WNUT 2017 NER dataset](https://aclanthology.org/W17-4418/)", "downloads": 84, "configs": {"wnut2017": {"config_name": "wnut2017", "sample_row": "{\"tokens\": \"[\\\"@paulwalk\\\", \\\"It\\\", \\\"'s\\\", \\\"the\\\", \\\"view\\\", \\\"from\\\", \\\"...\", \"tags\": \"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 
12, 12, 1...\"}", "columns": ["tokens", "tags"], "columns_mapping": {"tokens": "tokens", "tags": "tags"}, "dataset_description": "[WNUT 2017 NER dataset](https://aclanthology.org/W17-4418/)", "dataset_name": "tner/wnut2017"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "multilinguality:monolingual", "language:en"], "is_gated": false}, "tner/bc5cdr": {"dataset_name": "tner/bc5cdr", "description": "[Bio Creative 5 CDR NER dataset](https://academic.oup.com/database/article/doi/10.1093/database/baw032/2630271?login=true)", "downloads": 958, "configs": {"bc5cdr": {"config_name": "bc5cdr", "sample_row": "{\"tokens\": \"[\\\"Naloxone\\\", \\\"reverses\\\", \\\"the\\\", \\\"antihypertensive\\\"...\", \"tags\": \"[1, 0, 0, 0, 0, 0, 1, 0]\"}", "columns": ["tokens", "tags"], "columns_mapping": {"tokens": "tokens", "tags": "tags"}, "dataset_description": "[Bio Creative 5 CDR NER dataset](https://academic.oup.com/database/article/doi/10.1093/database/baw032/2630271?login=true)", "dataset_name": "tner/bc5cdr"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "multilinguality:monolingual", "language:en"], "is_gated": false}, "pyronear/openfire": {"dataset_name": "pyronear/openfire", "description": "OpenFire is an image classification dataset for wildfire detection, collected\nfrom web searches.", "downloads": 131, "configs": {"default": {"config_name": "default", "sample_row": "{\"image_url\": \"\\\"https://get.pxhere.com/photo/cloud-sky-atmosphere...\", \"is_wildfire\": \"false\"}", "columns": ["image_url", "is_wildfire"], "columns_mapping": {"image_url": "image_url", "is_wildfire": "is_wildfire"}, "dataset_description": "OpenFire is an image classification dataset for wildfire detection, collected\nfrom web searches.\n", "dataset_name": "pyronear/openfire"}}, "tags": ["task_categories:image-classification", "annotations_creators:crowdsourced", "source_datasets:original"], "is_gated": false}, 
"biglam/clmet_3_1": {"dataset_name": "biglam/clmet_3_1", "description": "The Corpus of Late Modern English Texts, version 3.1 (CLMET3.1) has been created by Hendrik De Smet, \nSusanne Flach, Hans-J\u00fcrgen Diller and Jukka Tyrkk\u00f6, as an offshoot of a bigger project developing a database of text \ndescriptors (Diller, De Smet & Tyrkk\u00f6 2011). CLMET3.1 is a principled collection of public domain texts drawn from \nvarious online archiving projects. This dataset can be used for part-of-speech tagging, NER and text classification", "downloads": 46, "configs": {"plain": {"config_name": "plain", "sample_row": "{\"text\": \"\\\"\\\\nA TREATISE Concerning the PRINCIPLES OF Human K...\", \"genre\": \"\\\"Treatise\\\"\", \"subgenre\": \"\\\"treat\\\"\", \"year\": \"\\\"1710\\\"\", \"quarter_cent\": \"\\\"1700-1724\\\"\", \"decade\": \"\\\"1710s\\\"\", \"title\": \"\\\"A treatise concerning the principles of human kno...\", \"author\": \"\\\"Berkeley, George\\\"\", \"notes\": \"\\\"\\\"\", \"comments\": \"\\\"\\\"\", \"period\": \"\\\"1710-1780\\\"\", \"id\": \"\\\"1\\\"\"}", "columns": ["text", "genre", "subgenre", "year", "quarter_cent", "decade", "title", "author", "notes", "comments", "period", "id"], "columns_mapping": {"text": "text", "genre": "genre", "subgenre": "subgenre", "year": "year", "quarter_cent": "quarter_cent", "decade": "decade", "title": "title", "author": "author", "notes": "notes", "comments": "comments", "period": "period", "id": "id"}, "dataset_description": "The Corpus of Late Modern English Texts, version 3.1 (CLMET3.1) has been created by Hendrik De Smet, \nSusanne Flach, Hans-J\u00fcrgen Diller and Jukka Tyrkk\u00f6, as an offshoot of a bigger project developing a database of text \ndescriptors (Diller, De Smet & Tyrkk\u00f6 2011). CLMET3.1 is a principled collection of public domain texts drawn from \nvarious online archiving projects. 
This dataset can be used for part-of-speech tagging, NER and text classification\n", "dataset_name": "biglam/clmet_3_1"}, "class": {"config_name": "class", "sample_row": "{\"text\": \"[\\\"A\\\", \\\"TREATISE\\\", \\\"Concerning\\\", \\\"the\\\", \\\"PRINCIPLES...\", \"pos_tags\": \"[2, 8, 11, 2, 8, 5, 8, 8, 7, 8, 6, 7, 1, 2, 0, 8, ...\", \"genre\": \"\\\"Treatise\\\"\", \"subgenre\": \"\\\"treat\\\"\", \"year\": \"\\\"1710\\\"\", \"quarter_cent\": \"\\\"1700-1724\\\"\", \"decade\": \"\\\"1710s\\\"\", \"title\": \"\\\"A treatise concerning the principles of human kno...\", \"author\": \"\\\"Berkeley, George\\\"\", \"notes\": \"\\\"\\\"\", \"comments\": \"\\\"\\\"\", \"period\": \"\\\"1710-1780\\\"\", \"id\": \"\\\"1\\\"\"}", "columns": ["text", "pos_tags", "genre", "subgenre", "year", "quarter_cent", "decade", "title", "author", "notes", "comments", "period", "id"], "columns_mapping": {"text": "text", "pos_tags": "pos_tags", "genre": "genre", "subgenre": "subgenre", "year": "year", "quarter_cent": "quarter_cent", "decade": "decade", "title": "title", "author": "author", "notes": "notes", "comments": "comments", "period": "period", "id": "id"}, "dataset_description": "The Corpus of Late Modern English Texts, version 3.1 (CLMET3.1) has been created by Hendrik De Smet, \nSusanne Flach, Hans-J\u00fcrgen Diller and Jukka Tyrkk\u00f6, as an offshoot of a bigger project developing a database of text \ndescriptors (Diller, De Smet & Tyrkk\u00f6 2011). CLMET3.1 is a principled collection of public domain texts drawn from \nvarious online archiving projects. 
This dataset can be used for part-of-speech tagging, NER and text classification\n", "dataset_name": "biglam/clmet_3_1"}, "pos": {"config_name": "pos", "sample_row": "{\"text\": \"[\\\"A\\\", \\\"TREATISE\\\", \\\"Concerning\\\", \\\"the\\\", \\\"PRINCIPLES...\", \"pos_tags\": \"[2, 10, 28, 2, 12, 5, 12, 12, 38, 10, 16, 38, 18, ...\", \"genre\": \"\\\"Treatise\\\"\", \"subgenre\": \"\\\"treat\\\"\", \"year\": \"\\\"1710\\\"\", \"quarter_cent\": \"\\\"1700-1724\\\"\", \"decade\": \"\\\"1710s\\\"\", \"title\": \"\\\"A treatise concerning the principles of human kno...\", \"author\": \"\\\"Berkeley, George\\\"\", \"notes\": \"\\\"\\\"\", \"comments\": \"\\\"\\\"\", \"period\": \"\\\"1710-1780\\\"\", \"id\": \"\\\"1\\\"\"}", "columns": ["text", "pos_tags", "genre", "subgenre", "year", "quarter_cent", "decade", "title", "author", "notes", "comments", "period", "id"], "columns_mapping": {"text": "text", "pos_tags": "pos_tags", "genre": "genre", "subgenre": "subgenre", "year": "year", "quarter_cent": "quarter_cent", "decade": "decade", "title": "title", "author": "author", "notes": "notes", "comments": "comments", "period": "period", "id": "id"}, "dataset_description": "The Corpus of Late Modern English Texts, version 3.1 (CLMET3.1) has been created by Hendrik De Smet, \nSusanne Flach, Hans-J\u00fcrgen Diller and Jukka Tyrkk\u00f6, as an offshoot of a bigger project developing a database of text \ndescriptors (Diller, De Smet & Tyrkk\u00f6 2011). CLMET3.1 is a principled collection of public domain texts drawn from \nvarious online archiving projects. 
This dataset can be used for part-of-speech tagging, NER and text classification\n", "dataset_name": "biglam/clmet_3_1"}}, "tags": ["task_categories:text-classification", "task_categories:fill-mask", "task_ids:multi-label-classification", "task_ids:masked-language-modeling", "annotations_creators:expert-generated", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "breakend/nllb-multi-domain": {"dataset_name": "breakend/nllb-multi-domain", "description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.", "downloads": 29, "configs": {"eng_Latn-ayr_Latn": {"config_name": "eng_Latn-ayr_Latn", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. and took a 3 mile jaunt all...\", \"sentence_ayr_Latn\": \"\\\"Phisqha alwa pachaw sartapxta ukatx utaj jak\\\\u201...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_ayr_Latn"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_ayr_Latn": "sentence_ayr_Latn"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}, "eng_Latn-bho_Deva": {"config_name": "eng_Latn-bho_Deva", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. 
and took a 3 mile jaunt all...\", \"sentence_bho_Deva\": \"\\\"\\\\u0939\\\\u092e \\\\u0938\\\\u0941\\\\u092c\\\\u0939 5 \\\\u092c\\\\u0...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_bho_Deva"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_bho_Deva": "sentence_bho_Deva"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}, "eng_Latn-dyu_Latn": {"config_name": "eng_Latn-dyu_Latn", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. and took a 3 mile jaunt all...\", \"sentence_dyu_Latn\": \"\\\"An wila la s\\\\u0254g\\\\u0254ma fitiri f\\\\u025b ka kil...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_dyu_Latn"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_dyu_Latn": "sentence_dyu_Latn"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}, "eng_Latn-fur_Latn": {"config_name": "eng_Latn-fur_Latn", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. 
and took a 3 mile jaunt all...\", \"sentence_fur_Latn\": \"\\\"Si sin dismots aes 5 di buinore, o vin fat une gj...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_fur_Latn"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_fur_Latn": "sentence_fur_Latn"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}, "eng_Latn-rus_Cyrl": {"config_name": "eng_Latn-rus_Cyrl", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. and took a 3 mile jaunt all...\", \"sentence_rus_Cyrl\": \"\\\"\\\\u041c\\\\u044b \\\\u0432\\\\u0441\\\\u0442\\\\u0430\\\\u043b\\\\u0438...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_rus_Cyrl"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_rus_Cyrl": "sentence_rus_Cyrl"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}, "eng_Latn-wol_Latn": {"config_name": "eng_Latn-wol_Latn", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. 
and took a 3 mile jaunt all...\", \"sentence_wol_Latn\": \"\\\"Jur\\\\u00f3omi waxtu ci suba la\\\\u00f1u jog ba noppi...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_wol_Latn"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_wol_Latn": "sentence_wol_Latn"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}}, "tags": ["annotations_creators:found", "multilinguality:multilingual", "multilinguality:translation", "source_datasets:extended|flores", "language:en", "language:ru", "language:ayr", "language:bho", "language:dyu", "language:fur", "language:wol"], "is_gated": false}, "muibk/wmt19_metrics_task": {"dataset_name": "muibk/wmt19_metrics_task", "description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.", "downloads": 26, "configs": {"de-cs": {"config_name": "de-cs", "sample_row": "{\"translation.de\": \"\\\"Walisische AMs (Mitglieder der Versammlung) sorge...\", \"translation.cs\": \"\\\"Welsh AMS (\\\\u010dlenov\\\\u00e9 shrom\\\\u00e1\\\\u017ed\\\\u...\", \"mt_system\": \"\\\"online-Y.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"null\", \"wmt-z\": \"null\", \"pair\": \"\\\"de-cs\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u010clenov\\\\u00e9 Vel\\\\u0161sk\\\\u00e9ho n\\\\u00e1rodn...\"}", "columns": ["translation_de", "translation_cs", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.de": "translation_de", "translation.cs": "translation_cs", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "de-en": {"config_name": "de-en", "sample_row": "{\"translation.de\": \"\\\"Sch\\\\u00f6ne M\\\\u00fcnchnerin 2018: Sch\\\\u00f6ne M\\\\u...\", \"translation.en\": \"\\\"Beautiful Munich 2018: Beautiful Munich 2018 in H...\", \"mt_system\": \"\\\"online-Y.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"100.0\", \"wmt-z\": \"0.577333331316636\", \"pair\": \"\\\"de-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"abendzeitung-muenchen.de.213584\\\"\", \"ref\": \"\\\"The Beauty of Munich 2018: the Beauty of Munich 2...\"}", "columns": ["translation_de", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.de": "translation_de", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"translation.de\": \"\\\"Europa-Parteitag der Linken : Kipping: Europa ist...\", \"translation.fr\": \"\\\"Europe-Congr\\\\u00e8s du parti de la Gauche : Kippi...\", \"mt_system\": \"\\\"online-G.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"22.0\", \"wmt-z\": \"-2.6893683822659\", \"pair\": \"\\\"de-fr\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"euelections\\\"\", \"ref\": \"\\\"Kipping au congr\\\\u00e8s de die Linke sur l'Europe...\"}", "columns": ["translation_de", "translation_fr", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.de": "translation_de", "translation.fr": "translation_fr", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-cs": {"config_name": "en-cs", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.cs\": \"\\\"Welsh Ams se boj\\\\u00ed o \\\\\\\"vypad\\\\u00e1 jako ply\\\\u...\", \"mt_system\": \"\\\"online-X.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"44.0\", \"wmt-z\": \"-2.03458212900238\", \"pair\": \"\\\"en-cs\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u010clenov\\\\u00e9 Vel\\\\u0161sk\\\\u00e9ho n\\\\u00e1rodn...\"}", "columns": ["translation_en", "translation_cs", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.cs": "translation_cs", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-de": {"config_name": "en-de", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.de\": \"\\\"Walisische AMs besorgt dar\\\\u00fcber, dass sie \\\\u2...\", \"mt_system\": \"\\\"Microsoft-WMT19-document-level.6808\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"99.0\", \"wmt-z\": \"0.570916127533967\", \"pair\": \"\\\"en-de\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"Walisische Ageordnete sorgen sich \\\\\\\"wie D\\\\u00f6de...\"}", "columns": ["translation_en", "translation_de", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.de": "translation_de", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-fi": {"config_name": "en-fi", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.fi\": \"\\\"Walesin kansalliskokouksen j\\\\u00e4senet pelk\\\\u00e...\", \"mt_system\": \"\\\"Human\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"null\", \"wmt-z\": \"null\", \"pair\": \"\\\"en-fi\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"Walesin kansalliskokouksen j\\\\u00e4senet pelk\\\\u00e...\"}", "columns": ["translation_en", "translation_fi", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.fi": "translation_fi", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-gu": {"config_name": "en-gu", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.gu\": \"\\\"\\\\u0ab5\\\\u0ac7\\\\u0ab2\\\\u0acd\\\\u0ab6 \\\\u0a86\\\\u0a82\\\\u0ab8...\", \"mt_system\": \"\\\"UdS-DFKI.6866\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"20.0\", \"wmt-z\": \"-1.13438006642\", \"pair\": \"\\\"en-gu\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u0ab5\\\\u0ac7\\\\u0ab2\\\\u0acd\\\\u0ab8\\\\u0aa8\\\\u0abe \\\\u0a8f...\"}", "columns": ["translation_en", "translation_gu", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.gu": "translation_gu", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-kk": {"config_name": "en-kk", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.kk\": \"\\\"\\\\u0412\\\\u0435\\\\u043b\\\\u044c\\\\u0448 \\\\u0410\\\\u041c\\\\u0441...\", \"mt_system\": \"\\\"NEU.6755\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"18.5\", \"wmt-z\": \"-1.10972245101563\", \"pair\": \"\\\"en-kk\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u0423\\\\u044d\\\\u043b\\\\u0441\\\\u0442\\\\u0456\\\\u04a3 \\\\u0430...\"}", "columns": ["translation_en", "translation_kk", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.kk": "translation_kk", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-lt": {"config_name": "en-lt", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.lt\": \"\\\"Welsh AMs susir\\\\u016bpin\\\\u0119 d\\\\u0117l \\\\u201ei\\\\u...\", \"mt_system\": \"\\\"TartuNLP-c.6510\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"78.0\", \"wmt-z\": \"-0.443156602337598\", \"pair\": \"\\\"en-lt\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"Velso Asambl\\\\u0117jos nariai bijo b\\\\u016bti i\\\\u01...\"}", "columns": ["translation_en", "translation_lt", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.lt": "translation_lt", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-ru": {"config_name": "en-ru", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.ru\": \"\\\"\\\\u0412\\\\u0430\\\\u043b\\\\u043b\\\\u0438\\\\u0439\\\\u0441\\\\u043a\\\\...\", \"mt_system\": \"\\\"TartuNLP-u.6645\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"null\", \"wmt-z\": \"null\", \"pair\": \"\\\"en-ru\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u0427\\\\u043b\\\\u0435\\\\u043d\\\\u044b \\\\u041d\\\\u0430\\\\u0446...\"}", "columns": ["translation_en", "translation_ru", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.ru": "translation_ru", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-zh": {"config_name": "en-zh", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.zh\": \"\\\"\\\\u5a01\\\\u5c14\\\\u58eb AM \\\\u62c5\\\\u5fc3\\\\u201c\\\\u770b\\\\u8...\", \"mt_system\": \"\\\"Baidu-system.6932\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"50.0\", \"wmt-z\": \"-2.56158435006466\", \"pair\": \"\\\"en-zh\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u5a01\\\\u5c14\\\\u58eb AM \\\\u62c5\\\\u5fc3\\\\u201d\\\\u50cf\\\\u8...\"}", "columns": ["translation_en", "translation_zh", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.zh": "translation_zh", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "fi-en": {"config_name": "fi-en", "sample_row": "{\"translation.fi\": \"\\\"Eemeli Kouki johti Hurrikaanin kotivoittoon avauk...\", \"translation.en\": \"\\\"Eemeli A. Hurrikaanin led to victory at home - Sa...\", \"mt_system\": \"\\\"parfda.6526\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"66.0\", \"wmt-z\": \"-0.720709405878421\", \"pair\": \"\\\"fi-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"ess.fi.43771\\\"\", \"ref\": \"\\\"Eemeli Kouki led Hurrikaanit to home victory in t...\"}", "columns": ["translation_fi", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.fi": "translation_fi", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "fr-de": {"config_name": "fr-de", "sample_row": "{\"translation.fr\": \"\\\"Kipping au congr\\\\u00e8s de die Linke sur l'Europe...\", \"translation.de\": \"\\\"Kipping beim Linken Congress on Europe: Europa is...\", \"mt_system\": \"\\\"online-Y.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"null\", \"wmt-z\": \"null\", \"pair\": \"\\\"fr-de\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"euelections\\\"\", \"ref\": \"\\\"Europa-Parteitag der Linken : Kipping: Europa ist...\"}", "columns": ["translation_fr", "translation_de", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.fr": "translation_fr", "translation.de": "translation_de", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "gu-en": {"config_name": "gu-en", "sample_row": "{\"translation.gu\": \"\\\"\\\\u0aaa\\\\u0aa4\\\\u0a82\\\\u0a9c\\\\u0ab2\\\\u0ac0 \\\\u0ab2\\\\u0acb...\", \"translation.en\": \"\\\"This took pata.mjalii nikldyo frog, was super baa...\", \"mt_system\": \"\\\"UdS-DFKI.6861\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"6.0\", \"wmt-z\": \"-1.32251345941323\", \"pair\": \"\\\"gu-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"gu.webdunia.com.113\\\"\", \"ref\": \"\\\"Frog inside Patanjali \\\\u2018Aata\\\\u2019 (flour) pa...\"}", "columns": ["translation_gu", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.gu": "translation_gu", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "kk-en": {"config_name": "kk-en", "sample_row": "{\"translation.kk\": \"\\\"\\\\u0492\\\\u0430\\\\u0436\\\\u0430\\\\u0439\\\\u044b\\\\u043f \\\\u049b...\", \"translation.en\": \"\\\"\\\\u049b\\\\u04b1\\\\u043b\\\\u0430\\\\u049b\\\\u049b\\\\u0430\\\\u043f ...\", \"mt_system\": \"\\\"DBMS-KU_KKEN.6726\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"70.0\", \"wmt-z\": \"-0.649670870333131\", \"pair\": \"\\\"kk-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"egemen.kz.9219\\\"\", \"ref\": \"\\\"Wonderful headphones.\\\"\"}", "columns": ["translation_kk", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.kk": "translation_kk", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "lt-en": {"config_name": "lt-en", "sample_row": "{\"translation.lt\": \"\\\"\\\\\\\"MG Baltic\\\\\\\" byla: naujasis Gustainio advokatas ...\", \"translation.en\": \"\\\"Case of \\\\\\\"MG Baltic\\\\\\\": New Gustainis lawyer says ...\", \"mt_system\": \"\\\"online-A.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"68.0\", \"wmt-z\": \"-0.0612950663855201\", \"pair\": \"\\\"lt-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"delfi.lt.492\\\"\", \"ref\": \"\\\"MG Baltic case: a new advocate of Mr. Gustainis s...\"}", "columns": ["translation_lt", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.lt": "translation_lt", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "ru-en": {"config_name": "ru-en", "sample_row": "{\"translation.ru\": \"\\\"\\\\u041d\\\\u0430\\\\u0437\\\\u0432\\\\u0430\\\\u043d\\\\u043e \\\\u0447...\", \"translation.en\": \"\\\"The number of recruits from Ukraine preparing to ...\", \"mt_system\": \"\\\"online-Y.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"97.0\", \"wmt-z\": \"0.33353407846393\", \"pair\": \"\\\"ru-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"izvestiya.300303\\\"\", \"ref\": \"\\\"The number of new Ukrainian recruits ready to go ...\"}", "columns": ["translation_ru", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.ru": "translation_ru", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "zh-en": {"config_name": "zh-en", "sample_row": "{\"translation.zh\": \"\\\"\\\\u5f20\\\\u5149\\\\u519b\\\\u88ab\\\\u4efb\\\\u547d\\\\u4e3a\\\\u5e7f\\\\...\", \"translation.en\": \"\\\"Zhang Guangjun was appointed vice governor of Gua...\", \"mt_system\": \"\\\"online-G.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"100.0\", \"wmt-z\": \"1.0643780388098\", \"pair\": \"\\\"zh-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"chinanews.com.1423\\\"\", \"ref\": \"\\\"Zhang Guangjun was appointed as the Vice Governor...\"}", "columns": ["translation_zh", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.zh": "translation_zh", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. 
We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation"], "is_gated": false}, "tarteel-ai/quranqa": {"dataset_name": "tarteel-ai/quranqa", "description": "The absence of publicly available reusable test collections for Arabic question answering on the Holy Qur\u2019an has impeded the possibility of fairly comparing the performance of systems in that domain. In this article, we introduce AyaTEC, a reusable test collection for verse-based question answering on the Holy Qur\u2019an, which serves as a common experimental testbed for this task. AyaTEC includes 207 questions (with their corresponding 1,762 answers) covering 11 topic categories of the Holy Qur\u2019an that target the information needs of both curious and skeptical users. To the best of our effort, the answers to the questions (each represented as a sequence of verses) in AyaTEC were exhaustive\u2014that is, all qur\u2019anic verses that directly answered the questions were exhaustively extracted and annotated. 
To facilitate the use of AyaTEC in evaluating the systems designed for that task, we propose several evaluation measures to support the different types of questions and the nature of verse-based answers while integrating the concept of partial matching of answers in the evaluation.", "downloads": 274, "configs": {"shared_task": {"config_name": "shared_task", "sample_row": "{\"pq_id\": \"\\\"2:8-16_364\\\"\", \"passage\": \"\\\"\\\\u0648\\\\u0645\\\\u0646 \\\\u0627\\\\u0644\\\\u0646\\\\u0627\\\\u0633...\", \"surah\": \"2\", \"verses\": \"\\\"8-16\\\"\", \"question\": \"\\\"\\\\u0644\\\\u0645\\\\u0627\\\\u0630\\\\u0627 \\\\u0633\\\\u064a\\\\u064f...\", \"answers.text\": \"[\\\"\\\\u0623\\\\u0648\\\\u0644\\\\u0626\\\\u0643 \\\\u0627\\\\u0644\\\\u063...\", \"answers.answer_start\": \"[504]\"}", "columns": ["pq_id", "passage", "surah", "verses", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"pq_id": "pq_id", "passage": "passage", "surah": "surah", "verses": "verses", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "The absence of publicly available reusable test collections for Arabic question answering on the Holy Qur\u2019an has impeded the possibility of fairly comparing the performance of systems in that domain. In this article, we introduce AyaTEC, a reusable test collection for verse-based question answering on the Holy Qur\u2019an, which serves as a common experimental testbed for this task. AyaTEC includes 207 questions (with their corresponding 1,762 answers) covering 11 topic categories of the Holy Qur\u2019an that target the information needs of both curious and skeptical users. To the best of our effort, the answers to the questions (each represented as a sequence of verses) in AyaTEC were exhaustive\u2014that is, all qur\u2019anic verses that directly answered the questions were exhaustively extracted and annotated. 
To facilitate the use of AyaTEC in evaluating the systems designed for that task, we propose several evaluation measures to support the different types of questions and the nature of verse-based answers while integrating the concept of partial matching of answers in the evaluation.\n", "dataset_name": "tarteel-ai/quranqa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ar", "quran", "qa"], "is_gated": false}, "biglam/contentious_contexts": {"dataset_name": "biglam/contentious_contexts", "description": "This dataset contains extracts from historical Dutch newspapers which have been containing keywords of potentially contentious words (according to present-day sensibilities). \nThe dataset contains multiple annotations per instance, given the option to quantify agreement scores for annotations. This dataset can be used to track how words and their meanings have changed over time", "downloads": 11, "configs": {"default": {"config_name": "default", "sample_row": "{\"extract_id\": \"\\\"H99\\\"\", \"text\": \"\\\" Hollandsche IJzeren Spoorweg-Maatschappij een vi...\", \"target\": \"\\\"\\\\ud835\\\\ude5c\\\\ud835\\\\ude5a\\\\ud835\\\\ude62\\\\ud835\\\\ude5a\\\\...\", \"annotator_responses_english\": \"[{\\\"id\\\": \\\"unknown_2a\\\", \\\"response\\\": \\\"Not contentious...\", \"annotator_responses_dutch\": \"[{\\\"id\\\": \\\"unknown_2a\\\", \\\"response\\\": \\\"Niet omstreden\\\"...\", \"annotator_suggestions\": \"[{\\\"id\\\": \\\"unknown_2a\\\", \\\"suggestion\\\": \\\"\\\"}, {\\\"id\\\": \\\"u...\"}", "columns": ["extract_id", "text", "target", "annotator_responses_english", "annotator_responses_dutch", "annotator_suggestions"], "columns_mapping": {"extract_id": "extract_id", "text": "text", "target": "target", "annotator_responses_english": "annotator_responses_english", "annotator_responses_dutch": 
"annotator_responses_dutch", "annotator_suggestions": "annotator_suggestions"}, "dataset_description": "This dataset contains extracts from historical Dutch newspapers which have been containing keywords of potentially contentious words (according to present-day sensibilities). \nThe dataset contains multiple annotations per instance, given the option to quantify agreement scores for annotations. This dataset can be used to track how words and their meanings have changed over time\n", "dataset_name": "biglam/contentious_contexts"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-scoring", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:nl", "newspapers", "historic", "dutch", "problematic", "ConConCor"], "is_gated": false}, "chintagunta85/bc2gm_test": {"dataset_name": "chintagunta85/bc2gm_test", "description": "Nineteen teams presented results for the Gene Mention Task at the BioCreative II Workshop.\nIn this task participants designed systems to identify substrings in sentences corresponding to gene name mentions.\nA variety of different methods were used and the results varied with a highest achieved F1 score of 0.8721.\nHere we present brief descriptions of all the methods used and a statistical analysis of the results.\nWe also demonstrate that, by combining the results from all submissions, an F score of 0.9066 is feasible,\nand furthermore that the best result makes use of the lowest scoring submissions.\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/\nThe original dataset can be downloaded from: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-ii-corpus/\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll", "downloads": 10, "configs": {"bc2gm_corpus": {"config_name": 
"bc2gm_corpus", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Comparison\\\", \\\"with\\\", \\\"alkaline\\\", \\\"phosphatases\\\",...\", \"ner_tags\": \"[0, 0, 1, 2, 0, 1, 2, 2]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "Nineteen teams presented results for the Gene Mention Task at the BioCreative II Workshop.\nIn this task participants designed systems to identify substrings in sentences corresponding to gene name mentions.\nA variety of different methods were used and the results varied with a highest achieved F1 score of 0.8721.\nHere we present brief descriptions of all the methods used and a statistical analysis of the results.\nWe also demonstrate that, by combining the results from all submissions, an F score of 0.9066 is feasible,\nand furthermore that the best result makes use of the lowest scoring submissions.\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/\nThe original dataset can be downloaded from: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-ii-corpus/\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll\n", "dataset_name": "chintagunta85/bc2gm_test"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "kiddothe2b/contract-nli": {"dataset_name": "kiddothe2b/contract-nli", "description": "ContractNLI: A Benchmark Dataset for ContractNLI in English", "downloads": 38, "configs": {"contractnli_a": {"config_name": "contractnli_a", "sample_row": "{\"premise\": \"\\\"2.3 Provided that the Recipient has a written agr...\", \"hypothesis\": \"\\\"Receiving Party shall not reverse engineer any ob...\", \"label\": \"2\"}", "columns": ["premise", 
"hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The ContractNLI dataset consists of Non-Disclosure Agreements (NDAs). All NDAs have been labeled based \non several hypothesis templates as entailment, neutral or contradiction. In this version of the task\n(Task A), the input consists of the relevant part of the document w.r.t. to the hypothesis.\n", "dataset_name": "kiddothe2b/contract-nli"}, "contractnli_b": {"config_name": "contractnli_b", "sample_row": "{\"premise\": \"\\\"NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\\\\nThi...\", \"hypothesis\": \"\\\"Receiving Party shall not reverse engineer any ob...\", \"label\": \"2\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The ContractNLI dataset consists of Non-Disclosure Agreements (NDAs). All NDAs have been labeled based \non several hypothesis templates as entailment, neutral or contradiction. In this version of the task\n(Task B), the input consists of the full document.\n", "dataset_name": "kiddothe2b/contract-nli"}}, "tags": [], "is_gated": false}, "bigscience/xP3all": {"dataset_name": "bigscience/xP3all", "description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.", "downloads": 215, "configs": {"ak": {"config_name": "ak", "sample_row": "{\"inputs\": \"\\\"Kpon\\\\u0254z\\\\u0254\\\\u0301wa\\\\u0301t\\\\u0254\\\\u0301 l\\\\u0...\", \"targets\": \"\\\"Apolisifo\\\\u2184 kae\\\\u025b s\\\\u025b \\\\u025bte s\\\\u025...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ar": {"config_name": "ar", "sample_row": "{\"inputs\": \"\\\"I wonder \\\\u0645\\\\u0627 \\\\u0647\\\\u064a \\\\u0639\\\\u0627\\\\u...\", \"targets\": \"\\\"\\\\u0623\\\\u0630\\\\u0631\\\\u0628\\\\u064a\\\\u062c\\\\u0627\\\\u0646 ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "as": {"config_name": "as", "sample_row": "{\"inputs\": \"\\\"W\\\\u00ebr gi jub gi ci suufu pong bi fukki meetar ...\", \"targets\": \"\\\"\\\\u09a6\\\\u09b2\\\\u0999\\\\u09f0 \\\\u09a4\\\\u09b2\\\\u09f0 \\\\u098...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "bm": {"config_name": "bm", "sample_row": "{\"inputs\": \"\\\"\\\\u0a2e\\\\u0a3e \\\\u0a39\\\\u0a3e\\\\u0a02\\\\u0a17-\\\\u0a15\\\\u0a3...\", \"targets\": \"\\\"A bangelen Hong Kong, Ma ye kalank\\\\u0190 New York...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "bn": {"config_name": "bn", "sample_row": "{\"inputs\": \"\\\"A text in Kinyarwanda: Niba udafite amayinite ya ...\", \"targets\": \"\\\"\\\\u0986\\\\u09aa\\\\u09a8\\\\u09be\\\\u09b0 \\\\u09ab\\\\u09cb\\\\u09a8...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ca": {"config_name": "ca", "sample_row": "{\"inputs\": \"\\\"Selile eza eloko ya moke koleka mpe oyo esalaka, ...\", \"targets\": \"\\\"Una c\\\\u00e8l\\\\u00b7lula \\\\u00e9s la unitat estructu...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "code": {"config_name": "code", "sample_row": "{\"inputs\": \"\\\"A few years ago, Hitagi encountered a giant crab,...\", \"targets\": \"\\\"\\\\n#include \\\\nusing namespace std;\\\\...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "en": {"config_name": "en", "sample_row": "{\"inputs\": \"\\\"Construct a circle with radius r{\\\\\\\\displaystyle r...\", \"targets\": \"\\\"Las coordenadas polares de cualquier punto P{\\\\\\\\di...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "es": {"config_name": "es", "sample_row": "{\"inputs\": \"\\\"Creemos firmemente que la transversalidad entre e...\", \"targets\": \"\\\"Creemos firmemente que las sinergias entre los ca...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "eu": {"config_name": "eu", "sample_row": "{\"inputs\": \"\\\"A text in Bambara: Kabini nin adamaden nana Galap...\", \"targets\": \"\\\"Gizakiak Galapagoetara iritsi zirenetik, ugaztun ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "fon": {"config_name": "fon", "sample_row": "{\"inputs\": \"\\\"Yidlanzana leziqhingi elineziqhingi ezingu-15 ezi...\", \"targets\": \"\\\"Kpl\\\\u00e9kpl\\\\u00e9 t\\\\u0254t\\\\u025bnt\\\\u00ednto 15 w...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "fr": {"config_name": "fr", "sample_row": "{\"inputs\": \"\\\"Text in Kinyarwanda: \\\\u2190 Covid-19: OMS yagiriy...\", \"targets\": \"\\\"Covid-19: l'OMS appelle \\\\u00e0 restreindre l'acc\\\\...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "gu": {"config_name": "gu", "sample_row": "{\"inputs\": \"\\\"W\\\\u2184sii great pyramid no de daa obuo k\\\\u03b5se...\", \"targets\": \"\\\"\\\\u0aae\\\\u0ab9\\\\u0abe\\\\u0aa8 \\\\u0aaa\\\\u0abf\\\\u0ab0\\\\u0abe...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "hi": {"config_name": "hi", "sample_row": "{\"inputs\": \"\\\"Article in Indonesian: Jika Anda tidak ingin perg...\", \"targets\": \"\\\"\\\\u0932\\\\u094b\\\\u0917\\\\u094b\\\\u0902 \\\\u0915\\\\u094b \\\\u091...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "id": {"config_name": "id", "sample_row": "{\"inputs\": \"\\\"Article in Spanish: Podr\\\\u00e1s usar una herramie...\", \"targets\": \"\\\"Buka terminal di dalam komputer. Pasang lshw (jik...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ig": {"config_name": "ig", "sample_row": "{\"inputs\": \"\\\"Text in Tswana: Tshenolo Ntheetsang\\\\nTranslation ...\", \"targets\": \"\\\"Ya mere, na-ege nt\\\\u1ecb ihe ga-ekpughe .\\\"\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ki": {"config_name": "ki", "sample_row": "{\"inputs\": \"\\\"Com isso, os jogadores poder\\\\u00e3o controlar a\\\\u...\", \"targets\": \"\\\"\\\\u0169nd\\\\u0169 \\\\u0169cio n\\\\u0129 \\\\u0169r\\\\u0129hot...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "kn": {"config_name": "kn", "sample_row": "{\"inputs\": \"\\\"A text in Tsonga: Xiphiqo xin\\\\u2019wana xa tilens...\", \"targets\": \"\\\"\\\\u0c9c\\\\u0cc2\\\\u0cae\\\\u0ccd \\\\u0cb2\\\\u0cc6\\\\u0ca8\\\\u0ccd...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "lg": {"config_name": "lg", "sample_row": "{\"inputs\": \"\\\"\\\\u0b89\\\\u0bb2\\\\u0b95 \\\\u0b93\\\\u0b9f\\\\u0bc1\\\\u0bae\\\\u0bcd...\", \"targets\": \"\\\"Abalambuzi babaddusi okwetoolola ensi yonna, aba ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ln": {"config_name": "ln", "sample_row": "{\"inputs\": \"\\\"A text in Telugu: \\\\\\\"\\\\\\\"\\\\\\\"\\\\u0c26\\\\u0c3e\\\\u0c28\\\\u0c3f ...\", \"targets\": \"\\\"Soki tolobeli ndenge eza mpasi na kokoma kuna, ba...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ml": {"config_name": "ml", "sample_row": "{\"inputs\": \"\\\"A text in Twi: Binary akontahy\\\\u025bde betumi afa...\", \"targets\": \"\\\"\\\\u0d2c\\\\u0d48\\\\u0d28\\\\u0d31\\\\u0d3f \\\\u0d28\\\\u0d2e\\\\u0d4d...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "mr": {"config_name": "mr", "sample_row": "{\"inputs\": \"\\\"\\\\u06a9\\\\u06cc\\\\u0644\\\\u0634\\\\u06cc\\\\u0645 \\\\u0627\\\\u0648...\", \"targets\": \"\\\"\\\\u0915\\\\u0945\\\\u0932\\\\u094d\\\\u0936\\\\u093f\\\\u092f\\\\u092e ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ne": {"config_name": "ne", "sample_row": "{\"inputs\": \"\\\"Tabax yu bari da\\\\u00f1oo rafet lool te soo koy xo...\", \"targets\": \"\\\"\\\\u0927\\\\u0947\\\\u0930\\\\u0948 \\\\u092d\\\\u0935\\\\u0928\\\\u0939...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "nso": {"config_name": "nso", "sample_row": "{\"inputs\": \"\\\"A text in Twi: 1920s mu no,na \\\\u0254manfo a w\\\\u02...\", \"targets\": \"\\\"Ka nako ya bo 1920, dikgopolo t\\\\u0161eo di hlolag...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ny": {"config_name": "ny", "sample_row": "{\"inputs\": \"\\\"Text in Xhosa: Ukutshata nako kuyingozi.\\\\nTransla...\", \"targets\": \"\\\"Kukwatira kumakhalanso koopsa.\\\"\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "or": {"config_name": "or", "sample_row": "{\"inputs\": \"\\\"Ini berarti Anda dapat mengunjungi kota bersejara...\", \"targets\": \"\\\"\\\\u0b0f\\\\u0b39\\\\u0b3e\\\\u0b30 \\\\u0b05\\\\u0b30\\\\u0b4d\\\\u0b25...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "pa": {"config_name": "pa", "sample_row": "{\"inputs\": \"\\\"Abaturage bose b\\\\u2019Umujyi wa Vatikani ni Abany...\", \"targets\": \"\\\"\\\\u0a35\\\\u0a48\\\\u0a1f\\\\u0a40\\\\u0a15\\\\u0a28 \\\\u0a38\\\\u0a3f...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "pt": {"config_name": "pt", "sample_row": "{\"inputs\": \"\\\"Tubuh membutuhkan air agar bisa berfungsi. Jika A...\", \"targets\": \"\\\"Beba muita \\\\u00e1gua Pratique mais atividade f\\\\u0...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "rn": {"config_name": "rn", "sample_row": "{\"inputs\": \"\\\"\\\\u09af\\\\u09a6\\\\u09bf \\\\u0986\\\\u09aa\\\\u09c1\\\\u09a8\\\\u09bf...\", \"targets\": \"\\\"Iyo mwiyumva mwobishobora cane, fata ako karyo ku...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "rw": {"config_name": "rw", "sample_row": "{\"inputs\": \"\\\"Text in Yoruba: L\\\\u1ecdgan ti Apple logo farahan,...\", \"targets\": \"\\\"Usubire kuri flash disck yawe urasanga data zagar...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "sn": {"config_name": "sn", "sample_row": "{\"inputs\": \"\\\"Text in Swahili (individual language): Ikiwa mwaj...\", \"targets\": \"\\\"kana waunoshandira ane kodzero pfuma zvedzidzo uk...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "st": {"config_name": "st", "sample_row": "{\"inputs\": \"\\\"\\\\u0c15\\\\u0c4a\\\\u0c30\\\\u0c3f\\\\u0c2f\\\\u0c30\\\\u0c4d \\\\u0c15...\", \"targets\": \"\\\"Dik\\\\u2019hampani tsa ho tsamaisa dintho di lefuwa...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "sw": {"config_name": "sw", "sample_row": "{\"inputs\": \"\\\"I wonder Mama wa rais wa Japani ni nani?\\\\n\\\\nCan y...\", \"targets\": \"\\\"Vita kati ya Japani na Urusi ya 1905 ilipiganiwa ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ta": {"config_name": "ta", "sample_row": "{\"inputs\": \"\\\"ITokyo kuzoba ilona kuphela idolobha lase-Asia el...\", \"targets\": \"\\\"1964\\\\u0bb2\\\\u0bcd \\\\u0bb5\\\\u0bbf\\\\u0bb3\\\\u0bc8\\\\u0baf\\\\u...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "te": {"config_name": "te", "sample_row": "{\"inputs\": \"\\\"Rumor has it that 1828-1835 \\\\u0c2e\\\\u0c27\\\\u0c4d\\\\u0...\", \"targets\": \"\\\"\\\\u0c32\\\\u0c3e\\\\u0c30\\\\u0c4d\\\\u0c21\\\\u0c41 \\\\u0c35\\\\u0c3f...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "tn": {"config_name": "tn", "sample_row": "{\"inputs\": \"\\\"L'any 1995 se'l va votar com a millor jugador de ...\", \"targets\": \"\\\"Ka ngwaga wa 1995 o ne a tlhophiwa jaaka motshame...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ts": {"config_name": "ts", "sample_row": "{\"inputs\": \"\\\"Text in Kinyarwanda: Mbese mu ihunga ryawe uhagaz...\", \"targets\": \"\\\"ri kwihi ribye ra n'wina ra vutumbelo ke?\\\"\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "tum": {"config_name": "tum", "sample_row": "{\"inputs\": \"\\\"W\\\\u011bma s\\\\u025b\\\\u0301d\\\\u00f3 tom\\\\u025b xw\\\\u00e9...\", \"targets\": \"\\\"Makampani gha kuyegha katundu ghakulipirika makol...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "tw": {"config_name": "tw", "sample_row": "{\"inputs\": \"\\\"La compra libre de impuestos aduaneros ofrece una...\", \"targets\": \"\\\"Duty free shopping y\\\\u025b hokwan a wode b\\\\u025bt...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ur": {"config_name": "ur", "sample_row": "{\"inputs\": \"\\\"Given the below title and summary of an article, ...\", \"targets\": \"\\\"\\\\u0631\\\\u0627\\\\u0648\\\\u0644\\\\u067e\\\\u0646\\\\u0688\\\\u06cc ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "vi": {"config_name": "vi", "sample_row": "{\"inputs\": \"\\\"O vermelho \\\\u00e9 uma cor prim\\\\u00e1ria, e por is...\", \"targets\": \"\\\"Hi\\\\u1ec3u r\\\\u1eb1ng b\\\\u1ea1n kh\\\\u00f4ng th\\\\u1ec3 ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "wo": {"config_name": "wo", "sample_row": "{\"inputs\": \"\\\"Nta gabishwa ry'igihuhusi ca tsunami rirasohorwa,...\", \"targets\": \"\\\"Amul benn \\\\u00e0ddu bu\\\\u00f1u def ci tsunami buy ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "xh": {"config_name": "xh", "sample_row": "{\"inputs\": \"\\\"Text in Zulu: Umhlanga Guest house opinie, Umhlan...\", \"targets\": \"\\\"Umhlanga Guest house, Umhlanga Rocks\\\"\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "yo": {"config_name": "yo", "sample_row": "{\"inputs\": \"\\\"A text in Malayalam: \\\\u0d35\\\\u0d3f\\\\u0d32\\\\u0d2f\\\\u0d...\", \"targets\": \"\\\"W\\\\u1ecd\\\\u0301n ti d\\\\u00e1 \\\\u1ecd\\\\u0300p\\\\u1ecd\\\\u03...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "zh": {"config_name": "zh", "sample_row": "{\"inputs\": \"\\\"\\\\u7537\\\\uff1a\\\\u73b0\\\\u5728\\\\u4e3a\\\\u5927\\\\u5bb6\\\\u4ecb\\\\...\", \"targets\": \"\\\"\\\\u80fd\\\\u6ee1\\\\u8db3\\\\u4e0d\\\\u540c\\\\u8bfb\\\\u8005\\\\u7684\\\\...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. 
It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "zu": {"config_name": "zu", "sample_row": "{\"inputs\": \"\\\"Text in Ganda: mulinunulibwa awatali ffeeza.\\\\nTra...\", \"targets\": \"\\\"uyonikeza mahhala.\\\"\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}}, "tags": ["task_categories:other", "annotations_creators:expert-generated", "annotations_creators:crowdsourced", "multilinguality:multilingual", "language:ak", "language:ar", "language:as", "language:bm", "language:bn", "language:ca", "language:code", "language:en", "language:es", "language:eu", "language:fon", "language:fr", "language:gu", "language:hi", "language:id", "language:ig", "language:ki", "language:kn", "language:lg", "language:ln", "language:ml", "language:mr", "language:ne", "language:nso", "language:ny", "language:or", "language:pa", "language:pt", "language:rn", "language:rw", "language:sn", "language:st", "language:sw", "language:ta", "language:te", "language:tn", "language:ts", "language:tum", "language:tw", "language:ur", "language:vi", "language:wo", "language:xh", "language:yo", "language:zh", "language:zu"], "is_gated": false}, "allenai/multi_lexsum": {"dataset_name": "allenai/multi_lexsum", "description": "Multi-LexSum is a multi-doc summarization dataset for civil rights litigation lawsuits with summaries of three granularities.", "downloads": 260, "configs": {"v20220616": {"config_name": "v20220616", "sample_row": "{\"id\": \"\\\"EE-AL-0045\\\"\", \"sources\": 
\"[\\\"Case 1:05-cv-00530-D Document 1-1 Filed 09/19/20...\", \"summary/long\": \"\\\"On September 15, 2005, the Equal Employment Oppor...\", \"summary/short\": \"\\\"Equal Employment Opportunity Commission brought a...\", \"summary/tiny\": \"null\"}", "columns": ["id", "sources", "summary/long", "summary/short", "summary/tiny"], "columns_mapping": {"id": "id", "sources": "sources", "summary/long": "summary/long", "summary/short": "summary/short", "summary/tiny": "summary/tiny"}, "dataset_description": "\nMulti-LexSum is a multi-doc summarization dataset for civil rights litigation lawsuits with summaries of three granularities. \n", "dataset_name": "allenai/multi_lexsum"}, "v20230518": {"config_name": "v20230518", "sample_row": "{\"id\": \"\\\"EE-AL-0045\\\"\", \"sources\": \"[\\\"Case 1:05-cv-00530-D Document 1-1 Filed 09/19/20...\", \"sources_metadata.doc_id\": \"[\\\"EE-AL-0045-0001\\\", \\\"EE-AL-0045-0002\\\", \\\"EE-AL-0045...\", \"sources_metadata.doc_type\": \"[\\\"Complaint\\\", \\\"Complaint\\\", \\\"Settlement Agreement\\\",...\", \"sources_metadata.doc_title\": \"[\\\"Complaint\\\", \\\"Complaint in Intervention\\\", \\\"Consen...\", \"sources_metadata.parser\": \"[\\\"pyxpdf\\\", \\\"pyxpdf\\\", \\\"pyxpdf\\\", \\\"pyxpdf\\\"]\", \"sources_metadata.is_ocr\": \"[true, true, true, false]\", \"sources_metadata.url\": \"[\\\"https://clearinghouse.net/doc/22034\\\", \\\"https://c...\", \"summary/long\": \"\\\"On September 15, 2005, the Equal Employment Oppor...\", \"summary/short\": \"\\\"Equal Employment Opportunity Commission brought a...\", \"summary/tiny\": \"null\", \"case_metadata.case_name\": \"\\\"EEOC v. 
House of Philadelphia Center, Inc.\\\"\", \"case_metadata.case_type\": \"\\\"Equal Employment\\\"\", \"case_metadata.filing_date\": \"\\\"2005-09-15\\\"\", \"case_metadata.filing_year\": \"\\\"2005\\\"\", \"case_metadata.case_ongoing\": \"\\\"No\\\"\", \"case_metadata.case_ongoing_record_time\": \"\\\"2022-05-19\\\"\", \"case_metadata.closing_year\": \"\\\"2010\\\"\", \"case_metadata.order_start_year\": \"\\\"2007\\\"\", \"case_metadata.order_end_year\": \"\\\"2010\\\"\", \"case_metadata.defendant_payment\": \"\\\"$8,000\\\"\", \"case_metadata.class_action_sought\": \"\\\"No\\\"\", \"case_metadata.class_action_granted\": \"\\\"Not sought\\\"\", \"case_metadata.attorney_orgs\": \"[\\\"EEOC\\\"]\", \"case_metadata.prevailing_party\": \"\\\"Plaintiff\\\"\", \"case_metadata.plaintiff_types\": \"[\\\"Private Plaintiff\\\", \\\"EEOC Plaintiff\\\"]\", \"case_metadata.plaintiff_description\": \"\\\"Equal Employment Opportunity Commission filing on...\", \"case_metadata.constitutional_clauses\": \"[]\", \"case_metadata.causes_of_action\": \"[\\\"Title VII (including PDA), 42 U.S.C. 
\\\\u00a7 200...\", \"case_metadata.summary_authors\": \"[\\\"22120\\\"]\", \"case_metadata.case_url\": \"\\\"https://clearinghouse.net/case/6817\\\"\"}", "columns": ["id", "sources", "sources_metadata_doc_id", "sources_metadata_doc_type", "sources_metadata_doc_title", "sources_metadata_parser", "sources_metadata_is_ocr", "sources_metadata_url", "summary/long", "summary/short", "summary/tiny", "case_metadata_case_name", "case_metadata_case_type", "case_metadata_filing_date", "case_metadata_filing_year", "case_metadata_case_ongoing", "case_metadata_case_ongoing_record_time", "case_metadata_closing_year", "case_metadata_order_start_year", "case_metadata_order_end_year", "case_metadata_defendant_payment", "case_metadata_class_action_sought", "case_metadata_class_action_granted", "case_metadata_attorney_orgs", "case_metadata_prevailing_party", "case_metadata_plaintiff_types", "case_metadata_plaintiff_description", "case_metadata_constitutional_clauses", "case_metadata_causes_of_action", "case_metadata_summary_authors", "case_metadata_case_url"], "columns_mapping": {"id": "id", "sources": "sources", "sources_metadata.doc_id": "sources_metadata_doc_id", "sources_metadata.doc_type": "sources_metadata_doc_type", "sources_metadata.doc_title": "sources_metadata_doc_title", "sources_metadata.parser": "sources_metadata_parser", "sources_metadata.is_ocr": "sources_metadata_is_ocr", "sources_metadata.url": "sources_metadata_url", "summary/long": "summary/long", "summary/short": "summary/short", "summary/tiny": "summary/tiny", "case_metadata.case_name": "case_metadata_case_name", "case_metadata.case_type": "case_metadata_case_type", "case_metadata.filing_date": "case_metadata_filing_date", "case_metadata.filing_year": "case_metadata_filing_year", "case_metadata.case_ongoing": "case_metadata_case_ongoing", "case_metadata.case_ongoing_record_time": "case_metadata_case_ongoing_record_time", "case_metadata.closing_year": "case_metadata_closing_year", "case_metadata.order_start_year": 
"case_metadata_order_start_year", "case_metadata.order_end_year": "case_metadata_order_end_year", "case_metadata.defendant_payment": "case_metadata_defendant_payment", "case_metadata.class_action_sought": "case_metadata_class_action_sought", "case_metadata.class_action_granted": "case_metadata_class_action_granted", "case_metadata.attorney_orgs": "case_metadata_attorney_orgs", "case_metadata.prevailing_party": "case_metadata_prevailing_party", "case_metadata.plaintiff_types": "case_metadata_plaintiff_types", "case_metadata.plaintiff_description": "case_metadata_plaintiff_description", "case_metadata.constitutional_clauses": "case_metadata_constitutional_clauses", "case_metadata.causes_of_action": "case_metadata_causes_of_action", "case_metadata.summary_authors": "case_metadata_summary_authors", "case_metadata.case_url": "case_metadata_case_url"}, "dataset_description": "\nMulti-LexSum is a multi-doc summarization dataset for civil rights litigation lawsuits with summaries of three granularities. \n", "dataset_name": "allenai/multi_lexsum"}}, "tags": ["task_categories:summarization", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "tau/sled": {"dataset_name": "tau/sled", "description": "Efficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.", "downloads": 21207, "configs": {"summ_screen_fd": {"config_name": "summ_screen_fd", "sample_row": "{\"id\": \"\\\"fd_Charmed_05x13\\\"\", \"pid\": \"\\\"fd_Charmed_05x13_0\\\"\", \"input\": \"\\\"[Scene: Manor. Paige's room. 
Paige is there lying...\", \"output\": \"\\\"When residue left from demonic vanquishes builds ...\"}", "columns": ["id", "pid", "input", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\nSummScreenFD (Chen et al., 2021) is a summarization dataset in the domain of TV shows (e.g. Friends, Game of Thrones).\nGiven a transcript of a specific episode, the goal is to produce the episode's recap.\nThe original dataset is divided into two complementary subsets, based on the source of its community contributed transcripts. \nFor SCROLLS, we use the ForeverDreaming (FD) subset, as it incorporates 88 different shows, \nmaking it a more diverse alternative to the TV MegaSite (TMS) subset, which has only 10 shows. 
\nCommunity-authored recaps for the ForeverDreaming transcripts were collected from English Wikipedia and TVMaze.", "dataset_name": "tau/sled"}, "qasper": {"config_name": "qasper", "sample_row": "{\"id\": \"\\\"753990d0b621d390ed58f20c4d9e4f065f0dc672\\\"\", \"pid\": \"\\\"753990d0b621d390ed58f20c4d9e4f065f0dc672_0\\\"\", \"input\": \"\\\"Introduction\\\\nAffective events BIBREF0 are events...\", \"input_prefix\": \"\\\"What is the seed lexicon?\\\"\", \"output\": \"\\\"a vocabulary of positive and negative predicates ...\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\nQasper (Dasigi et al., 2021) is a question answering dataset over NLP papers filtered from the Semantic Scholar Open Research Corpus (S2ORC).\nQuestions were written by NLP practitioners after reading only the title and abstract of the papers, \nwhile another set of NLP practitioners annotated the answers given the entire document.\nQasper contains abstractive, extractive, and yes/no questions, as well as unanswerable ones.", "dataset_name": "tau/sled"}, "qmsum": {"config_name": "qmsum", "sample_row": "{\"id\": \"\\\"tr-sq-1\\\"\", \"pid\": \"\\\"tr-sq-1_0\\\"\", \"input\": \"\\\"Project Manager: Yep . Soon as I get this . 
Okay ...\", \"input_prefix\": \"\\\"How Did Project Manager and User Interface introd...\", \"output\": \"\\\"Project Manager introduced that the prototype inc...\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\nQMSum (Zhong et al., 2021) is a query-based summarization dataset, consisting of 232 meetings transcripts from multiple domains. \nThe corpus covers academic group meetings at the International Computer Science Institute and their summaries, industrial product meetings for designing a remote control, \nand committee meetings of the Welsh and Canadian Parliaments, dealing with a variety of public policy issues.\nAnnotators were tasked with writing queries about the broad contents of the meetings, as well as specific questions about certain topics or decisions, \nwhile ensuring that the relevant text for answering each query spans at least 200 words or 10 turns.", "dataset_name": "tau/sled"}, "narrative_qa": {"config_name": "narrative_qa", "sample_row": "{\"id\": \"\\\"39ab35eb8bdecda3cfd79433774fc63c7c699171_0\\\"\", \"pid\": \"\\\"39ab35eb8bdecda3cfd79433774fc63c7c699171_0_0\\\"\", \"input\": \"\\\"Produced by Charles Keller and David Widger\\\\n\\\\n\\\\n...\", \"input_prefix\": \"\\\"What does Sir Nigel come to New York looking for?...\", \"output\": \"\\\"an heiress\\\"\"}", "columns": ["id", "pid", "input", 
"input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\nNarrativeQA (Ko\u010disk\u00fd et al., 2021) is an established question answering dataset over entire books from Project Gutenberg and movie scripts from different websites.\nAnnotators were given summaries of the books and scripts obtained from Wikipedia, and asked to generate question-answer pairs, \nresulting in about 30 questions and answers for each of the 1,567 books and scripts.\nThey were encouraged to use their own words rather then copying, and avoid asking yes/no questions or ones about the cast.\nEach question was then answered by an additional annotator, providing each question with two reference answers (unless both answers are identical)..", "dataset_name": "tau/sled"}, "gov_report": {"config_name": "gov_report", "sample_row": "{\"id\": \"\\\"crs_RL33819\\\"\", \"pid\": \"\\\"crs_RL33819_0\\\"\", \"input\": \"\\\"\\\\tMajor Developments in 2008\\\\n\\\\nOn December 17, 2...\", \"output\": \"\\\"Since the early 1960s, U.S. 
policy toward Cuba ha...\"}", "columns": ["id", "pid", "input", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\n@inproceedings{huang-etal-2021-efficient,\n title = \"Efficient Attentions for Long Document Summarization\",\n author = \"Huang, Luyang and\n Cao, Shuyang and\n Parulian, Nikolaus and\n Ji, Heng and\n Wang, Lu\",\n booktitle = \"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies\",\n month = jun,\n year = \"2021\",\n address = \"Online\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://aclanthology.org/2021.naacl-main.112\",\n doi = \"10.18653/v1/2021.naacl-main.112\",\n pages = \"1419--1436\",\n abstract = \"The quadratic computational and memory complexities of large Transformers have limited their scalability for long document summarization. In this paper, we propose Hepos, a novel efficient encoder-decoder attention with head-wise positional strides to effectively pinpoint salient information from the source. We further conduct a systematic study of existing efficient self-attentions. Combined with Hepos, we are able to process ten times more tokens than existing models that use full attentions. For evaluation, we present a new dataset, GovReport, with significantly longer documents and summaries. 
Results show that our models produce significantly higher ROUGE scores than competitive comparisons, including new state-of-the-art results on PubMed. Human evaluation also shows that our models generate more informative summaries with fewer unfaithful errors.\",\n}", "dataset_name": "tau/sled"}, "contract_nli": {"config_name": "contract_nli", "sample_row": "{\"id\": \"\\\"34_nda-11\\\"\", \"pid\": \"\\\"34_nda-11_0\\\"\", \"input\": \"\\\"NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\\\\nThi...\", \"input_prefix\": \"\\\"Receiving Party shall not reverse engineer any ob...\", \"output\": \"\\\"Not mentioned\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\nContract NLI (Koreeda and Manning, 2021) is a natural language inference dataset in the legal domain.\nGiven a non-disclosure agreement (the premise), the task is to predict whether a particular legal statement (the hypothesis) is entailed, not entailed (neutral), or cannot be entailed (contradiction) from the contract.\nThe NDAs were manually picked after simple filtering from the Electronic Data Gathering, Analysis, and Retrieval system (EDGAR) and Google.\nThe dataset contains a total of 607 contracts and 17 unique hypotheses, which were combined to produce the dataset's 10,319 examples.", "dataset_name": "tau/sled"}, "quality": {"config_name": "quality", "sample_row": 
"{\"id\": \"\\\"52995_I3M5VUMM_1\\\"\", \"pid\": \"\\\"52995_I3M5VUMM_1_0\\\"\", \"input\": \"\\\"SPACEMAN ON A SPREE\\\\n\\\\n\\\\n\\\\n\\\\n BY MACK REYNOLDS\\\\...\", \"input_prefix\": \"\\\"Why is Si retirement so significant to the Space ...\", \"output\": \"\\\"Training new spacemen is costly and time consumin...\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n@article{pang2021quality,\n title={{QuALITY}: Question Answering with Long Input Texts, Yes!},\n author={Pang, Richard Yuanzhe and Parrish, Alicia and Joshi, Nitish and Nangia, Nikita and Phang, Jason and Chen, Angelica and Padmakumar, Vishakh and Ma, Johnny and Thompson, Jana and He, He and Bowman, Samuel R.},\n journal={arXiv preprint arXiv:2112.08608},\n year={2021}\n}\n", "dataset_name": "tau/sled"}, "squad": {"config_name": "squad", "sample_row": "{\"id\": \"\\\"5733be284776f41900661182\\\"\", \"pid\": \"\\\"5733be284776f41900661182_0\\\"\", \"input\": \"\\\"Architecturally, the school has a Catholic charac...\", \"input_prefix\": \"\\\"To whom did the Virgin Mary allegedly appear in 1...\", \"output\": \"\\\"Saint Bernadette Soubirous\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text 
Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\nStanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n", "dataset_name": "tau/sled"}, "squad_shuffled_distractors": {"config_name": "squad_shuffled_distractors", "sample_row": "{\"id\": \"\\\"5733be284776f41900661182\\\"\", \"pid\": \"\\\"5733be284776f41900661182_0\\\"\", \"input\": \"\\\"John initially adopted a defensive posture simila...\", \"input_prefix\": \"\\\"To whom did the Virgin Mary allegedly appear in 1...\", \"output\": \"\\\"Saint Bernadette Soubirous\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\nStanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by 
crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n", "dataset_name": "tau/sled"}, "squad_ordered_distractors": {"config_name": "squad_ordered_distractors", "sample_row": "{\"id\": \"\\\"5733be284776f41900661182\\\"\", \"pid\": \"\\\"5733be284776f41900661182_0\\\"\", \"input\": \"\\\"Architecturally, the school has a Catholic charac...\", \"input_prefix\": \"\\\"To whom did the Virgin Mary allegedly appear in 1...\", \"output\": \"\\\"Saint Bernadette Soubirous\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\nStanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n", "dataset_name": "tau/sled"}, "hotpotqa": {"config_name": "hotpotqa", "sample_row": "{\"id\": \"\\\"5a7a06935542990198eaf050\\\"\", \"pid\": \"\\\"5a7a06935542990198eaf050_0\\\"\", \"input\": \"\\\"Arthur's Magazine (1844\\\\u20131846) was an America...\", \"input_prefix\": \"\\\"Which magazine was started first Arthur's Magazin...\", \"output\": \"\\\"Arthur's Magazine\\\"\"}", 
"columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\nHotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features:\n(1) the questions require finding and reasoning over multiple supporting documents to answer;\n(2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas;\n(3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervisionand explain the predictions;\n(4) we offer a new type of factoid comparison questions to testQA systems\u2019 ability to extract relevant facts and perform necessary comparison.\n", "dataset_name": "tau/sled"}, "hotpotqa_second_only": {"config_name": "hotpotqa_second_only", "sample_row": "{\"id\": \"\\\"5a7a06935542990198eaf050\\\"\", \"pid\": \"\\\"5a7a06935542990198eaf050_0\\\"\", \"input\": \"\\\"First for Women is a woman's magazine published b...\", \"input_prefix\": \"\\\"Which magazine was started first Arthur's Magazin...\", \"output\": \"\\\"Arthur's Magazine\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses 
any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\nHotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features:\n(1) the questions require finding and reasoning over multiple supporting documents to answer;\n(2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas;\n(3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervisionand explain the predictions;\n(4) we offer a new type of factoid comparison questions to testQA systems\u2019 ability to extract relevant facts and perform necessary comparison.\n", "dataset_name": "tau/sled"}}, "tags": ["task_categories:question-answering", "task_categories:summarization", "task_categories:text-generation", "task_ids:multiple-choice-qa", "task_ids:natural-language-inference", "language:en", "multi-hop-question-answering", "query-based-summarization", "long-texts"], "is_gated": false}, "NbAiLab/norwegian-paws-x": {"dataset_name": "NbAiLab/norwegian-paws-x", "description": "Norwegian PAWS-X, Bokmaal and Nynorsk machine-translated versions of PAWS-X.\n\nPAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.", "downloads": 18, "configs": {"nb": {"config_name": "nb", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"I Paris i oktober 1560 m\\\\u00f8tte han hemmelig de...\", \"sentence2\": \"\\\"I oktober 1560 m\\\\u00f8tte han hemmelig den engels...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "Norwegian PAWS-X, Bokmaal and Nynorsk machine-translated versions of PAWS-X.\n\nPAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "NbAiLab/norwegian-paws-x"}, "nn": {"config_name": "nn", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"I Paris i oktober 1560 m\\\\u00f8tte han i l\\\\u00f8yn...\", \"sentence2\": \"\\\"I oktober 1560 m\\\\u00f8tte han i l\\\\u00f8ynd den en...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "Norwegian PAWS-X, Bokmaal and Nynorsk machine-translated versions of PAWS-X.\n\nPAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "NbAiLab/norwegian-paws-x"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "task_ids:semantic-similarity-scoring", "task_ids:text-scoring", "task_ids:multi-input-text-classification", "annotations_creators:expert-generated", "annotations_creators:machine-generated", "multilinguality:multilingual", "source_datasets:extended|other-paws", "language:nb", "language:nn"], "is_gated": false}, "jakartaresearch/google-play-review": {"dataset_name": "jakartaresearch/google-play-review", "description": "This dataset is built as a playground for beginner to make a use case for creating sentiment analysis model.", "downloads": 142, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\" Halo\\\\n blibli. 
Sedikit saran untuk gratis ongkir...\", \"label\": \"\\\"pos\\\"\", \"stars\": \"4\"}", "columns": ["text", "label", "stars"], "columns_mapping": {"text": "text", "label": "label", "stars": "stars"}, "dataset_description": "This dataset is built as a playground for beginner to make a use case for creating sentiment analysis model.\n", "dataset_name": "jakartaresearch/google-play-review"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:id", "sentiment", "google-play", "indonesian"], "is_gated": false}, "jakartaresearch/news-title-gen": {"dataset_name": "jakartaresearch/news-title-gen", "description": "This dataset is built for generating text for news title.", "downloads": 23, "configs": {"default": {"config_name": "default", "sample_row": "{\"title\": \"\\\"Muncul Temuan Baru, Virus Corona Berasal dari Lab...\", \"link\": \"\\\"https://www.tribunnews.com/topic/virus-corona\\\"\", \"date\": \"\\\"2020-02-21\\\"\"}", "columns": ["title", "link", "date"], "columns_mapping": {"title": "title", "link": "link", "date": "date"}, "dataset_description": "This dataset is built for generating text for news title.\n", "dataset_name": "jakartaresearch/news-title-gen"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:id", "newspapers", "title", "news"], "is_gated": false}, "m3/multi_domain_document_classification": {"dataset_name": "m3/multi_domain_document_classification", "description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "downloads": 23, "configs": {"chemprot": {"config_name": "chemprot", "sample_row": "{\"text\": \"\\\"<< Epidermal growth factor receptor >> inhibitors...\", \"label\": \"8\"}", "columns": ["text", 
"label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "citation_intent": {"config_name": "citation_intent", "sample_row": "{\"text\": \"\\\"Thus , over the past few years , along with advan...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "hyperpartisan_news": {"config_name": "hyperpartisan_news", "sample_row": "{\"text\": \"\\\"As seen on The Five Police Group Boycotts Ben &am...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "rct_sample": {"config_name": "rct_sample", "sample_row": "{\"text\": \"\\\"Use of the mobile application was greater than in...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "sciie": {"config_name": "sciie", "sample_row": "{\"text\": \"\\\"The agreement in question involves number in [[ n...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in 
[https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "amcd": {"config_name": "amcd", "sample_row": "{\"text\": \"\\\"It has a modern look, and doesn't take up that mu...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "yelp_review": {"config_name": "yelp_review", "sample_row": "{\"text\": \"\\\"Everything was perfect--the service, the timeline...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "tweet_eval_irony": {"config_name": "tweet_eval_irony", "sample_row": "{\"text\": \"\\\"seeing ppl walking w/ crutches makes me really ex...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "tweet_eval_hate": {"config_name": "tweet_eval_hate", "sample_row": "{\"text\": \"\\\"@user nice new signage. 
Are you not concerned by ...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "tweet_eval_emotion": {"config_name": "tweet_eval_emotion", "sample_row": "{\"text\": \"\\\"\\\\u201cWorry is a down payment on a problem you ma...\", \"label\": \"2\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}}, "tags": [], "is_gated": false}, "jakartaresearch/semeval-absa": {"dataset_name": "jakartaresearch/semeval-absa", "description": "This dataset is built as a playground for aspect-based sentiment analysis.", "downloads": 90, "configs": {"laptop": {"config_name": "laptop", "sample_row": "{\"id\": \"\\\"2339\\\"\", \"text\": \"\\\"I charge it at night and skip taking the cord wit...\", \"aspects.term\": \"[\\\"cord\\\", \\\"battery life\\\"]\", \"aspects.polarity\": \"[\\\"neutral\\\", \\\"positive\\\"]\", \"aspects.from\": \"[41, 74]\", \"aspects.to\": \"[45, 86]\"}", "columns": ["id", "text", "aspects_term", "aspects_polarity", "aspects_from", "aspects_to"], "columns_mapping": {"id": "id", "text": "text", "aspects.term": "aspects_term", "aspects.polarity": "aspects_polarity", "aspects.from": "aspects_from", "aspects.to": "aspects_to"}, "dataset_description": "This dataset is built as a playground for aspect-based sentiment analysis.\n", "dataset_name": "jakartaresearch/semeval-absa"}, "restaurant": {"config_name": "restaurant", "sample_row": "{\"id\": \"\\\"3121\\\"\", \"text\": \"\\\"But the staff was so horrible to us.\\\"\", \"aspects.term\": 
\"[\\\"staff\\\"]\", \"aspects.polarity\": \"[\\\"negative\\\"]\", \"aspects.from\": \"[8]\", \"aspects.to\": \"[13]\", \"category.category\": \"[\\\"service\\\"]\", \"category.polarity\": \"[\\\"negative\\\"]\"}", "columns": ["id", "text", "aspects_term", "aspects_polarity", "aspects_from", "aspects_to", "category_category", "category_polarity"], "columns_mapping": {"id": "id", "text": "text", "aspects.term": "aspects_term", "aspects.polarity": "aspects_polarity", "aspects.from": "aspects_from", "aspects.to": "aspects_to", "category.category": "category_category", "category.polarity": "category_polarity"}, "dataset_description": "This dataset is built as a playground for aspect-based sentiment analysis.\n", "dataset_name": "jakartaresearch/semeval-absa"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "aspect-based-sentiment-analysis", "semeval", "semeval2015"], "is_gated": false}, "jonathanli/echr": {"dataset_name": "jonathanli/echr", "description": "The ECHR Cases dataset is designed for experimentation of neural judgment prediction, as in the original 2019 ACL paper \"Neural Legal Judgment Prediction in English\".", "downloads": 23, "configs": {"non-anon": {"config_name": "non-anon", "sample_row": "{\"itemid\": \"\\\"001-60714\\\"\", \"languageisocode\": \"\\\"ENG\\\"\", \"respondent\": \"\\\"FIN\\\"\", \"branch\": \"\\\"CHAMBER\\\"\", \"date\": \"2002\", \"docname\": \"\\\"CASE OF PIETILAINEN v. FINLAND\\\"\", \"importance\": \"4\", \"conclusion\": \"\\\"Violation of Art. 
6-1;Non-pecuniary damage - fina...\", \"judges\": \"\\\"Nicolas Bratza\\\"\", \"text\": \"[\\\"The applicant was born in 1943 and lives in Lauk...\", \"violated_articles\": \"[\\\"6\\\"]\", \"violated_paragraphs\": \"[\\\"6-1\\\"]\", \"violated_bulletpoints\": \"[]\", \"non_violated_articles\": \"[]\", \"non_violated_paragraphs\": \"[]\", \"non_violated_bulletpoints\": \"[]\", \"violated\": \"true\"}", "columns": ["itemid", "languageisocode", "respondent", "branch", "date", "docname", "importance", "conclusion", "judges", "text", "violated_articles", "violated_paragraphs", "violated_bulletpoints", "non_violated_articles", "non_violated_paragraphs", "non_violated_bulletpoints", "violated"], "columns_mapping": {"itemid": "itemid", "languageisocode": "languageisocode", "respondent": "respondent", "branch": "branch", "date": "date", "docname": "docname", "importance": "importance", "conclusion": "conclusion", "judges": "judges", "text": "text", "violated_articles": "violated_articles", "violated_paragraphs": "violated_paragraphs", "violated_bulletpoints": "violated_bulletpoints", "non_violated_articles": "non_violated_articles", "non_violated_paragraphs": "non_violated_paragraphs", "non_violated_bulletpoints": "non_violated_bulletpoints", "violated": "violated"}, "dataset_description": "The ECHR Cases dataset is designed for experimentation of neural judgment prediction, as in the original 2019 ACL paper \"Neural Legal Judgment Prediction in English\".\n", "dataset_name": "jonathanli/echr"}, "anon": {"config_name": "anon", "sample_row": "{\"itemid\": \"\\\"001-60714\\\"\", \"languageisocode\": \"\\\"ENG\\\"\", \"respondent\": \"\\\"FIN\\\"\", \"branch\": \"\\\"CHAMBER\\\"\", \"date\": \"2002\", \"docname\": \"\\\"CASE OF PIETILAINEN v. FINLAND\\\"\", \"importance\": \"4\", \"conclusion\": \"\\\"Violation of Art. 
6-1;Non-pecuniary damage - fina...\", \"judges\": \"\\\"Nicolas Bratza\\\"\", \"text\": \"[\\\"The applicant was born in DATE and lives in GPE ...\", \"violated_articles\": \"[\\\"6\\\"]\", \"violated_paragraphs\": \"[\\\"6-1\\\"]\", \"violated_bulletpoints\": \"[]\", \"non_violated_articles\": \"[]\", \"non_violated_paragraphs\": \"[]\", \"non_violated_bulletpoints\": \"[]\", \"violated\": \"true\"}", "columns": ["itemid", "languageisocode", "respondent", "branch", "date", "docname", "importance", "conclusion", "judges", "text", "violated_articles", "violated_paragraphs", "violated_bulletpoints", "non_violated_articles", "non_violated_paragraphs", "non_violated_bulletpoints", "violated"], "columns_mapping": {"itemid": "itemid", "languageisocode": "languageisocode", "respondent": "respondent", "branch": "branch", "date": "date", "docname": "docname", "importance": "importance", "conclusion": "conclusion", "judges": "judges", "text": "text", "violated_articles": "violated_articles", "violated_paragraphs": "violated_paragraphs", "violated_bulletpoints": "violated_bulletpoints", "non_violated_articles": "non_violated_articles", "non_violated_paragraphs": "non_violated_paragraphs", "non_violated_bulletpoints": "non_violated_bulletpoints", "violated": "violated"}, "dataset_description": "The ECHR Cases dataset is designed for experimentation of neural judgment prediction, as in the original 2019 ACL paper \"Neural Legal Judgment Prediction in English\".\n", "dataset_name": "jonathanli/echr"}}, "tags": [], "is_gated": false}, "cjvt/sentinews": {"dataset_name": "cjvt/sentinews", "description": "SentiNews is a Slovenian sentiment classification dataset, consisting of news articles manually annotated with their \nsentiment by between 2 and 6 annotators. The news articles contain political, business, economic and financial content \nfrom the Slovenian news portals 24ur, Dnevnik, Finance, Rtvslo, and \u017durnal24. 
The texts were annotated using the \nfive-level Lickert scale (1 \u2013 very negative, 2 \u2013 negative, 3 \u2013 neutral, 4 \u2013 positive, and 5 \u2013 very positive) on three \nlevels of granularity, i.e. on the document, paragraph, and sentence level. The final sentiment is determined using \nthe following criterion: negative (if average of scores \u2264 2.4); neutral (if average of scores is between 2.4 and 3.6); \npositive (average of annotated scores \u2265 3.6).", "downloads": 47, "configs": {"document_level": {"config_name": "document_level", "sample_row": "{\"nid\": \"1\", \"content\": \"\\\"Evropska komisija mora narediti analizo vzrokov r...\", \"sentiment\": \"\\\"neutral\\\"\"}", "columns": ["nid", "content", "sentiment"], "columns_mapping": {"nid": "nid", "content": "content", "sentiment": "sentiment"}, "dataset_description": "SentiNews is a Slovenian sentiment classification dataset, consisting of news articles manually annotated with their \nsentiment by between 2 and 6 annotators. The news articles contain political, business, economic and financial content \nfrom the Slovenian news portals 24ur, Dnevnik, Finance, Rtvslo, and \u017durnal24. The texts were annotated using the \nfive-level Lickert scale (1 \u2013 very negative, 2 \u2013 negative, 3 \u2013 neutral, 4 \u2013 positive, and 5 \u2013 very positive) on three \nlevels of granularity, i.e. on the document, paragraph, and sentence level. 
The final sentiment is determined using \nthe following criterion: negative (if average of scores \u2264 2.4); neutral (if average of scores is between 2.4 and 3.6); \npositive (average of annotated scores \u2265 3.6).\n", "dataset_name": "cjvt/sentinews"}, "paragraph_level": {"config_name": "paragraph_level", "sample_row": "{\"nid\": \"1\", \"content\": \"\\\"Evropska komisija mora narediti analizo vzrokov r...\", \"sentiment\": \"\\\"neutral\\\"\", \"pid\": \"1\"}", "columns": ["nid", "content", "sentiment", "pid"], "columns_mapping": {"nid": "nid", "content": "content", "sentiment": "sentiment", "pid": "pid"}, "dataset_description": "SentiNews is a Slovenian sentiment classification dataset, consisting of news articles manually annotated with their \nsentiment by between 2 and 6 annotators. The news articles contain political, business, economic and financial content \nfrom the Slovenian news portals 24ur, Dnevnik, Finance, Rtvslo, and \u017durnal24. The texts were annotated using the \nfive-level Lickert scale (1 \u2013 very negative, 2 \u2013 negative, 3 \u2013 neutral, 4 \u2013 positive, and 5 \u2013 very positive) on three \nlevels of granularity, i.e. on the document, paragraph, and sentence level. 
The final sentiment is determined using \nthe following criterion: negative (if average of scores \u2264 2.4); neutral (if average of scores is between 2.4 and 3.6); \npositive (average of annotated scores \u2265 3.6).\n", "dataset_name": "cjvt/sentinews"}, "sentence_level": {"config_name": "sentence_level", "sample_row": "{\"nid\": \"1\", \"content\": \"\\\"Evropska komisija mora narediti analizo vzrokov r...\", \"sentiment\": \"\\\"neutral\\\"\", \"pid\": \"1\", \"sid\": \"1\"}", "columns": ["nid", "content", "sentiment", "pid", "sid"], "columns_mapping": {"nid": "nid", "content": "content", "sentiment": "sentiment", "pid": "pid", "sid": "sid"}, "dataset_description": "SentiNews is a Slovenian sentiment classification dataset, consisting of news articles manually annotated with their \nsentiment by between 2 and 6 annotators. The news articles contain political, business, economic and financial content \nfrom the Slovenian news portals 24ur, Dnevnik, Finance, Rtvslo, and \u017durnal24. The texts were annotated using the \nfive-level Lickert scale (1 \u2013 very negative, 2 \u2013 negative, 3 \u2013 neutral, 4 \u2013 positive, and 5 \u2013 very positive) on three \nlevels of granularity, i.e. on the document, paragraph, and sentence level. 
The final sentiment is determined using \nthe following criterion: negative (if average of scores \u2264 2.4); neutral (if average of scores is between 2.4 and 3.6); \npositive (average of annotated scores \u2265 3.6).\n", "dataset_name": "cjvt/sentinews"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:sl", "slovenian sentiment", "news articles"], "is_gated": false}, "jakartaresearch/indo-movie-subtitle": {"dataset_name": "jakartaresearch/indo-movie-subtitle", "description": "This dataset is built as a playground for analyzing text on movie subtitle", "downloads": 23, "configs": {"default": {"config_name": "default", "sample_row": "{\"movie_title\": \"\\\"Bank.Robbers.The.Last.Great.Heist\\\"\", \"order\": \"\\\"5\\\"\", \"duration\": \"\\\"00:00:42,583 --> 00:00:46,375\\\"\", \"text\": \"\\\"adalah perilaku yang dinilai\\\\noleh hati nuranimu....\"}", "columns": ["movie_title", "order", "duration", "text"], "columns_mapping": {"movie_title": "movie_title", "order": "order", "duration": "duration", "text": "text"}, "dataset_description": "This dataset is built as a playground for analyzing text on movie subtitle\n", "dataset_name": "jakartaresearch/indo-movie-subtitle"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:id", "movie", "subtitle", "indonesian"], "is_gated": false}, "yhavinga/cnn_dailymail_dutch": {"dataset_name": "yhavinga/cnn_dailymail_dutch", "description": "CNN/DailyMail non-anonymized summarization dataset, translated to Dutch with ccmatrix.\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary", "downloads": 31, "configs": 
{"3.0.0": {"config_name": "3.0.0", "sample_row": "{\"article\": \"\\\"LONDEN, Engeland (Reuters) - Harry Potter-ster Da...\", \"highlights\": \"\\\"Harry Potter-ster Daniel Radcliffe krijgt \\\\u00a3 ...\", \"id\": \"\\\"42c027e4ff9730fbb3de84c1af0d2c506e41c3e4\\\"\"}", "columns": ["article", "highlights", "id"], "columns_mapping": {"article": "article", "highlights": "highlights", "id": "id"}, "dataset_description": "CNN/DailyMail non-anonymized summarization dataset, translated to Dutch with ccmatrix.\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary\n", "dataset_name": "yhavinga/cnn_dailymail_dutch"}}, "tags": ["task_categories:summarization", "task_ids:news-articles-summarization", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:nl"], "is_gated": false}, "SLPL/naab": {"dataset_name": "SLPL/naab", "description": "Huge corpora of textual data are always known to be a crucial need for training deep models such as transformer-based ones. This issue is emerging more in lower resource languages - like Farsi. We propose naab, the biggest cleaned and ready-to-use open-source textual corpus in Farsi. It contains about 130GB of data, 250 million paragraphs, and 15 billion words. The project name is derived from the Farsi word \u0646\u0627\u0628 which means pure and high-grade.", "downloads": 64, "configs": {"all": {"config_name": "all", "sample_row": "{\"text\": \"\\\" \\\\u062a\\\\u0648\\\\u06cc \\\\u0628\\\\u0633\\\\u0627\\\\u0637\\\\u063...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "Huge corpora of textual data are always known to be a crucial need for training deep models such as transformer-based ones. This issue is emerging more in lower resource languages - like Farsi. 
We propose naab, the biggest cleaned and ready-to-use open-source textual corpus in Farsi. It contains about 130GB of data, 250 million paragraphs, and 15 billion words. The project name is derived from the Farsi word \u0646\u0627\u0628 which means pure and high-grade.\n", "dataset_name": "SLPL/naab"}}, "tags": ["task_categories:fill-mask", "task_categories:text-generation", "task_ids:language-modeling", "task_ids:masked-language-modeling", "multilinguality:monolingual", "language:fa"], "is_gated": false}, "jakartaresearch/inglish": {"dataset_name": "jakartaresearch/inglish", "description": "This dataset is built as a playground for beginner to make a translation model for Indonesian and English.", "downloads": 158, "configs": {"default": {"config_name": "default", "sample_row": "{\"english\": \"\\\"Amrozi accused his brother, whom he called \\\\\\\"the ...\", \"indonesian\": \"\\\"Amrozi menuduh saudaranya, yang dia sebut \\\\\\\"saksi...\"}", "columns": ["english", "indonesian"], "columns_mapping": {"english": "english", "indonesian": "indonesian"}, "dataset_description": "This dataset is built as a playground for beginner to make a translation model for Indonesian and English.\n", "dataset_name": "jakartaresearch/inglish"}}, "tags": ["task_categories:translation", "annotations_creators:machine-generated", "multilinguality:translation", "source_datasets:original", "language:id", "language:en", "indonesian", "english", "translation"], "is_gated": false}, "yhavinga/xsum_dutch": {"dataset_name": "yhavinga/xsum_dutch", "description": "Extreme Summarization (XSum) Dataset.\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.", "downloads": 111, "configs": {"1.0.0": {"config_name": "1.0.0", "sample_row": "{\"document\": \"\\\"De volledige kosten van de schade in Newton Stewa...\", \"summary\": \"\\\"Opruimingsoperaties worden voortgezet in de Schot...\", \"id\": 
\"\\\"35232142\\\"\"}", "columns": ["document", "summary", "id"], "columns_mapping": {"document": "document", "summary": "summary", "id": "id"}, "dataset_description": "\nExtreme Summarization (XSum) Dataset.\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.\n\n", "dataset_name": "yhavinga/xsum_dutch"}}, "tags": ["task_categories:summarization", "task_ids:news-articles-summarization", "language:nl"], "is_gated": false}, "RCC-MSU/collection3": {"dataset_name": "RCC-MSU/collection3", "description": "Collection3 is a Russian dataset for named entity recognition annotated with LOC (location), PER (person), and ORG (organization) tags.\n\nDataset is based on collection Persons-1000 originally containing 1000 news documents labeled only with names of persons.\nAdditional labels were added by Valerie Mozharova and Natalia Loukachevitch.\nConversion to the IOB2 format and splitting into train, validation and test sets was done by DeepPavlov team.\n\nFor more details see https://ieeexplore.ieee.org/document/7584769 and http://labinform.ru/pub/named_entities/index.htm", "downloads": 92, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u0414\\\\u043e\\\\u043f\\\\u043e\\\\u043b\\\\u043d\\\\u0435\\\\u043d...\", \"ner_tags\": \"[0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "Collection3 is a Russian dataset for named entity recognition annotated with LOC (location), PER (person), and ORG (organization) tags.\n\nDataset is based on collection Persons-1000 originally containing 1000 news documents labeled only with names of persons.\nAdditional labels were added by Valerie Mozharova and Natalia Loukachevitch.\nConversion to the IOB2 format and splitting into train, validation 
and test sets was done by DeepPavlov team.\n\nFor more details see https://ieeexplore.ieee.org/document/7584769 and http://labinform.ru/pub/named_entities/index.htm\n", "dataset_name": "RCC-MSU/collection3"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:other", "multilinguality:monolingual", "language:ru"], "is_gated": false}, "OxAISH-AL-LLM/wiki_toxic": {"dataset_name": "OxAISH-AL-LLM/wiki_toxic", "description": "Jigsaw Toxic Comment Challenge dataset. This dataset was the basis of a Kaggle competition run by Jigsaw", "downloads": 386, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"794c30aff0931384\\\"\", \"comment_text\": \"\\\"And that's not a personal attack^^ ?\\\"\", \"label\": \"0\"}", "columns": ["id", "comment_text", "label"], "columns_mapping": {"id": "id", "comment_text": "comment_text", "label": "label"}, "dataset_description": "Jigsaw Toxic Comment Challenge dataset. This dataset was the basis of a Kaggle competition run by Jigsaw\n", "dataset_name": "OxAISH-AL-LLM/wiki_toxic"}}, "tags": ["task_categories:text-classification", "task_ids:hate-speech-detection", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|other", "language:en", "wikipedia", "toxicity", "toxic comments"], "is_gated": false}, "BDas/ArabicNLPDataset": {"dataset_name": "BDas/ArabicNLPDataset", "description": "The dataset, prepared in Arabic, includes 10.000 tests, 10.000 validations and 80000 train data.\nThe data is composed of customer comments and created from e-commerce sites.", "downloads": 22, "configs": {"ArabicData": {"config_name": "ArabicData", "sample_row": "{\"text\": \"\\\"\\\\ufeff\\\\u062d\\\\u062f\\\\u064a\\\\u062f \\\\u062e\\\\u0641\\\\u064a...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The dataset, prepared in Arabic, includes 10.000 tests, 
10.000 validations and 80000 train data.\nThe data is composed of customer comments and created from e-commerce sites.\n", "dataset_name": "BDas/ArabicNLPDataset"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "BDas/EnglishNLPDataset": {"dataset_name": "BDas/EnglishNLPDataset", "description": "The dataset, prepared in English, includes 10.000 tests, 10.000 validations and 80000 train data.\nThe data is composed of customer comments and created from e-commerce sites.", "downloads": 11, "configs": {"EnglishData": {"config_name": "EnglishData", "sample_row": "{\"text\": \"\\\"my fav\\\"\", \"label\": \"2\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The dataset, prepared in English, includes 10.000 tests, 10.000 validations and 80000 train data.\nThe data is composed of customer comments and created from e-commerce sites.\n", "dataset_name": "BDas/EnglishNLPDataset"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "alexandrainst/scandi-qa": {"dataset_name": "alexandrainst/scandi-qa", "description": "ScandiQA is a dataset of questions and answers in the Danish, Norwegian, and Swedish\nlanguages. All samples come from the Natural Questions (NQ) dataset, which is a large\nquestion answering dataset from Google searches. The Scandinavian questions and answers\ncome from the MKQA dataset, where 10,000 NQ samples were manually translated into,\namong others, Danish, Norwegian, and Swedish. 
However, this did not include a\ntranslated context, hindering the training of extractive question answering models.\n\nWe merged the NQ dataset with the MKQA dataset, and extracted contexts as either \"long\nanswers\" from the NQ dataset, being the paragraph in which the answer was found, or\notherwise we extract the context by locating the paragraphs which have the largest\ncosine similarity to the question, and which contains the desired answer.\n\nFurther, many answers in the MKQA dataset were \"language normalised\": for instance, all\ndate answers were converted to the format \"YYYY-MM-DD\", meaning that in most cases\nthese answers are not appearing in any paragraphs. We solve this by extending the MKQA\nanswers with plausible \"answer candidates\", being slight perturbations or translations\nof the answer.\n\nWith the contexts extracted, we translated these to Danish, Swedish and Norwegian using\nthe DeepL translation service for Danish and Swedish, and the Google Translation\nservice for Norwegian. After translation we ensured that the Scandinavian answers do\nindeed occur in the translated contexts.\n\nAs we are filtering the MKQA samples at both the \"merging stage\" and the \"translation\nstage\", we are not able to fully convert the 10,000 samples to the Scandinavian\nlanguages, and instead get roughly 8,000 samples per language. These have further been\nsplit into a training, validation and test split, with the former two containing\nroughly 750 samples. The splits have been created in such a way that the proportion of\nsamples without an answer is roughly the same in each split.", "downloads": 13, "configs": {"da": {"config_name": "da", "sample_row": "{\"id\": \"\\\"6277735658261425592\\\"\", \"question\": \"\\\"Hvor stammer udr\\\\u00e5bet great scott fra?\\\"\", \"answers.text\": \"[\\\"\\\"]\", \"answers.answer_start\": \"[-1]\", \"context\": \"\\\"Great Scott! 
er en indskydelse af overraskelse, f...\", \"answers_en.text\": \"[\\\"\\\"]\", \"answers_en.answer_start\": \"[-1]\", \"context_en\": \"\\\"Great Scott! is an interjection of surprise, amaz...\", \"title_en\": \"\\\"Great Scott\\\"\"}", "columns": ["id", "question", "answers_text", "answers_answer_start", "context", "answers_en_text", "answers_en_answer_start", "context_en", "title_en"], "columns_mapping": {"id": "id", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "context": "context", "answers_en.text": "answers_en_text", "answers_en.answer_start": "answers_en_answer_start", "context_en": "context_en", "title_en": "title_en"}, "dataset_description": "\nScandiQA is a dataset of questions and answers in the Danish, Norwegian, and Swedish\nlanguages. All samples come from the Natural Questions (NQ) dataset, which is a large\nquestion answering dataset from Google searches. The Scandinavian questions and answers\ncome from the MKQA dataset, where 10,000 NQ samples were manually translated into,\namong others, Danish, Norwegian, and Swedish. However, this did not include a\ntranslated context, hindering the training of extractive question answering models.\n\nWe merged the NQ dataset with the MKQA dataset, and extracted contexts as either \"long\nanswers\" from the NQ dataset, being the paragraph in which the answer was found, or\notherwise we extract the context by locating the paragraphs which have the largest\ncosine similarity to the question, and which contains the desired answer.\n\nFurther, many answers in the MKQA dataset were \"language normalised\": for instance, all\ndate answers were converted to the format \"YYYY-MM-DD\", meaning that in most cases\nthese answers are not appearing in any paragraphs. 
We solve this by extending the MKQA\nanswers with plausible \"answer candidates\", being slight perturbations or translations\nof the answer.\n\nWith the contexts extracted, we translated these to Danish, Swedish and Norwegian using\nthe DeepL translation service for Danish and Swedish, and the Google Translation\nservice for Norwegian. After translation we ensured that the Scandinavian answers do\nindeed occur in the translated contexts.\n\nAs we are filtering the MKQA samples at both the \"merging stage\" and the \"translation\nstage\", we are not able to fully convert the 10,000 samples to the Scandinavian\nlanguages, and instead get roughly 8,000 samples per language. These have further been\nsplit into a training, validation and test split, with the former two containing\nroughly 750 samples. The splits have been created in such a way that the proportion of\nsamples without an answer is roughly the same in each split.\n", "dataset_name": "alexandrainst/scandi-qa"}, "sv": {"config_name": "sv", "sample_row": "{\"id\": \"\\\"6277735658261425592\\\"\", \"question\": \"\\\"Var kommer frasen great scott fr\\\\u00e5n?\\\"\", \"answers.text\": \"[\\\"\\\"]\", \"answers.answer_start\": \"[-1]\", \"context\": \"\\\"Great Scott! \\\\u00e4r en interjektion av \\\\u00f6ver...\", \"answers_en.text\": \"[\\\"\\\"]\", \"answers_en.answer_start\": \"[-1]\", \"context_en\": \"\\\"Great Scott! 
is an interjection of surprise, amaz...\", \"title_en\": \"\\\"Great Scott\\\"\"}", "columns": ["id", "question", "answers_text", "answers_answer_start", "context", "answers_en_text", "answers_en_answer_start", "context_en", "title_en"], "columns_mapping": {"id": "id", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "context": "context", "answers_en.text": "answers_en_text", "answers_en.answer_start": "answers_en_answer_start", "context_en": "context_en", "title_en": "title_en"}, "dataset_description": "\nScandiQA is a dataset of questions and answers in the Danish, Norwegian, and Swedish\nlanguages. All samples come from the Natural Questions (NQ) dataset, which is a large\nquestion answering dataset from Google searches. The Scandinavian questions and answers\ncome from the MKQA dataset, where 10,000 NQ samples were manually translated into,\namong others, Danish, Norwegian, and Swedish. However, this did not include a\ntranslated context, hindering the training of extractive question answering models.\n\nWe merged the NQ dataset with the MKQA dataset, and extracted contexts as either \"long\nanswers\" from the NQ dataset, being the paragraph in which the answer was found, or\notherwise we extract the context by locating the paragraphs which have the largest\ncosine similarity to the question, and which contains the desired answer.\n\nFurther, many answers in the MKQA dataset were \"language normalised\": for instance, all\ndate answers were converted to the format \"YYYY-MM-DD\", meaning that in most cases\nthese answers are not appearing in any paragraphs. We solve this by extending the MKQA\nanswers with plausible \"answer candidates\", being slight perturbations or translations\nof the answer.\n\nWith the contexts extracted, we translated these to Danish, Swedish and Norwegian using\nthe DeepL translation service for Danish and Swedish, and the Google Translation\nservice for Norwegian. 
After translation we ensured that the Scandinavian answers do\nindeed occur in the translated contexts.\n\nAs we are filtering the MKQA samples at both the \"merging stage\" and the \"translation\nstage\", we are not able to fully convert the 10,000 samples to the Scandinavian\nlanguages, and instead get roughly 8,000 samples per language. These have further been\nsplit into a training, validation and test split, with the former two containing\nroughly 750 samples. The splits have been created in such a way that the proportion of\nsamples without an answer is roughly the same in each split.\n", "dataset_name": "alexandrainst/scandi-qa"}, "no": {"config_name": "no", "sample_row": "{\"id\": \"\\\"6277735658261425592\\\"\", \"question\": \"\\\"Hvor kommer uttrykket great scott fra?\\\"\", \"answers.text\": \"[\\\"\\\"]\", \"answers.answer_start\": \"[-1]\", \"context\": \"\\\"Great Scott\\\\nFlott Scott! er et innslag av overra...\", \"answers_en.text\": \"[\\\"\\\"]\", \"answers_en.answer_start\": \"[-1]\", \"context_en\": \"\\\"Great Scott! is an interjection of surprise, amaz...\", \"title_en\": \"\\\"Great Scott\\\"\"}", "columns": ["id", "question", "answers_text", "answers_answer_start", "context", "answers_en_text", "answers_en_answer_start", "context_en", "title_en"], "columns_mapping": {"id": "id", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "context": "context", "answers_en.text": "answers_en_text", "answers_en.answer_start": "answers_en_answer_start", "context_en": "context_en", "title_en": "title_en"}, "dataset_description": "\nScandiQA is a dataset of questions and answers in the Danish, Norwegian, and Swedish\nlanguages. All samples come from the Natural Questions (NQ) dataset, which is a large\nquestion answering dataset from Google searches. 
The Scandinavian questions and answers\ncome from the MKQA dataset, where 10,000 NQ samples were manually translated into,\namong others, Danish, Norwegian, and Swedish. However, this did not include a\ntranslated context, hindering the training of extractive question answering models.\n\nWe merged the NQ dataset with the MKQA dataset, and extracted contexts as either \"long\nanswers\" from the NQ dataset, being the paragraph in which the answer was found, or\notherwise we extract the context by locating the paragraphs which have the largest\ncosine similarity to the question, and which contains the desired answer.\n\nFurther, many answers in the MKQA dataset were \"language normalised\": for instance, all\ndate answers were converted to the format \"YYYY-MM-DD\", meaning that in most cases\nthese answers are not appearing in any paragraphs. We solve this by extending the MKQA\nanswers with plausible \"answer candidates\", being slight perturbations or translations\nof the answer.\n\nWith the contexts extracted, we translated these to Danish, Swedish and Norwegian using\nthe DeepL translation service for Danish and Swedish, and the Google Translation\nservice for Norwegian. After translation we ensured that the Scandinavian answers do\nindeed occur in the translated contexts.\n\nAs we are filtering the MKQA samples at both the \"merging stage\" and the \"translation\nstage\", we are not able to fully convert the 10,000 samples to the Scandinavian\nlanguages, and instead get roughly 8,000 samples per language. These have further been\nsplit into a training, validation and test split, with the former two containing\nroughly 750 samples. 
The splits have been created in such a way that the proportion of\nsamples without an answer is roughly the same in each split.\n", "dataset_name": "alexandrainst/scandi-qa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "multilinguality:multilingual", "source_datasets:mkqa", "source_datasets:natural_questions", "language:da", "language:sv", "language:no"], "is_gated": false}, "opus/liv4ever": {"dataset_name": "opus/liv4ever", "description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.", "downloads": 152, "configs": {"en-liv": {"config_name": "en-liv", "sample_row": "{\"translation.en\": \"\\\"Best wishes to our dear colleague, researcher of ...\", \"translation.liv\": \"\\\"P\\\\u01dfgi\\\\u0146 v\\\\u022fnn\\\\u00f5 m\\\\u00e4d kol\\\\u011...\"}", "columns": ["translation_en", "translation_liv"], "columns_mapping": {"translation.en": "translation_en", "translation.liv": "translation_liv"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). 
The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "et-liv": {"config_name": "et-liv", "sample_row": "{\"translation.et\": \"\\\"K\\\\u00e4ega lehvitab talle nagu n\\\\u00e4gemiseni.\\\"...\", \"translation.liv\": \"\\\"K\\\\u00e4dk\\\\u00f5ks v\\\\u0113tsi\\\\u0146\\\\u021b\\\\u00f5b t...\"}", "columns": ["translation_et", "translation_liv"], "columns_mapping": {"translation.et": "translation_et", "translation.liv": "translation_liv"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "fr-liv": {"config_name": "fr-liv", "sample_row": "{\"translation.fr\": \"\\\"\\\\u00c9cartez-vous, s\\\\u00e9parez-vous\\\"\", \"translation.liv\": \"\\\"Lagg\\\\u00f5g\\\\u00f5d r\\\\u016bimig\\\\u00f5d\\\"\"}", "columns": ["translation_fr", "translation_liv"], "columns_mapping": {"translation.fr": "translation_fr", "translation.liv": "translation_liv"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). 
The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "liv-lv": {"config_name": "liv-lv", "sample_row": "{\"translation.liv\": \"\\\"K\\\\u00e4dk\\\\u00f5ks v\\\\u0113tsi\\\\u0146\\\\u021b\\\\u00f5b t...\", \"translation.lv\": \"\\\"Ar roku m\\\\u0101j vi\\\\u0146am it k\\\\u0101 uz redz\\\\u0...\"}", "columns": ["translation_liv", "translation_lv"], "columns_mapping": {"translation.liv": "translation_liv", "translation.lv": "translation_lv"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "en": {"config_name": "en", "sample_row": "{\"text\": \"\\\"As hydronyms are generally ancient, the names of ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "et": {"config_name": "et", "sample_row": "{\"text\": \"\\\"Kus sa l\\\\u00e4hed, marjaneitsi\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). 
The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "fr": {"config_name": "fr", "sample_row": "{\"text\": \"\\\"\\\\u00c9cartez-vous, s\\\\u00e9parez-vous\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "liv": {"config_name": "liv", "sample_row": "{\"text\": \"\\\"Kus sa l\\\\u01dfd, M\\\\u014d\\\\u0157\\\\u00f5neitst\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "lv": {"config_name": "lv", "sample_row": "{\"text\": \"\\\"Valsts Prezidenta prieks\\\\u030cva\\\\u0304rds\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). 
The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}}, "tags": [], "is_gated": false}, "bigbio/biosses": {"dataset_name": "bigbio/biosses", "description": "BIOSSES computes similarity of biomedical sentences by utilizing WordNet as the\ngeneral domain ontology and UMLS as the biomedical domain specific ontology.\nThe original paper outlines the approaches with respect to using annotator\nscore as golden standard. Source view will return all annotator score\nindividually whereas the Bigbio view will return the mean of the annotator\nscore.", "downloads": 51, "configs": {"biosses_source": {"config_name": "biosses_source", "sample_row": "{\"id\": \"0\", \"document_id\": \"1\", \"text_1\": \"\\\"It has recently been shown that Craf is essential...\", \"text_2\": \"\\\"It has recently become evident that Craf is essen...\", \"annotator_a\": \"4\", \"annotator_b\": \"4\", \"annotator_c\": \"4\", \"annotator_d\": \"4\", \"annotator_e\": \"4\"}", "columns": ["id", "document_id", "text_1", "text_2", "annotator_a", "annotator_b", "annotator_c", "annotator_d", "annotator_e"], "columns_mapping": {"id": "id", "document_id": "document_id", "text_1": "text_1", "text_2": "text_2", "annotator_a": "annotator_a", "annotator_b": "annotator_b", "annotator_c": "annotator_c", "annotator_d": "annotator_d", "annotator_e": "annotator_e"}, "dataset_description": "\nBIOSSES computes similarity of biomedical sentences by utilizing WordNet as the\ngeneral domain ontology and UMLS as the biomedical domain specific ontology.\nThe original paper outlines the approaches with respect to using annotator\nscore as golden standard. 
Source view will return all annotator score\nindividually whereas the Bigbio view will return the mean of the annotator\nscore.\n", "dataset_name": "bigbio/biosses"}, "biosses_bigbio_pairs": {"config_name": "biosses_bigbio_pairs", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"1\\\"\", \"text_1\": \"\\\"It has recently been shown that Craf is essential...\", \"text_2\": \"\\\"It has recently become evident that Craf is essen...\", \"label\": \"\\\"4.0\\\"\"}", "columns": ["id", "document_id", "text_1", "text_2", "label"], "columns_mapping": {"id": "id", "document_id": "document_id", "text_1": "text_1", "text_2": "text_2", "label": "label"}, "dataset_description": "\nBIOSSES computes similarity of biomedical sentences by utilizing WordNet as the\ngeneral domain ontology and UMLS as the biomedical domain specific ontology.\nThe original paper outlines the approaches with respect to using annotator\nscore as golden standard. Source view will return all annotator score\nindividually whereas the Bigbio view will return the mean of the annotator\nscore.\n", "dataset_name": "bigbio/biosses"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "eraldoluis/faquad": {"dataset_name": "eraldoluis/faquad", "description": "Academic secretaries and faculty members of higher education institutions face a common problem: \n the abundance of questions sent by academics \n whose answers are found in available institutional documents. \nThe official documents produced by Brazilian public universities are vast and disperse, \n which discourage students to further search for answers in such sources.\nIn order to lessen this problem, we present FaQuAD: \n a novel machine reading comprehension dataset \n in the domain of Brazilian higher education institutions. \nFaQuAD follows the format of SQuAD (Stanford Question Answering Dataset) [Rajpurkar et al. 2016]. 
\nIt comprises 900 questions about 249 reading passages (paragraphs), \n which were taken from 18 official documents of a computer science college \n from a Brazilian federal university \n and 21 Wikipedia articles related to Brazilian higher education system. \nAs far as we know, this is the first Portuguese reading comprehension dataset in this format.", "downloads": 186, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"26f2ae969a804ba392e5bd0c62d58896\\\"\", \"title\": \"\\\"UFMS\\\"\", \"context\": \"\\\"Universidade Federal de Mato Grosso do Sul (UFMS)...\", \"question\": \"\\\"O que \\\\u00e9 a UFMS?\\\"\", \"answers.text\": \"[\\\"uma institui\\\\u00e7\\\\u00e3o de ensino superior p\\\\u...\", \"answers.answer_start\": \"[52, 52, 52]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "Academic secretaries and faculty members of higher education institutions face a common problem: \n the abundance of questions sent by academics \n whose answers are found in available institutional documents. \nThe official documents produced by Brazilian public universities are vast and disperse, \n which discourage students to further search for answers in such sources.\nIn order to lessen this problem, we present FaQuAD: \n a novel machine reading comprehension dataset \n in the domain of Brazilian higher education institutions. \nFaQuAD follows the format of SQuAD (Stanford Question Answering Dataset) [Rajpurkar et al. 2016]. \nIt comprises 900 questions about 249 reading passages (paragraphs), \n which were taken from 18 official documents of a computer science college \n from a Brazilian federal university \n and 21 Wikipedia articles related to Brazilian higher education system. 
\nAs far as we know, this is the first Portuguese reading comprehension dataset in this format.\n", "dataset_name": "eraldoluis/faquad"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|wikipedia", "language:pt"], "is_gated": false}, "neulab/conala": {"dataset_name": "neulab/conala", "description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.", "downloads": 504, "configs": {"curated": {"config_name": "curated", "sample_row": "{\"question_id\": \"41067960\", \"intent\": \"\\\"How to convert a list of multiple integers into a...\", \"rewritten_intent\": \"\\\"Concatenate elements of a list 'x' of multiple in...\", \"snippet\": \"\\\"sum(d * 10 ** i for i, d in enumerate(x[::-1]))\\\"...\"}", "columns": ["question_id", "intent", "rewritten_intent", "snippet"], "columns_mapping": {"question_id": "question_id", "intent": "intent", "rewritten_intent": "rewritten_intent", "snippet": "snippet"}, "dataset_description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.\n", "dataset_name": "neulab/conala"}, "mined": {"config_name": "mined", "sample_row": "{\"question_id\": \"34705205\", \"parent_answer_post_id\": \"34705233\", \"prob\": \"0.8690001442846342\", \"snippet\": \"\\\"sorted(l, key=lambda x: (-int(x[1]), x[0]))\\\"\", \"intent\": \"\\\"Sort a nested list by two elements\\\"\", \"id\": \"\\\"34705205_34705233_0\\\"\"}", "columns": ["question_id", "parent_answer_post_id", "prob", "snippet", "intent", "id"], "columns_mapping": {"question_id": "question_id", "parent_answer_post_id": "parent_answer_post_id", 
"prob": "prob", "snippet": "snippet", "intent": "intent", "id": "id"}, "dataset_description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.\n", "dataset_name": "neulab/conala"}}, "tags": ["task_categories:text2text-generation", "multilinguality:monolingual", "source_datasets:original", "language:code", "code-generation"], "is_gated": false}, "codesue/kelly": {"dataset_name": "codesue/kelly", "description": "The Swedish Kelly list is a freely available frequency-based vocabulary list that comprises general-purpose language of modern Swedish. The list was generated from a large web-acquired corpus (SweWaC) of 114 million words dating from the 2010s. It is adapted to the needs of language learners and contains 8,425 most frequent lemmas that cover 80% of SweWaC.\\", "downloads": 12, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"1\", \"raw_frequency\": \"NaN\", \"relative_frequency\": \"NaN\", \"cefr_level\": \"\\\"A1\\\"\", \"source\": \"\\\"manual\\\"\", \"marker\": \"\\\"\\\"\", \"lemma\": \"\\\"andra\\\"\", \"pos\": \"\\\"numeral\\\"\", \"examples\": \"\\\"\\\"\"}", "columns": ["id", "raw_frequency", "relative_frequency", "cefr_level", "source", "marker", "lemma", "pos", "examples"], "columns_mapping": {"id": "id", "raw_frequency": "raw_frequency", "relative_frequency": "relative_frequency", "cefr_level": "cefr_level", "source": "source", "marker": "marker", "lemma": "lemma", "pos": "pos", "examples": "examples"}, "dataset_description": "The Swedish Kelly list is a freely available frequency-based vocabulary list that comprises general-purpose language of modern Swedish. The list was generated from a large web-acquired corpus (SweWaC) of 114 million words dating from the 2010s. 
It is adapted to the needs of language learners and contains 8,425 most frequent lemmas that cover 80% of SweWaC.", "dataset_name": "codesue/kelly"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "language:sv", "lexicon", "swedish", "CEFR"], "is_gated": false}, "PlanTL-GOB-ES/wnli-es": {"dataset_name": "PlanTL-GOB-ES/wnli-es", "description": "professional translation into Spanish of Winograd NLI dataset as published in GLUE Benchmark.\n The Winograd NLI dataset presents 855 sentence pairs, \n in which the first sentence contains an ambiguity and the second one a possible interpretation of it. \n The label indicates if the interpretation is correct (1) or not (0).", "downloads": 52, "configs": {"winograd": {"config_name": "winograd", "sample_row": "{\"sentence1\": \"\\\"Clav\\\\u00e9 una aguja en una zanahoria. Cuando saq...\", \"sentence2\": \"\\\"La zanahoria ten\\\\u00eda un agujero.\\\"\", \"label\": \"1\"}", "columns": ["sentence1", "sentence2", "label"], "columns_mapping": {"sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "\n professional translation into Spanish of Winograd NLI dataset as published in GLUE Benchmark.\n The Winograd NLI dataset presents 855 sentence pairs, \n in which the first sentence contains an ambiguity and the second one a possible interpretation of it. 
\n The label indicates if the interpretation is correct (1) or not (0).\n ", "dataset_name": "PlanTL-GOB-ES/wnli-es"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|glue", "language:es"], "is_gated": false}, "bigbio/gad": {"dataset_name": "bigbio/gad", "description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database", "downloads": 154, "configs": {"gad_fold0_source": {"config_name": "gad_fold0_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"this study proposes that A/A genotype at position...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold1_source": {"config_name": "gad_fold1_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"The @GENE$ Asp allele may be a genetic risk facto...\", \"label\": \"0\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold2_source": {"config_name": "gad_fold2_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"The @GENE$ gene is likely to be involved in the g...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying 
associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold3_source": {"config_name": "gad_fold3_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"In conclusion, a significant association between ...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold4_source": {"config_name": "gad_fold4_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"An interaction with hypertension in the associati...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold5_source": {"config_name": "gad_fold5_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"The polymorphism of @GENE$ promoter -969(G>C) is ...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold6_source": {"config_name": "gad_fold6_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"These results indicate that mutations in NLGN3 an...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", 
"sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold7_source": {"config_name": "gad_fold7_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"This study shows that the @GENE$ gene promoter po...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold8_source": {"config_name": "gad_fold8_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"Our findings suggest that the increased productio...\", \"label\": \"0\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold9_source": {"config_name": "gad_fold9_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"Our results support that @GENE$ and CD-105 are cl...\", \"label\": \"0\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold0_bigbio_text": {"config_name": "gad_fold0_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"this study proposes that A/A 
genotype at position...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold1_bigbio_text": {"config_name": "gad_fold1_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"The @GENE$ Asp allele may be a genetic risk facto...\", \"labels\": \"[\\\"0\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold2_bigbio_text": {"config_name": "gad_fold2_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"The @GENE$ gene is likely to be involved in the g...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold3_bigbio_text": {"config_name": "gad_fold3_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"In conclusion, a significant association between ...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, 
"dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold4_bigbio_text": {"config_name": "gad_fold4_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"An interaction with hypertension in the associati...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold5_bigbio_text": {"config_name": "gad_fold5_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"The polymorphism of @GENE$ promoter -969(G>C) is ...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold6_bigbio_text": {"config_name": "gad_fold6_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"These results indicate that mutations in NLGN3 an...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, 
"gad_fold7_bigbio_text": {"config_name": "gad_fold7_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"This study shows that the @GENE$ gene promoter po...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold8_bigbio_text": {"config_name": "gad_fold8_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"Our findings suggest that the increased productio...\", \"labels\": \"[\\\"0\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold9_bigbio_text": {"config_name": "gad_fold9_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"Our results support that @GENE$ and CD-105 are cl...\", \"labels\": \"[\\\"0\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_blurb_bigbio_text": {"config_name": "gad_blurb_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"this study proposes that A/A genotype at position...\", 
\"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/blurb": {"dataset_name": "bigbio/blurb", "description": "The BioCreative II Gene Mention task. The training corpus for the current task consists mainly of the training and testing corpora (text collections) from the BCI task, and the testing corpus for the current task consists of an additional 5,000 sentences that were held 'in reserve' from the previous task. In the current corpus, tokenization is not provided; instead participants are asked to identify a gene mention in a sentence by giving its start and end characters. As before, the training set consists of a set of sentences, and for each sentence a set of gene mentions (GENE annotations).\n\n- Homepage: https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/task-1a-gene-mention-tagging/\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: Overview of BioCreative II gene mention recognition\n https://link.springer.com/article/10.1186/gb-2008-9-s2-s2", "downloads": 184, "configs": {"bc5chem": {"config_name": "bc5chem", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Selegiline\\\", \\\"-\\\", \\\"induced\\\", \\\"postural\\\", \\\"hypote...\", \"type\": \"\\\"chemical\\\"\", \"ner_tags\": \"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "type", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "type": "type", "ner_tags": "ner_tags"}, "dataset_description": "The corpus consists of three separate sets of articles with diseases, 
chemicals and their relations annotated. The training (500 articles) and development (500 articles) sets were released to task participants in advance to support text-mining method development. The test set (500 articles) was used for final system performance evaluation.\n\n- Homepage: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-v-cdr-corpus\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: BioCreative V CDR task corpus: a resource for chemical disease relation extraction\n https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4860626/\n", "dataset_name": "bigbio/blurb"}, "bc5disease": {"config_name": "bc5disease", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Selegiline\\\", \\\"-\\\", \\\"induced\\\", \\\"postural\\\", \\\"hypote...\", \"type\": \"\\\"disease\\\"\", \"ner_tags\": \"[0, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "type", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "type": "type", "ner_tags": "ner_tags"}, "dataset_description": "The corpus consists of three separate sets of articles with diseases, chemicals and their relations annotated. The training (500 articles) and development (500 articles) sets were released to task participants in advance to support text-mining method development. 
The test set (500 articles) was used for final system performance evaluation.\n\n- Homepage: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-v-cdr-corpus\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: BioCreative V CDR task corpus: a resource for chemical disease relation extraction\n https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4860626/\n", "dataset_name": "bigbio/blurb"}, "bc2gm": {"config_name": "bc2gm", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Immunohistochemical\\\", \\\"staining\\\", \\\"was\\\", \\\"positi...\", \"type\": \"\\\"gene\\\"\", \"ner_tags\": \"[0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1...\"}", "columns": ["id", "tokens", "type", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "type": "type", "ner_tags": "ner_tags"}, "dataset_description": "The BioCreative II Gene Mention task. The training corpus for the current task consists mainly of the training and testing corpora (text collections) from the BCI task, and the testing corpus for the current task consists of an additional 5,000 sentences that were held 'in reserve' from the previous task. In the current corpus, tokenization is not provided; instead participants are asked to identify a gene mention in a sentence by giving its start and end characters. 
As before, the training set consists of a set of sentences, and for each sentence a set of gene mentions (GENE annotations).\n\n- Homepage: https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/task-1a-gene-mention-tagging/\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: Overview of BioCreative II gene mention recognition\n https://link.springer.com/article/10.1186/gb-2008-9-s2-s2\n", "dataset_name": "bigbio/blurb"}, "jnlpba": {"config_name": "jnlpba", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"-DOCSTART-\\\"]\", \"type\": \"\\\"protein, DNA, RNA, cell line, or cell type\\\"\", \"ner_tags\": \"[0]\"}", "columns": ["id", "tokens", "type", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "type": "type", "ner_tags": "ner_tags"}, "dataset_description": "The BioNLP / JNLPBA Shared Task 2004 involves the identification and classification of technical terms referring to concepts of interest to biologists in the domain of molecular biology. 
The task was organized by GENIA Project based on the annotations of the GENIA Term corpus (version 3.02).\n\n- Homepage: http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: Introduction to the Bio-entity Recognition Task at JNLPBA\n https://aclanthology.org/W04-1213\n", "dataset_name": "bigbio/blurb"}, "ncbi_disease": {"config_name": "ncbi_disease", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Identification\\\", \\\"of\\\", \\\"APC2\\\", \\\",\\\", \\\"a\\\", \\\"homolo...\", \"type\": \"\\\"disease\\\"\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]\"}", "columns": ["id", "tokens", "type", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "type": "type", "ner_tags": "ner_tags"}, "dataset_description": "[T]he NCBI disease corpus contains 6,892 disease mentions, which are mapped to 790 unique disease concepts. Of these, 88% link to a MeSH identifier, while the rest contain an OMIM identifier. We were able to link 91% of the mentions to a single disease concept, while the rest are described as a combination of concepts.\n\n- Homepage: https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: NCBI disease corpus: a resource for disease name recognition and concept normalization\n https://pubmed.ncbi.nlm.nih.gov/24393765/\n", "dataset_name": "bigbio/blurb"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "RussianNLP/tape": {"dataset_name": "RussianNLP/tape", "description": "The Winograd schema challenge composes tasks with syntactic ambiguity,\nwhich can be resolved with logic and reasoning (Levesque et al., 2012).\n\nThe texts for the Winograd schema problem are obtained using a semi-automatic \npipeline. 
First, lists of 11 typical grammatical structures with syntactic \nhomonymy (mainly case) are compiled. For example, two noun phrases with a \ncomplex subordinate: 'A trinket from Pompeii that has survived the centuries'.\nRequests corresponding to these constructions are submitted in search of the \nRussian National Corpus, or rather its sub-corpus with removed homonymy. In the \nresulting 2+k examples, homonymy is removed automatically with manual validation\nafterward. Each original sentence is split into multiple examples in the binary \nclassification format, indicating whether the homonymy is resolved correctly or\nnot.", "downloads": 149, "configs": {"winograd.raw": {"config_name": "winograd.raw", "sample_row": "{\"text\": \"\\\"\\\\u041d\\\\u043e \\\\u043f\\\\u043e\\\\u0442\\\\u043e\\\\u043c \\\\u044...\", \"label\": \"0\", \"options\": \"[\\\"\\\\u043f\\\\u0435\\\\u0432\\\\u0438\\\\u0446\\\\u0430\\\", \\\"\\\\u0442\\\\u...\", \"reference\": \"\\\"\\\\u043a\\\\u043e\\\\u0442\\\\u043e\\\\u0440\\\\u0430\\\\u044f\\\"\", \"homonymia_type\": \"1.1\", \"answer\": \"\\\"\\\\u0442\\\\u0443\\\\u0440\\\\u0446\\\\u0438\\\\u0438\\\"\"}", "columns": ["text", "label", "options", "reference", "homonymia_type", "answer"], "columns_mapping": {"text": "text", "label": "label", "options": "options", "reference": "reference", "homonymia_type": "homonymia_type", "answer": "answer"}, "dataset_description": "The Winograd schema challenge composes tasks with syntactic ambiguity,\nwhich can be resolved with logic and reasoning (Levesque et al., 2012).\n\nThe texts for the Winograd schema problem are obtained using a semi-automatic \npipeline. First, lists of 11 typical grammatical structures with syntactic \nhomonymy (mainly case) are compiled. 
For example, two noun phrases with a \ncomplex subordinate: 'A trinket from Pompeii that has survived the centuries'.\nRequests corresponding to these constructions are submitted in search of the \nRussian National Corpus, or rather its sub-corpus with removed homonymy. In the \nresulting 2+k examples, homonymy is removed automatically with manual validation\nafterward. Each original sentence is split into multiple examples in the binary \nclassification format, indicating whether the homonymy is resolved correctly or\nnot.", "dataset_name": "RussianNLP/tape"}, "ru_openbook.raw": {"config_name": "ru_openbook.raw", "sample_row": "{\"ID\": \"\\\"7-980\\\"\", \"question\": \"\\\"\\\\u0421\\\\u043e\\\\u043b\\\\u043d\\\\u0446\\\\u0435 \\\\u043e\\\\u0442...\", \"answer\": \"\\\"D\\\"\"}", "columns": ["ID", "question", "answer"], "columns_mapping": {"ID": "ID", "question": "question", "answer": "answer"}, "dataset_description": "OpenBookQA for Russian is mainly based on the work of (Mihaylov et al., 2018):\nit is a QA dataset with multiple-choice elementary-level science questions, \nwhich probe the understanding of 1k+ core science facts. The dataset is mainly \ncomposed of automatic translation and human validation and correction. 
", "dataset_name": "RussianNLP/tape"}, "ru_worldtree.raw": {"config_name": "ru_worldtree.raw", "sample_row": "{\"question\": \"\\\"\\\\u041d\\\\u0435\\\\u043a\\\\u043e\\\\u0442\\\\u043e\\\\u0440\\\\u044b\\\\...\", \"exam_name\": \"\\\"MCAS\\\"\", \"school_grade\": \"\\\"5\\\"\", \"knowledge_type\": \"\\\"MODEL\\\"\", \"answer\": \"\\\"C\\\"\"}", "columns": ["question", "exam_name", "school_grade", "knowledge_type", "answer"], "columns_mapping": {"question": "question", "exam_name": "exam_name", "school_grade": "school_grade", "knowledge_type": "knowledge_type", "answer": "answer"}, "dataset_description": "The WorldTree task is very similar to the pipeline on the OpenBookQA, the main\ndifference being the additional lists of facts and the logical order that is \nattached to the output of each answer to a question (Jansen et al., 2018).", "dataset_name": "RussianNLP/tape"}, "multiq.raw": {"config_name": "multiq.raw", "sample_row": "{\"support_text\": \"\\\"\\\\u0414\\\\u0430\\\\u043d\\\\u0438\\\\u0435\\\\u043b (\\\\u0414\\\\u043...\", \"main_text\": \"\\\"\\\\u0427\\\\u0424\\\\u0420 \\\\u041a\\\\u043b\\\\u0443\\\\u0436 \\\\u201...\", \"question\": \"\\\"\\\\u0412 \\\\u043a\\\\u0430\\\\u043a\\\\u043e\\\\u0439 \\\\u043b\\\\u043...\", \"bridge_answers\": \"[{\\\"label\\\": \\\"passage\\\", \\\"offset\\\": 738, \\\"length\\\": 8, ...\", \"main_answers\": \"[{\\\"label\\\": \\\"passage\\\", \\\"offset\\\": 294, \\\"length\\\": 14,...\"}", "columns": ["support_text", "main_text", "question", "bridge_answers", "main_answers"], "columns_mapping": {"support_text": "support_text", "main_text": "main_text", "question": "question", "bridge_answers": "bridge_answers", "main_answers": "main_answers"}, "dataset_description": "Multi-hop reasoning has been the least addressed QA direction for Russian. We \nhave developed a semi-automatic pipeline for multi-hop dataset generation based \non Wikidata.\n\nFirst, we extract the triplets from Wikidata and search for their intersections. 
\nTwo triplets (subject, verb, object) are needed to compose an answerable multi-hop \nquestion. For instance, the question 'What continent is the country of which \nJohannes Block was a citizen?' is formed by a sequence of five graph units: 'Block, \nJohannes', 'citizenship', 'Germany', 'part of the world', 'Europe'. Second, several \nhundreds of the question templates are curated by a few authors manually, which are\nfurther used to fine-tune ruT5-largeto generate multi-hop questions given a \nfive-fold sequence. Third, the resulting questions undergo a paraphrasing and manual\nvalidation procedure to control the quality and diversity. Finally, each question is\nlinked to two Wikipedia paragraphs, where all graph units appear in the natural \nlanguage. The task is to select the answer span using information from both \nparagraphs.", "dataset_name": "RussianNLP/tape"}, "chegeka.raw": {"config_name": "chegeka.raw", "sample_row": "{\"question_id\": \"0\", \"question\": \"\\\"\\\\u0421\\\\u043a\\\\u0430\\\\u0436\\\\u0438\\\\u0442\\\\u0435 \\\\u043f...\", \"topic\": \"\\\"\\\\u0412 \\\\u041f\\\\u0415\\\\u0420\\\\u0415\\\\u0412\\\\u041e\\\\u0414...\", \"author\": \"\\\"\\\\u042e\\\\u0440\\\\u0438\\\\u0439 \\\\u0413\\\\u0440\\\\u0438\\\\u0448...\", \"tour_name\": \"\\\"\\\\\\\"\\\\u0421\\\\u0432\\\\u043e\\\\u044f \\\\u0438\\\\u0433\\\\u0440\\\\u04...\", \"tour_link\": \"\\\"https://db.chgk.info/tour/grishov\\\"\", \"answer\": \"\\\"\\\\u0422\\\\u0430\\\\u043d\\\\u043a\\\\u0430\\\"\"}", "columns": ["question_id", "question", "topic", "author", "tour_name", "tour_link", "answer"], "columns_mapping": {"question_id": "question_id", "question": "question", "topic": "topic", "author": "author", "tour_name": "tour_name", "tour_link": "tour_link", "answer": "answer"}, "dataset_description": "The CheGeKa game setup is similar to Jeopardy. The player should come up with \nthe answer to the question basing on wit, commonsense and deep knowledge. 
\nThe task format is QA with a free response form and is based on the reviewed \nunpublished data subsets by (Mikhalkova, 2021).", "dataset_name": "RussianNLP/tape"}, "sit_ethics.raw": {"config_name": "sit_ethics.raw", "sample_row": "{\"source\": \"\\\"lenta\\\"\", \"text\": \"\\\"\\\\u0420\\\\u043e\\\\u0441\\\\u0441\\\\u0438\\\\u044f\\\\u043d\\\\u0435 ...\", \"sit_virtue\": \"0\", \"sit_moral\": \"1\", \"sit_law\": \"0\", \"sit_justice\": \"0\", \"sit_util\": \"0\"}", "columns": ["source", "text", "sit_virtue", "sit_moral", "sit_law", "sit_justice", "sit_util"], "columns_mapping": {"source": "source", "text": "text", "sit_virtue": "sit_virtue", "sit_moral": "sit_moral", "sit_law": "sit_law", "sit_justice": "sit_justice", "sit_util": "sit_util"}, "dataset_description": "The Ethics dataset for Russian is created from scratch for the first time, relying \non the design compatible with (Hendrycks et al., 2021). The task is to predict \nhuman ethical judgments about diverse text situations, namely, to identify the \npresence of concepts in normative ethics, such as virtue, law, moral, justice, and \nutilitarianism.", "dataset_name": "RussianNLP/tape"}, "per_ethics.raw": {"config_name": "per_ethics.raw", "sample_row": "{\"source\": \"\\\"lenta\\\"\", \"text\": \"\\\"\\\\u0416\\\\u0443\\\\u0440\\\\u043d\\\\u0430\\\\u043b\\\\u0438\\\\u0441\\\\...\", \"per_virtue\": \"1\", \"per_moral\": \"0\", \"per_law\": \"1\", \"per_justice\": \"1\", \"per_util\": \"0\"}", "columns": ["source", "text", "per_virtue", "per_moral", "per_law", "per_justice", "per_util"], "columns_mapping": {"source": "source", "text": "text", "per_virtue": "per_virtue", "per_moral": "per_moral", "per_law": "per_law", "per_justice": "per_justice", "per_util": "per_util"}, "dataset_description": "The Ethics dataset for Russian is created from scratch for the first time, relying \non the design compatible with (Hendrycks et al., 2021). 
The task is to predict \nhuman ethical judgments about diverse text situations, namely, to evaluate the \npositive or negative implementation of five concepts in normative ethics (virtue, \nlaw, moral, justice, and utilitarianism) with 'yes' and 'no' ratings.", "dataset_name": "RussianNLP/tape"}, "winograd.episodes": {"config_name": "winograd.episodes", "sample_row": "{\"text\": \"\\\"\\\\u041d\\\\u0435 \\\\u043c\\\\u0435\\\\u043d\\\\u0435\\\\u0435 \\\\u043...\", \"label\": \"1\", \"options\": \"[\\\"\\\\u043f\\\\u0430\\\\u043b\\\\u044c\\\\u043c\\\\u0430\\\", \\\"\\\\u0410\\\\u...\", \"reference\": \"\\\"\\\\u043a\\\\u043e\\\\u0442\\\\u043e\\\\u0440\\\\u0430\\\\u044f\\\"\", \"homonymia_type\": \"1.1\", \"answer\": \"\\\"\\\\u043f\\\\u0430\\\\u043b\\\\u044c\\\\u043c\\\\u0430\\\"\", \"perturbation\": \"\\\"winograd\\\"\", \"episode\": \"[15]\"}", "columns": ["text", "label", "options", "reference", "homonymia_type", "answer", "perturbation", "episode"], "columns_mapping": {"text": "text", "label": "label", "options": "options", "reference": "reference", "homonymia_type": "homonymia_type", "answer": "answer", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "The Winograd schema challenge composes tasks with syntactic ambiguity,\nwhich can be resolved with logic and reasoning (Levesque et al., 2012).\n\nThe texts for the Winograd schema problem are obtained using a semi-automatic \npipeline. First, lists of 11 typical grammatical structures with syntactic \nhomonymy (mainly case) are compiled. For example, two noun phrases with a \ncomplex subordinate: 'A trinket from Pompeii that has survived the centuries'.\nRequests corresponding to these constructions are submitted in search of the \nRussian National Corpus, or rather its sub-corpus with removed homonymy. In the \nresulting 2+k examples, homonymy is removed automatically with manual validation\nafterward. 
Each original sentence is split into multiple examples in the binary \nclassification format, indicating whether the homonymy is resolved correctly or\nnot.", "dataset_name": "RussianNLP/tape"}, "ru_openbook.episodes": {"config_name": "ru_openbook.episodes", "sample_row": "{\"ID\": \"\\\"7-674\\\"\", \"question\": \"\\\"\\\\u0415\\\\u0441\\\\u043b\\\\u0438 \\\\u0436\\\\u0438\\\\u0432\\\\u043e...\", \"answer\": \"\\\"A\\\"\", \"perturbation\": \"\\\"ru_openbook\\\"\", \"episode\": \"[11]\"}", "columns": ["ID", "question", "answer", "perturbation", "episode"], "columns_mapping": {"ID": "ID", "question": "question", "answer": "answer", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "OpenBookQA for Russian is mainly based on the work of (Mihaylov et al., 2018):\nit is a QA dataset with multiple-choice elementary-level science questions, \nwhich probe the understanding of 1k+ core science facts. The dataset is mainly \ncomposed of automatic translation and human validation and correction. 
", "dataset_name": "RussianNLP/tape"}, "ru_worldtree.episodes": {"config_name": "ru_worldtree.episodes", "sample_row": "{\"question\": \"\\\"\\\\u0422\\\\u0443\\\\u043d\\\\u0435\\\\u0446 - \\\\u044d\\\\u0442\\\\u04...\", \"exam_name\": \"\\\"MCAS\\\"\", \"school_grade\": \"\\\"5\\\"\", \"knowledge_type\": \"\\\"CAUSAL,MODEL\\\"\", \"answer\": \"\\\"A\\\"\", \"perturbation\": \"\\\"ru_worldtree\\\"\", \"episode\": \"[10, 11]\"}", "columns": ["question", "exam_name", "school_grade", "knowledge_type", "answer", "perturbation", "episode"], "columns_mapping": {"question": "question", "exam_name": "exam_name", "school_grade": "school_grade", "knowledge_type": "knowledge_type", "answer": "answer", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "The WorldTree task is very similar to the pipeline on the OpenBookQA, the main\ndifference being the additional lists of facts and the logical order that is \nattached to the output of each answer to a question (Jansen et al., 2018).", "dataset_name": "RussianNLP/tape"}, "multiq.episodes": {"config_name": "multiq.episodes", "sample_row": "{\"support_text\": \"\\\"\\\\u041f\\\\u0430\\\\u0431\\\\u043b\\\\u043e \\\\u0410\\\\u043d\\\\u0434...\", \"main_text\": \"\\\"'\\\\u0411\\\\u0430\\\\u043d\\\\u0444\\\\u0438\\\\u043b\\\\u0434' (\\\\u0...\", \"question\": \"\\\"\\\\u0412 \\\\u043a\\\\u0430\\\\u043a\\\\u043e\\\\u0439 \\\\u043b\\\\u043...\", \"bridge_answers\": \"[{\\\"label\\\": \\\"passage\\\", \\\"offset\\\": 528, \\\"length\\\": 8, ...\", \"main_answers\": \"[{\\\"label\\\": \\\"passage\\\", \\\"offset\\\": 350, \\\"length\\\": 16,...\", \"perturbation\": \"\\\"multiq\\\"\", \"episode\": \"[18]\"}", "columns": ["support_text", "main_text", "question", "bridge_answers", "main_answers", "perturbation", "episode"], "columns_mapping": {"support_text": "support_text", "main_text": "main_text", "question": "question", "bridge_answers": "bridge_answers", "main_answers": "main_answers", "perturbation": 
"perturbation", "episode": "episode"}, "dataset_description": "Multi-hop reasoning has been the least addressed QA direction for Russian. We \nhave developed a semi-automatic pipeline for multi-hop dataset generation based \non Wikidata.\n\nFirst, we extract the triplets from Wikidata and search for their intersections. \nTwo triplets (subject, verb, object) are needed to compose an answerable multi-hop \nquestion. For instance, the question 'What continent is the country of which \nJohannes Block was a citizen?' is formed by a sequence of five graph units: 'Block, \nJohannes', 'citizenship', 'Germany', 'part of the world', 'Europe'. Second, several \nhundreds of the question templates are curated by a few authors manually, which are\nfurther used to fine-tune ruT5-largeto generate multi-hop questions given a \nfive-fold sequence. Third, the resulting questions undergo a paraphrasing and manual\nvalidation procedure to control the quality and diversity. Finally, each question is\nlinked to two Wikipedia paragraphs, where all graph units appear in the natural \nlanguage. 
The task is to select the answer span using information from both \nparagraphs.", "dataset_name": "RussianNLP/tape"}, "chegeka.episodes": {"config_name": "chegeka.episodes", "sample_row": "{\"question_id\": \"966\", \"question\": \"\\\"\\\\\\\"\\\\u041a\\\\u0430\\\\u0436\\\\u0434\\\\u0443\\\\u044e \\\\u043d\\\\u04...\", \"topic\": \"\\\"\\\\u041f\\\\u0435\\\\u0441\\\\u043d\\\\u0438-25\\\"\", \"author\": \"\\\"\\\\u0414\\\\u043c\\\\u0438\\\\u0442\\\\u0440\\\\u0438\\\\u0439 \\\\u0411...\", \"tour_name\": \"\\\"\\\\\\\"\\\\u0421\\\\u0432\\\\u043e\\\\u044f \\\\u0438\\\\u0433\\\\u0440\\\\u04...\", \"tour_link\": \"\\\"https://db.chgk.info/tour/spbrock\\\"\", \"answer\": \"\\\"\\\\u041e\\\\u043a\\\\u043d\\\\u0430\\\"\", \"perturbation\": \"\\\"chegeka\\\"\", \"episode\": \"[13, 18]\"}", "columns": ["question_id", "question", "topic", "author", "tour_name", "tour_link", "answer", "perturbation", "episode"], "columns_mapping": {"question_id": "question_id", "question": "question", "topic": "topic", "author": "author", "tour_name": "tour_name", "tour_link": "tour_link", "answer": "answer", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "The CheGeKa game setup is similar to Jeopardy. The player should come up with \nthe answer to the question basing on wit, commonsense and deep knowledge. 
\nThe task format is QA with a free response form and is based on the reviewed \nunpublished data subsets by (Mikhalkova, 2021).", "dataset_name": "RussianNLP/tape"}, "sit_ethics.episodes": {"config_name": "sit_ethics.episodes", "sample_row": "{\"source\": \"\\\"gazeta\\\"\", \"text\": \"\\\"\\\\u042d\\\\u043a\\\\u0441-\\\\u043d\\\\u0430\\\\u0441\\\\u0442\\\\u0430...\", \"sit_virtue\": \"0\", \"sit_moral\": \"0\", \"sit_law\": \"0\", \"sit_justice\": \"0\", \"sit_util\": \"0\", \"perturbation\": \"\\\"sit_ethics\\\"\", \"episode\": \"[5]\"}", "columns": ["source", "text", "sit_virtue", "sit_moral", "sit_law", "sit_justice", "sit_util", "perturbation", "episode"], "columns_mapping": {"source": "source", "text": "text", "sit_virtue": "sit_virtue", "sit_moral": "sit_moral", "sit_law": "sit_law", "sit_justice": "sit_justice", "sit_util": "sit_util", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "The Ethics dataset for Russian is created from scratch for the first time, relying \non the design compatible with (Hendrycks et al., 2021). 
The task is to predict \nhuman ethical judgments about diverse text situations, namely, to identify the \npresence of concepts in normative ethics, such as virtue, law, moral, justice, and \nutilitarianism.", "dataset_name": "RussianNLP/tape"}, "per_ethics.episodes": {"config_name": "per_ethics.episodes", "sample_row": "{\"source\": \"\\\"interfax\\\"\", \"text\": \"\\\"\\\\u0412\\\\u0430\\\\u0448\\\\u0438\\\\u043d\\\\u0433\\\\u0442\\\\u043e\\\\...\", \"per_virtue\": \"1\", \"per_moral\": \"0\", \"per_law\": \"1\", \"per_justice\": \"1\", \"per_util\": \"0\", \"perturbation\": \"\\\"per_ethics\\\"\", \"episode\": \"[5]\"}", "columns": ["source", "text", "per_virtue", "per_moral", "per_law", "per_justice", "per_util", "perturbation", "episode"], "columns_mapping": {"source": "source", "text": "text", "per_virtue": "per_virtue", "per_moral": "per_moral", "per_law": "per_law", "per_justice": "per_justice", "per_util": "per_util", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "The Ethics dataset for Russian is created from scratch for the first time, relying \non the design compatible with (Hendrycks et al., 2021). The task is to predict \nhuman ethical judgments about diverse text situations, namely, to evaluate the \npositive or negative implementation of five concepts in normative ethics (virtue, \nlaw, moral, justice, and utilitarianism) with 'yes' and 'no' ratings.", "dataset_name": "RussianNLP/tape"}}, "tags": ["task_categories:text-classification", "task_categories:question-answering", "task_categories:multiple-choice", "language:ru", "benchmark", "ethics", "question-answering", "reasoning"], "is_gated": false}, "csebuetnlp/BanglaParaphrase": {"dataset_name": "csebuetnlp/BanglaParaphrase", "description": "We present a high quality bangla paraphrase dataset containing about 466k paraphrase pairs. 
The paraphrases ensures high quality by being semantically coherent and syntactically diverse.", "downloads": 14, "configs": {"bn": {"config_name": "bn", "sample_row": "{\"source\": \"\\\"\\\\u09ac\\\\u09bf\\\\u09ae\\\\u09be\\\\u09a8\\\\u099f\\\\u09bf \\\\u09af...\", \"target\": \"\\\"\\\\u09ac\\\\u09bf\\\\u09ae\\\\u09be\\\\u09a8\\\\u099f\\\\u09be \\\\u098f...\"}", "columns": ["source", "target"], "columns_mapping": {"source": "source", "target": "target"}, "dataset_description": "We present a high quality bangla paraphrase dataset containing about 466k paraphrase pairs. The paraphrases ensures high quality by being semantically coherent and syntactically diverse.\n\n", "dataset_name": "csebuetnlp/BanglaParaphrase"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:bn", "conditional-text-generation", "paraphrase-generation"], "is_gated": false}, "elenanereiss/german-ler": {"dataset_name": "elenanereiss/german-ler", "description": "A dataset of Legal Documents from German federal court decisions for Named Entity Recognition. The dataset is human-annotated with 19 fine-grained entity classes. The dataset consists of approx. 67,000 sentences and contains 54,000 annotated entities.", "downloads": 61, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"dd\\\", \\\")\\\", \\\"Art.\\\", \\\"33\\\", \\\"Abs.\\\", \\\"5\\\", \\\"GG\\\", \\\"w\\\\u0...\", \"ner_tags\": \"[38, 38, 3, 22, 22, 22, 22, 38, 38, 38, 38, 38, 38...\", \"ner_coarse_tags\": \"[14, 14, 2, 9, 9, 9, 9, 14, 14, 14, 14, 14, 14, 14...\"}", "columns": ["id", "tokens", "ner_tags", "ner_coarse_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_coarse_tags": "ner_coarse_tags"}, "dataset_description": "A dataset of Legal Documents from German federal court decisions for Named Entity Recognition. 
The dataset is human-annotated with 19 fine-grained entity classes. The dataset consists of approx. 67,000 sentences and contains 54,000 annotated entities.\n", "dataset_name": "elenanereiss/german-ler"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:de", "ner, named entity recognition, legal ner, legal texts, label classification", "doi:10.57967/hf/0046"], "is_gated": false}, "taln-ls2n/kpbiomed": {"dataset_name": "taln-ls2n/kpbiomed", "description": "KPBiomed benchmark dataset for keyphrase extraction an generation.", "downloads": 18, "configs": {"large": {"config_name": "large", "sample_row": "{\"id\": \"\\\"31703611\\\"\", \"title\": \"\\\"Recommendations for performance optimizations whe...\", \"abstract\": \"\\\"BACKGROUND\\\\nUse of the Genome Analysis Toolkit (G...\", \"authors\": \"\\\"['Heldenbrand|Jacob R|JR|', 'Baheti|Saurabh|S|', ...\", \"mesh_terms\": \"[\\\"D000465:Algorithms\\\", \\\"D002877:Chromosomes, Human...\", \"year\": \"\\\"2019\\\"\", \"keyphrases\": \"[\\\"GATK\\\", \\\"Genomic variant calling\\\", \\\"Computational...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"M\\\"]\"}", "columns": ["id", "title", "abstract", "authors", "mesh_terms", "year", "keyphrases", "prmu"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "authors": "authors", "mesh_terms": "mesh_terms", "year": "year", "keyphrases": "keyphrases", "prmu": "prmu"}, "dataset_description": "KPBiomed benchmark dataset for keyphrase extraction an generation.\n", "dataset_name": "taln-ls2n/kpbiomed"}, "medium": {"config_name": "medium", "sample_row": "{\"id\": \"\\\"28495559\\\"\", \"title\": \"\\\"Influence of adhesive strategy on clinical parame...\", \"abstract\": \"\\\"OBJECTIVES\\\\nWe aimed to answer the following PICO...\", \"authors\": \"\\\"['Schroeder|Marcos|M|', 
'Correa|Ivo Carlos|IC|', ...\", \"mesh_terms\": \"[\\\"D000134:Acid Etching, Dental\\\", \\\"D003188:Composit...\", \"year\": \"\\\"2017\\\"\", \"keyphrases\": \"[\\\"Systematic review\\\", \\\"Postoperative sensitivity\\\",...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"M\\\"]\"}", "columns": ["id", "title", "abstract", "authors", "mesh_terms", "year", "keyphrases", "prmu"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "authors": "authors", "mesh_terms": "mesh_terms", "year": "year", "keyphrases": "keyphrases", "prmu": "prmu"}, "dataset_description": "KPBiomed benchmark dataset for keyphrase extraction an generation.\n", "dataset_name": "taln-ls2n/kpbiomed"}, "small": {"config_name": "small", "sample_row": "{\"id\": \"\\\"32305274\\\"\", \"title\": \"\\\"A novel tube technique enables visualization of t...\", \"abstract\": \"\\\"BACKGROUND\\\\nPercutaneous pedicle screws(PPS) have...\", \"authors\": \"\\\"['Li|Xu|X|', 'Zhang|Rui|R|', 'Chen|Buzhou|B|', 'D...\", \"mesh_terms\": \"[\\\"D005471:Fluoroscopy\\\", \\\"D050723:Fractures, Bone\\\",...\", \"year\": \"\\\"2020\\\"\", \"keyphrases\": \"[\\\"Jamshidi needles\\\", \\\"Radiation exposure\\\", \\\"15\\\\u00...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"M\\\", \\\"R\\\", \\\"M\\\"]\"}", "columns": ["id", "title", "abstract", "authors", "mesh_terms", "year", "keyphrases", "prmu"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "authors": "authors", "mesh_terms": "mesh_terms", "year": "year", "keyphrases": "keyphrases", "prmu": "prmu"}, "dataset_description": "KPBiomed benchmark dataset for keyphrase extraction an generation.\n", "dataset_name": "taln-ls2n/kpbiomed"}}, "tags": ["task_categories:text-generation", "annotations_creators:unknown", "multilinguality:monolingual", "language:en"], "is_gated": false}, "AmazonScience/mintaka": {"dataset_name": "AmazonScience/mintaka", "description": " Mintaka is a complex, natural, and multilingual 
dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers.", "downloads": 94, "configs": {"en": {"config_name": "en", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"en\\\"\", \"question\": \"\\\"What is the seventh tallest mountain in North Ame...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Mount Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. 
\n", "dataset_name": "AmazonScience/mintaka"}, "ar": {"config_name": "ar", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"ar\\\"\", \"question\": \"\\\"\\\\u0645\\\\u0627 \\\\u0633\\\\u0627\\\\u0628\\\\u0639 \\\\u0623\\\\u063...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": null}]\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. 
\n", "dataset_name": "AmazonScience/mintaka"}, "de": {"config_name": "de", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"de\\\"\", \"question\": \"\\\"Wie hei\\\\u00dft der siebth\\\\u00f6chste Berg Nordame...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Mount Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. 
\n", "dataset_name": "AmazonScience/mintaka"}, "ja": {"config_name": "ja", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"ja\\\"\", \"question\": \"\\\"\\\\u5317\\\\u30a2\\\\u30e1\\\\u30ea\\\\u30ab\\\\u3067\\\\u4e03\\\\u756a\\\\...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"\\\\u30eb\\\\u30ab\\\\u30cb...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. 
\n", "dataset_name": "AmazonScience/mintaka"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"hi\\\"\", \"question\": \"\\\"\\\\u0909\\\\u0924\\\\u094d\\\\u0924\\\\u0930 \\\\u0905\\\\u092e\\\\u0947...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": null}]\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. 
\n", "dataset_name": "AmazonScience/mintaka"}, "pt": {"config_name": "pt", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"pt\\\"\", \"question\": \"\\\"Qual \\\\u00e9 a s\\\\u00e9tima montanha mais alta da A...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Monte Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. 
\n", "dataset_name": "AmazonScience/mintaka"}, "es": {"config_name": "es", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"es\\\"\", \"question\": \"\\\"\\\\u00bfCu\\\\u00e1l es la s\\\\u00e9ptima monta\\\\u00f1a m...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Monte Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. 
\n", "dataset_name": "AmazonScience/mintaka"}, "it": {"config_name": "it", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"it\\\"\", \"question\": \"\\\"Qual \\\\u00e8 la settima montagna pi\\\\u00f9 alta del...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Monte Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. 
\n", "dataset_name": "AmazonScience/mintaka"}, "fr": {"config_name": "fr", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"fr\\\"\", \"question\": \"\\\"Quelle est la septi\\\\u00e8me plus haute montagne d...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"mont Lucania\\\"}]\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. 
\n", "dataset_name": "AmazonScience/mintaka"}, "all": {"config_name": "all", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"en\\\"\", \"question\": \"\\\"What is the seventh tallest mountain in North Ame...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Mount Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. 
\n", "dataset_name": "AmazonScience/mintaka"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:expert-generated", "multilinguality:ar", "multilinguality:de", "multilinguality:ja", "multilinguality:hi", "multilinguality:pt", "multilinguality:en", "multilinguality:es", "multilinguality:it", "multilinguality:fr", "source_datasets:original"], "is_gated": false}, "GEM/TaTA": {"dataset_name": "GEM/TaTA", "description": "Dataset loader for TaTA: A Multilingual Table-to-Text Dataset for African Languages", "downloads": 90, "configs": {"default": {"config_name": "default", "sample_row": "{\"gem_id\": \"\\\"AB20-ar-1\\\"\", \"example_id\": \"\\\"AB20-ar-1\\\"\", \"title\": \"\\\"\\\\u062a\\\\u0645\\\\u0643\\\\u064a\\\\u0646 \\\\u0627\\\\u0644\\\\u0634...\", \"unit_of_measure\": \"\\\"\\\\u0627\\\\u0644\\\\u0646\\\\u0633\\\\u0628\\\\u0629 \\\\u0627\\\\u0644...\", \"chart_type\": \"\\\"Horizontal Bar Chart\\\"\", \"was_translated\": \"\\\"True\\\"\", \"table_data\": \"\\\"[[\\\\\\\"\\\\\\\", \\\\\\\"\\\\\\\\u0645\\\\\\\\u0633\\\\\\\\u062a\\\\\\\\u0648\\\\\\\\u0649 \\\\\\\\u...\", \"linearized_input\": \"\\\"\\\\u062a\\\\u0645\\\\u0643\\\\u064a\\\\u0646 \\\\u0627\\\\u0644\\\\u0634...\", \"table_text\": \"[\\\"\\\\u062a\\\\u062a\\\\u0645\\\\u062a\\\\u0639 13% \\\\u0645\\\\u0646 ...\", \"target\": \"\\\"\\\\u062a\\\\u062a\\\\u0645\\\\u062a\\\\u0639 13% \\\\u0645\\\\u0646 \\\\...\"}", "columns": ["gem_id", "example_id", "title", "unit_of_measure", "chart_type", "was_translated", "table_data", "linearized_input", "table_text", "target"], "columns_mapping": {"gem_id": "gem_id", "example_id": "example_id", "title": "title", "unit_of_measure": "unit_of_measure", "chart_type": "chart_type", "was_translated": "was_translated", "table_data": "table_data", "linearized_input": "linearized_input", "table_text": "table_text", "target": "target"}, "dataset_description": "Dataset loader for TaTA: A Multilingual Table-to-Text Dataset for African 
Languages\n", "dataset_name": "GEM/TaTA"}}, "tags": ["task_categories:table-to-text", "annotations_creators:none", "multilinguality:yes", "source_datasets:original", "language:ar", "language:en", "language:fr", "language:ha", "language:ig", "language:pt", "language:ru", "language:sw", "language:yo", "data-to-text"], "is_gated": false}, "allenai/csabstruct": {"dataset_name": "allenai/csabstruct", "description": "As a step toward better document-level understanding, we explore classification of a sequence of sentences into their corresponding categories, a task that requires understanding sentences in context of the document. Recent successful models for this task have used hierarchical models to contextualize sentence representations, and Conditional Random Fields (CRFs) to incorporate dependencies between subsequent labels. In this work, we show that pretrained language models, BERT (Devlin et al., 2018) in particular, can be used for this task to capture contextual dependencies without the need for hierarchical encoding nor a CRF. Specifically, we construct a joint sentence representation that allows BERT Transformer layers to directly utilize contextual information from all words in all sentences. 
Our approach achieves state-of-the-art results on four datasets, including a new dataset of structured scientific abstracts.", "downloads": 79, "configs": {"CSAbstruct": {"config_name": "CSAbstruct", "sample_row": "{\"abstract_id\": \"\\\"train_0000\\\"\", \"sentences\": \"[\\\"Gamification has the potential to improve the qu...\", \"labels\": \"[0, 2, 1, 1, 4, 4, 4, 4, 4]\", \"confs\": \"[0.7778, 0.7778, 0.7778, 1.0, 0.6111, 0.5556, 0.61...\"}", "columns": ["abstract_id", "sentences", "labels", "confs"], "columns_mapping": {"abstract_id": "abstract_id", "sentences": "sentences", "labels": "labels", "confs": "confs"}, "dataset_description": "As a step toward better document-level understanding, we explore classification of a sequence of sentences into their corresponding categories, a task that requires understanding sentences in context of the document. Recent successful models for this task have used hierarchical models to contextualize sentence representations, and Conditional Random Fields (CRFs) to incorporate dependencies between subsequent labels. In this work, we show that pretrained language models, BERT (Devlin et al., 2018) in particular, can be used for this task to capture contextual dependencies without the need for hierarchical encoding nor a CRF. Specifically, we construct a joint sentence representation that allows BERT Transformer layers to directly utilize contextual information from all words in all sentences. 
Our approach achieves state-of-the-art results on four datasets, including a new dataset of structured scientific abstracts.\n", "dataset_name": "allenai/csabstruct"}}, "tags": [], "is_gated": false}, "sileod/probability_words_nli": {"dataset_name": "sileod/probability_words_nli", "description": "Probing neural language models for understanding of words of estimative probability", "downloads": 30, "configs": {"reasoning_1hop": {"config_name": "reasoning_1hop", "sample_row": "{\"context\": \"\\\"It is probably not the case that Mary is in the s...\", \"hypothesis\": \"\\\"We believe that 'Julius is a frog' or 'Mary is in...\", \"valid_hypothesis\": \"\\\"It is probably not the case that 'Julius is a fro...\", \"invalid_hypothesis\": \"\\\"We believe that 'Julius is a frog' or 'Mary is in...\", \"problog\": \"\\\"\\\\n and(A,B) :- A,B.\\\\n or(A,B) :- A;B.\\\\n nand(A...\", \"probability_word\": \"\\\"probably not\\\"\", \"distractor\": \"\\\"we believe\\\"\", \"hypothesis_assertion\": \"\\\"'Julius is a frog' or 'Mary is in the school' or ...\", \"label\": \"0\", \"idx\": \"0\", \"probability\": \"0.325\"}", "columns": ["context", "hypothesis", "valid_hypothesis", "invalid_hypothesis", "problog", "probability_word", "distractor", "hypothesis_assertion", "label", "idx", "probability"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "valid_hypothesis": "valid_hypothesis", "invalid_hypothesis": "invalid_hypothesis", "problog": "problog", "probability_word": "probability_word", "distractor": "distractor", "hypothesis_assertion": "hypothesis_assertion", "label": "label", "idx": "idx", "probability": "probability"}, "dataset_description": "Probing neural language models for understanding of words of estimative probability\n", "dataset_name": "sileod/probability_words_nli"}, "reasoning_2hop": {"config_name": "reasoning_2hop", "sample_row": "{\"context\": \"\\\"There is almost no chance that Greg is gray. 
Chan...\", \"hypothesis\": \"\\\"It is highly likely that either 'John discarded t...\", \"valid_hypothesis\": \"\\\"It is unlikely that either 'John discarded the ap...\", \"invalid_hypothesis\": \"\\\"It is highly likely that either 'John discarded t...\", \"problog\": \"\\\"\\\\n and(A,B) :- A,B.\\\\n or(A,B) :- A;B.\\\\n nand(A...\", \"probability_word\": \"\\\"unlikely\\\"\", \"distractor\": \"\\\"highly likely\\\"\", \"hypothesis_assertion\": \"\\\"Either 'John discarded the apple' or 'Sandra got ...\", \"label\": \"0\", \"idx\": \"0\", \"probability\": \"0.18\"}", "columns": ["context", "hypothesis", "valid_hypothesis", "invalid_hypothesis", "problog", "probability_word", "distractor", "hypothesis_assertion", "label", "idx", "probability"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "valid_hypothesis": "valid_hypothesis", "invalid_hypothesis": "invalid_hypothesis", "problog": "problog", "probability_word": "probability_word", "distractor": "distractor", "hypothesis_assertion": "hypothesis_assertion", "label": "label", "idx": "idx", "probability": "probability"}, "dataset_description": "Probing neural language models for understanding of words of estimative probability\n", "dataset_name": "sileod/probability_words_nli"}, "usnli": {"config_name": "usnli", "sample_row": "{\"context\": \"\\\"Woman in white in foreground and a man slightly b...\", \"hypothesis\": \"\\\"We believe that they are working for John 's Pizz...\", \"valid_hypothesis\": \"\\\"We believe that they are working for John 's Pizz...\", \"invalid_hypothesis\": \"\\\"It is improbable that they are working for John '...\", \"probability_word\": \"\\\"we believe\\\"\", \"distractor\": \"\\\"improbable\\\"\", \"hypothesis_assertion\": \"\\\"They are working for John 's Pizza .\\\"\", \"label\": \"1\", \"idx\": \"0\", \"probability\": \"0.7445574122575764\"}", "columns": ["context", "hypothesis", "valid_hypothesis", "invalid_hypothesis", "probability_word", "distractor", 
"hypothesis_assertion", "label", "idx", "probability"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "valid_hypothesis": "valid_hypothesis", "invalid_hypothesis": "invalid_hypothesis", "probability_word": "probability_word", "distractor": "distractor", "hypothesis_assertion": "hypothesis_assertion", "label": "label", "idx": "idx", "probability": "probability"}, "dataset_description": "Probing neural language models for understanding of words of estimative probability\n", "dataset_name": "sileod/probability_words_nli"}}, "tags": ["task_categories:text-classification", "task_categories:multiple-choice", "task_categories:question-answering", "task_ids:open-domain-qa", "task_ids:multiple-choice-qa", "task_ids:natural-language-inference", "task_ids:multi-input-text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "wep", "words of estimative probability", "probability", "logical reasoning", "soft logic", "nli", "verbal probabilities", "natural-language-inference", "reasoning", "logic"], "is_gated": false}, "lmqg/qa_squadshifts": {"dataset_name": "lmqg/qa_squadshifts", "description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "downloads": 388, "configs": {"all": {"config_name": "all", "sample_row": "{\"id\": \"\\\"5d66f6322b22cd4dfcfbe7d9\\\"\", \"title\": \"\\\"None\\\"\", \"context\": \"\\\"Gas and electric service is provided by Consolida...\", \"question\": \"\\\"Consolidated Edison can trace it's roots back to ...\", \"answers.text\": \"[\\\"Edison Electric Illuminating Company\\\"]\", \"answers.answer_start\": \"[153]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": 
"answers_answer_start"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "dataset_name": "lmqg/qa_squadshifts"}, "amazon": {"config_name": "amazon", "sample_row": "{\"id\": \"\\\"5dd4d824cc027a086d65fde6\\\"\", \"title\": \"\\\"None\\\"\", \"context\": \"\\\"This cabinet is very easy to assemble. It says &#...\", \"question\": \"\\\"How many people does the reviewer suggest it take...\", \"answers.text\": \"[\\\"It is a one-person job\\\"]\", \"answers.answer_start\": \"[143]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "dataset_name": "lmqg/qa_squadshifts"}, "new_wiki": {"config_name": "new_wiki", "sample_row": "{\"id\": \"\\\"5d66f6322b22cd4dfcfbe7d9\\\"\", \"title\": \"\\\"None\\\"\", \"context\": \"\\\"Gas and electric service is provided by Consolida...\", \"question\": \"\\\"Consolidated Edison can trace it's roots back to ...\", \"answers.text\": \"[\\\"Edison Electric Illuminating Company\\\"]\", \"answers.answer_start\": \"[153]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "dataset_name": "lmqg/qa_squadshifts"}, "nyt": {"config_name": "nyt", "sample_row": "{\"id\": 
\"\\\"5d704c4ac8e4820a9b66e9f7\\\"\", \"title\": \"\\\"None\\\"\", \"context\": \"\\\"Ms. Clyne is at work on a chamber opera about the...\", \"question\": \"\\\"Ms. Clyne used facsimiles of what possession of E...\", \"answers.text\": \"[\\\"letters\\\"]\", \"answers.answer_start\": \"[214]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "dataset_name": "lmqg/qa_squadshifts"}, "reddit": {"config_name": "reddit", "sample_row": "{\"id\": \"\\\"5d9c25298ae5305bc982eff7\\\"\", \"title\": \"\\\"None\\\"\", \"context\": \"\\\"Dis/advantages of 10.3 over 9.5? Just haxed my O3...\", \"question\": \"\\\"What is the author's main reason for wanting to h...\", \"answers.text\": \"[\\\"pokegenning/romhacking\\\"]\", \"answers.answer_start\": \"[468]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "dataset_name": "lmqg/qa_squadshifts"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "multilinguality:monolingual", "source_datasets:extended|wikipedia", "language:en"], "is_gated": false}, "Conrad747/lg-ner": {"dataset_name": "Conrad747/lg-ner", "description": "LugandaPII is a named entity dataset consisting of PERSON, ORG, LOCATION, NORP, USERID and DATE entities.\nThe train/validation/test sets are 
available for the Luganda language.", "downloads": 15, "configs": {"lug": {"config_name": "lug", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"bassentebe\\\", \\\"be\\\", \\\"##byalo\\\", \\\"balabuddwa\\\", \\\"oku...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "LugandaPII is a named entity dataset consisting of PERSON, ORG, LOCATION, NORP, USERID and DATE entities.\nThe train/validation/test sets are available for the Luganda language.\n", "dataset_name": "Conrad747/lg-ner"}}, "tags": [], "is_gated": false}, "lmqg/qag_tweetqa": {"dataset_name": "lmqg/qag_tweetqa", "description": "Question & answer generation dataset based on [TweetQA](https://huggingface.co/datasets/tweet_qa).", "downloads": 11, "configs": {"qag_tweetqa": {"config_name": "qag_tweetqa", "sample_row": "{\"answers\": \"[\\\"editor\\\", \\\"1991\\\", \\\"ben bradlee\\\", \\\"1994\\\"]\", \"questions\": \"[\\\"what did bradlee retire as?\\\", \\\"when did ben brad...\", \"paragraph\": \"\\\"\\\\\\\"So much of The Post is Ben,\\\\\\\" Mrs. 
Graham said ...\", \"paragraph_id\": \"\\\"78ac37b757cc7863a0bc39a34e8abe72-50539ee37b16f348...\", \"questions_answers\": \"\\\"question: what did bradlee retire as?, answer: ed...\"}", "columns": ["answers", "questions", "paragraph", "paragraph_id", "questions_answers"], "columns_mapping": {"answers": "answers", "questions": "questions", "paragraph": "paragraph", "paragraph_id": "paragraph_id", "questions_answers": "questions_answers"}, "dataset_description": "Question & answer generation dataset based on [TweetQA](https://huggingface.co/datasets/tweet_qa).", "dataset_name": "lmqg/qag_tweetqa"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:tweet_qa", "language:en", "question-generation"], "is_gated": false}, "lmqg/qag_squad": {"dataset_name": "lmqg/qag_squad", "description": "Question & answer generation dataset based on SQuAD.", "downloads": 13, "configs": {"qag_squad": {"config_name": "qag_squad", "sample_row": "{\"answers\": \"[\\\"4 Minutes\\\", \\\"Elvis Presley\\\", \\\"thirteenth\\\", \\\"Stic...\", \"questions\": \"[\\\"Which single was released as the album's lead si...\", \"paragraph\": \"\\\"\\\\\\\"4 Minutes\\\\\\\" was released as the album's lead si...\", \"questions_answers\": \"\\\"question: Which single was released as the album'...\"}", "columns": ["answers", "questions", "paragraph", "questions_answers"], "columns_mapping": {"answers": "answers", "questions": "questions", "paragraph": "paragraph", "questions_answers": "questions_answers"}, "dataset_description": "Question & answer generation dataset based on SQuAD.", "dataset_name": "lmqg/qag_squad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:lmqg/qg_squad", "language:en", "question-generation"], "is_gated": false}, "lawcompany/KLAID": {"dataset_name": "lawcompany/KLAID", "description": "KLAID (Korean Legal Artificial Intelligence 
Datasets) is a dataset for the development of Korean legal artificial intelligence technology. This time we offer 1 task, which is legal judgment prediction(LJP).", "downloads": 33, "configs": {"ljp": {"config_name": "ljp", "sample_row": "{\"laws_service_id\": \"32\", \"fact\": \"\\\"\\\\ud53c\\\\uace0\\\\uc778\\\\uc740 2018. 8. 9. 23:33\\\\uacbd ...\", \"laws_service\": \"\\\"\\\\ub3c4\\\\ub85c\\\\uad50\\\\ud1b5\\\\ubc95 \\\\uc81c148\\\\uc870\\\\uc...\"}", "columns": ["laws_service_id", "fact", "laws_service"], "columns_mapping": {"laws_service_id": "laws_service_id", "fact": "fact", "laws_service": "laws_service"}, "dataset_description": "KLAID (Korean Legal Artificial Intelligence Datasets) is a dataset for the development of Korean legal artificial intelligence technology. This time we offer 1 task, which is legal judgment prediction(LJP).\n", "dataset_name": "lawcompany/KLAID"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "multilinguality:monolingual", "language:ko"], "is_gated": false}, "bigbio/bc7_litcovid": {"dataset_name": "bigbio/bc7_litcovid", "description": "The training and development datasets contain the publicly-available text of over 30 thousand COVID-19-related articles and their metadata (e.g., title, abstract, journal). 
Articles in both datasets have been manually reviewed and articles annotated by in-house models.", "downloads": 103, "configs": {"bc7_litcovid_source": {"config_name": "bc7_litcovid_source", "sample_row": "{\"pmid\": \"\\\"32519164\\\"\", \"journal\": \"\\\"J Thromb Thrombolysis\\\"\", \"title\": \"\\\"Potential role for tissue factor in the pathogene...\", \"abstract\": \"\\\"In December 2019, a new and highly contagious inf...\", \"keywords\": \"[\\\"covid-19\\\", \\\"il-6\\\", \\\"sars-cov-2\\\", \\\"tnf-alpha\\\", \\\"t...\", \"pub_type\": \"[\\\"Journal Article\\\", \\\"Review\\\"]\", \"authors\": \"[\\\"Bautista-Vargas, Mario\\\", \\\"Bonilla-Abadia, Fabio\\\"...\", \"doi\": \"\\\"10.1007/s11239-020-02172-x\\\"\", \"labels\": \"[1, 3]\"}", "columns": ["pmid", "journal", "title", "abstract", "keywords", "pub_type", "authors", "doi", "labels"], "columns_mapping": {"pmid": "pmid", "journal": "journal", "title": "title", "abstract": "abstract", "keywords": "keywords", "pub_type": "pub_type", "authors": "authors", "doi": "doi", "labels": "labels"}, "dataset_description": "The training and development datasets contain the publicly-available text of over 30 thousand COVID-19-related articles and their metadata (e.g., title, abstract, journal). 
Articles in both datasets have been manually reviewed and articles annotated by in-house models.\n", "dataset_name": "bigbio/bc7_litcovid"}, "bc7_litcovid_bigbio_text": {"config_name": "bc7_litcovid_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"32519164\\\"\", \"text\": \"\\\"In December 2019, a new and highly contagious inf...\", \"labels\": \"[\\\"Treatment\\\", \\\"Mechanism\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "The training and development datasets contain the publicly-available text of over 30 thousand COVID-19-related articles and their metadata (e.g., title, abstract, journal). Articles in both datasets have been manually reviewed and articles annotated by in-house models.\n", "dataset_name": "bigbio/bc7_litcovid"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bioinfer": {"dataset_name": "bigbio/bioinfer", "description": "A corpus targeted at protein, gene, and RNA relationships which serves as a\nresource for the development of information extraction systems and their\ncomponents such as parsers and domain analyzers. 
Currently, the corpus contains\n1100 sentences from abstracts of biomedical research articles annotated for\nrelationships, named entities, as well as syntactic dependencies.", "downloads": 49, "configs": {"bioinfer_source": {"config_name": "bioinfer_source", "sample_row": "{\"document_id\": \"\\\"BioInfer.d0.s0\\\"\", \"type\": \"\\\"Sentence\\\"\", \"text\": \"\\\"alpha-catenin inhibits beta-catenin signaling by ...\", \"entities\": \"[{\\\"id\\\": \\\"BioInfer.d0.s0.e0\\\", \\\"offsets\\\": [[88, 101]...\", \"relations\": \"[{\\\"id\\\": \\\"BioInfer.d0.s0.i0\\\", \\\"type\\\": \\\"PPI\\\", \\\"arg1_...\"}", "columns": ["document_id", "type", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "type": "type", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "A corpus targeted at protein, gene, and RNA relationships which serves as a\nresource for the development of information extraction systems and their\ncomponents such as parsers and domain analyzers. 
Currently, the corpus contains\n1100 sentences from abstracts of biomedical research articles annotated for\nrelationships, named entities, as well as syntactic dependencies.\n", "dataset_name": "bigbio/bioinfer"}, "bioinfer_bigbio_kb": {"config_name": "bioinfer_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BioInfer.d0.s0\\\"\", \"passages\": \"[{\\\"id\\\": \\\"BioInfer.d0.s0__text\\\", \\\"type\\\": \\\"Sentence\\\"...\", \"entities\": \"[{\\\"id\\\": \\\"BioInfer.d0.s0.e0\\\", \\\"type\\\": \\\"Individual_p...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"BioInfer.d0.s0.i0\\\", \\\"type\\\": \\\"PPI\\\", \\\"arg1_...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "A corpus targeted at protein, gene, and RNA relationships which serves as a\nresource for the development of information extraction systems and their\ncomponents such as parsers and domain analyzers. Currently, the corpus contains\n1100 sentences from abstracts of biomedical research articles annotated for\nrelationships, named entities, as well as syntactic dependencies.\n", "dataset_name": "bigbio/bioinfer"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/biology_how_why_corpus": {"dataset_name": "bigbio/biology_how_why_corpus", "description": "This dataset consists of 185 \"how\" and 193 \"why\" biology questions authored by a domain expert, with one or more gold \nanswer passages identified in an undergraduate textbook. The expert was not constrained in any way during the \nannotation process, so gold answers might be smaller than a paragraph or span multiple paragraphs. 
This dataset was \nused for the question-answering system described in the paper \u201cDiscourse Complements Lexical Semantics for Non-factoid \nAnswer Reranking\u201d (ACL 2014).", "downloads": 29, "configs": {"biology_how_why_corpus_source": {"config_name": "biology_how_why_corpus_source", "sample_row": "{\"text\": \"\\\"How does the second law of thermodynamics apply t...\", \"type\": \"\\\"how\\\"\", \"answers\": \"[{\\\"justification\\\": \\\"The second law of thermodynami...\"}", "columns": ["text", "type", "answers"], "columns_mapping": {"text": "text", "type": "type", "answers": "answers"}, "dataset_description": "This dataset consists of 185 \"how\" and 193 \"why\" biology questions authored by a domain expert, with one or more gold \nanswer passages identified in an undergraduate textbook. The expert was not constrained in any way during the \nannotation process, so gold answers might be smaller than a paragraph or span multiple paragraphs. This dataset was \nused for the question-answering system described in the paper \u201cDiscourse Complements Lexical Semantics for Non-factoid \nAnswer Reranking\u201d (ACL 2014).\n", "dataset_name": "bigbio/biology_how_why_corpus"}, "biology_how_why_corpus_bigbio_qa": {"config_name": "biology_how_why_corpus_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"1_8_6\\\"\", \"question\": \"\\\"How does the second law of thermodynamics apply t...\", \"type\": \"\\\"how\\\"\", \"choices\": \"[]\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"The second law of thermodynamics states that spo...\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "This dataset consists of 185 \"how\" and 193 \"why\" 
biology questions authored by a domain expert, with one or more gold \nanswer passages identified in an undergraduate textbook. The expert was not constrained in any way during the \nannotation process, so gold answers might be smaller than a paragraph or span multiple paragraphs. This dataset was \nused for the question-answering system described in the paper \u201cDiscourse Complements Lexical Semantics for Non-factoid \nAnswer Reranking\u201d (ACL 2014).\n", "dataset_name": "bigbio/biology_how_why_corpus"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/biomrc": {"dataset_name": "bigbio/biomrc", "description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. 
We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.", "downloads": 12, "configs": {"biomrc_large_A_source": {"config_name": "biomrc_large_A_source", "sample_row": "{\"abstract\": \"\\\"BACKGROUND: Most brain metastases arise from @ent...\", \"title\": \"\\\"Attributes of brain metastases from XXXX .\\\\n\\\"\", \"entities_list.pseudoidentifier\": \"[\\\"@entity1\\\", \\\"@entity0\\\", \\\"@entity5\\\", \\\"@entity4\\\", \\\"...\", \"entities_list.identifier\": \"[\\\"('9606', 'Species')\\\", \\\"('MESH:D001943', 'Disease...\", \"entities_list.synonyms\": \"[\\\"['patients']\\\", \\\"['breast than lung cancer', 'bre...\", \"answer.pseudoidentifier\": \"\\\"@entity0\\\"\", \"answer.identifier\": \"\\\"(MESH:D001943,Disease)\\\"\", \"answer.synonyms\": \"\\\"['breast and lung cancer']\\\\n\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. 
Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_large_A_bigbio_qa": {"config_name": "biomrc_large_A_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"Attributes of brain metastases from XXXX .\\\\n\\\"\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity1\\\", \\\"@entity0\\\", \\\"@entity5\\\", \\\"@entity4\\\", \\\"...\", \"context\": \"\\\"BACKGROUND: Most brain metastases arise from @ent...\", \"answer\": \"[\\\"@entity0\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. 
We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_small_A_source": {"config_name": "biomrc_small_A_source", "sample_row": "{\"abstract\": \"\\\"Single-agent activity for @entity8253 reflected b...\", \"title\": \"\\\"No synergistic activity of @entity1259 and XXXX i...\", \"entities_list.pseudoidentifier\": \"[\\\"@entity1\\\", \\\"@entity632\\\", \\\"@entity137\\\", \\\"@entity4...\", \"entities_list.identifier\": \"[\\\"('9606', 'Species')\\\", \\\"('MESH:D004317', 'Chemica...\", \"entities_list.synonyms\": \"[\\\"['patients', 'patient']\\\", \\\"['Adriamycin']\\\", \\\"['t...\", \"answer.pseudoidentifier\": \"\\\"@entity4020\\\"\", \"answer.identifier\": \"\\\"(3440,Gene)\\\"\", \"answer.synonyms\": \"\\\"['interferon-alpha 2b']\\\\n\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). 
Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_small_A_bigbio_qa": {"config_name": "biomrc_small_A_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"No synergistic activity of @entity1259 and XXXX i...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity1\\\", \\\"@entity632\\\", \\\"@entity137\\\", \\\"@entity4...\", \"context\": \"\\\"Single-agent activity for @entity8253 reflected b...\", \"answer\": \"[\\\"@entity4020\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). 
Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_tiny_A_source": {"config_name": "biomrc_tiny_A_source", "sample_row": "{\"abstract\": \"\\\"BACKGROUND: Research into the optimal treatment o...\", \"title\": \"\\\"Radiographic classification and treatment of XXXX...\", \"entities_list.pseudoidentifier\": \"[\\\"@entity1\\\", \\\"@entity85\\\", \\\"@entity82\\\", \\\"@entity319...\", \"entities_list.identifier\": \"[\\\"('9606', 'Species')\\\", \\\"('MESH:D009140', 'Disease...\", \"entities_list.synonyms\": \"[\\\"['patients']\\\", \\\"['valgus deformity', 'angular de...\", \"answer.pseudoidentifier\": \"\\\"@entity82\\\"\", \"answer.identifier\": \"\\\"(MESH:D005355,Disease)\\\"\", \"answer.synonyms\": \"\\\"['fibrous dysplasia']\\\\n\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", 
"answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_tiny_A_bigbio_qa": {"config_name": "biomrc_tiny_A_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"Radiographic classification and treatment of XXXX...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity1\\\", \\\"@entity85\\\", \\\"@entity82\\\", \\\"@entity319...\", \"context\": \"\\\"BACKGROUND: Research into the optimal treatment o...\", \"answer\": \"[\\\"@entity82\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). 
Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_large_B_source": {"config_name": "biomrc_large_B_source", "sample_row": "{\"abstract\": \"\\\"BACKGROUND: Most brain metastases arise from @ent...\", \"title\": \"\\\"Attributes of brain metastases from XXXX .\\\\n\\\"\", \"entities_list.pseudoidentifier\": \"[\\\"@entity1\\\", \\\"@entity0\\\", \\\"@entity5\\\", \\\"@entity4\\\", \\\"...\", \"entities_list.identifier\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\"]\", \"entities_list.synonyms\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\"]\", \"answer.pseudoidentifier\": \"\\\"@entity0\\\"\", \"answer.identifier\": \"\\\"\\\"\", \"answer.synonyms\": \"\\\"\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, 
"dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_large_B_bigbio_qa": {"config_name": "biomrc_large_B_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"Attributes of brain metastases from XXXX .\\\\n\\\"\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity1\\\", \\\"@entity0\\\", \\\"@entity5\\\", \\\"@entity4\\\", \\\"...\", \"context\": \"\\\"BACKGROUND: Most brain metastases arise from @ent...\", \"answer\": \"[\\\"@entity0\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). 
Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_small_B_source": {"config_name": "biomrc_small_B_source", "sample_row": "{\"abstract\": \"\\\"Single-agent activity for @entity12 reflected by ...\", \"title\": \"\\\"No synergistic activity of @entity7 and XXXX in t...\", \"entities_list.pseudoidentifier\": \"[\\\"@entity0\\\", \\\"@entity6\\\", \\\"@entity2\\\", \\\"@entity5\\\", \\\"...\", \"entities_list.identifier\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"...\", \"entities_list.synonyms\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"...\", \"answer.pseudoidentifier\": \"\\\"@entity10\\\"\", \"answer.identifier\": \"\\\"\\\"\", \"answer.synonyms\": \"\\\"\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", 
"answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_small_B_bigbio_qa": {"config_name": "biomrc_small_B_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"No synergistic activity of @entity7 and XXXX in t...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity0\\\", \\\"@entity6\\\", \\\"@entity2\\\", \\\"@entity5\\\", \\\"...\", \"context\": \"\\\"Single-agent activity for @entity12 reflected by ...\", \"answer\": \"[\\\"@entity10\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. 
Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_tiny_B_source": {"config_name": "biomrc_tiny_B_source", "sample_row": "{\"abstract\": \"\\\"@entity3 ( @entity2 ) has been increasingly recog...\", \"title\": \"\\\"Breast-fed @entity0 achieve a higher rate of brai...\", \"entities_list.pseudoidentifier\": \"[\\\"@entity0\\\", \\\"@entity3\\\", \\\"@entity1\\\", \\\"@entity2\\\", \\\"...\", \"entities_list.identifier\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\"]\", \"entities_list.synonyms\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\"]\", \"answer.pseudoidentifier\": \"\\\"@entity3\\\"\", \"answer.identifier\": \"\\\"\\\"\", \"answer.synonyms\": \"\\\"\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", 
"answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_tiny_B_bigbio_qa": {"config_name": "biomrc_tiny_B_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"Breast-fed @entity0 achieve a higher rate of brai...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity0\\\", \\\"@entity3\\\", \\\"@entity1\\\", \\\"@entity2\\\", \\\"...\", \"context\": \"\\\"@entity3 ( @entity2 ) has been increasingly recog...\", \"answer\": \"[\\\"@entity3\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. 
Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bionlp_st_2011_epi": {"dataset_name": "bigbio/bionlp_st_2011_epi", "description": "The dataset of the Epigenetics and Post-translational Modifications (EPI) task\nof BioNLP Shared Task 2011.", "downloads": 61, "configs": {"bionlp_st_2011_epi_source": {"config_name": "bionlp_st_2011_epi_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10190553\\\"\", \"text\": \"\\\"Regulation of connexin32 and connexin43 gene expr...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[14, ...\", \"events\": \"[{\\\"trigger\\\": \\\"T26\\\", \\\"id\\\": \\\"E1\\\", \\\"type\\\": \\\"DNA_methy...\", \"relations\": \"[]\", \"equivalences\": \"[{\\\"id\\\": \\\"*\\\", \\\"ref_ids\\\": [\\\"T3\\\", \\\"T4\\\"]}, {\\\"id\\\": \\\"*\\\",...\", \"attributes\": \"[{\\\"id\\\": \\\"M1\\\", \\\"type\\\": \\\"Negation\\\", \\\"ref_id\\\": \\\"E3\\\", ...\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", 
"equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The dataset of the Epigenetics and Post-translational Modifications (EPI) task\nof BioNLP Shared Task 2011.\n", "dataset_name": "bigbio/bionlp_st_2011_epi"}, "bionlp_st_2011_epi_bigbio_kb": {"config_name": "bionlp_st_2011_epi_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10190553\\\"\", \"passages\": \"[{\\\"id\\\": \\\"PMID-10190553__text\\\", \\\"type\\\": \\\"abstract\\\",...\", \"entities\": \"[{\\\"id\\\": \\\"PMID-10190553_T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"of...\", \"events\": \"[{\\\"id\\\": \\\"PMID-10190553_E1\\\", \\\"type\\\": \\\"DNA_methylati...\", \"coreferences\": \"[{\\\"id\\\": \\\"PMID-10190553_1\\\", \\\"entity_ids\\\": [\\\"PMID-10...\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The dataset of the Epigenetics and Post-translational Modifications (EPI) task\nof BioNLP Shared Task 2011.\n", "dataset_name": "bigbio/bionlp_st_2011_epi"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bionlp_st_2011_ge": {"dataset_name": "bigbio/bionlp_st_2011_ge", "description": "The BioNLP-ST GE task has been promoting development of fine-grained information extraction (IE) from biomedical\ndocuments, since 2009. 
Particularly, it has focused on the domain of NFkB as a model domain of Biomedical IE.\nThe GENIA task aims at extracting events occurring upon genes or gene products, which are typed as \"Protein\"\nwithout differentiating genes from gene products. Other types of physical entities, e.g. cells, cell components,\nare not differentiated from each other, and their type is given as \"Entity\".", "downloads": 17, "configs": {"bionlp_st_2011_ge_source": {"config_name": "bionlp_st_2011_ge_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMC-1310901-00-TIAB\\\"\", \"text\": \"\\\"Down-regulation of interferon regulatory factor 4...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[19, ...\", \"events\": \"[{\\\"trigger\\\": \\\"T18\\\", \\\"id\\\": \\\"E1\\\", \\\"type\\\": \\\"Negative_...\", \"relations\": \"[]\", \"equivalences\": \"[{\\\"id\\\": \\\"*\\\", \\\"ref_ids\\\": [\\\"T4\\\", \\\"T5\\\"]}]\", \"attributes\": \"[{\\\"id\\\": \\\"M1\\\", \\\"type\\\": \\\"Negation\\\", \\\"ref_id\\\": \\\"E3\\\", ...\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The BioNLP-ST GE task has been promoting development of fine-grained information extraction (IE) from biomedical\ndocuments, since 2009. Particularly, it has focused on the domain of NFkB as a model domain of Biomedical IE.\nThe GENIA task aims at extracting events occurring upon genes or gene products, which are typed as \"Protein\"\nwithout differentiating genes from gene products. Other types of physical entities, e.g. 
cells, cell components,\nare not differentiated from each other, and their type is given as \"Entity\".\n", "dataset_name": "bigbio/bionlp_st_2011_ge"}, "bionlp_st_2011_ge_bigbio_kb": {"config_name": "bionlp_st_2011_ge_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMC-1310901-00-TIAB\\\"\", \"passages\": \"[{\\\"id\\\": \\\"PMC-1310901-00-TIAB__text\\\", \\\"type\\\": \\\"abst...\", \"entities\": \"[{\\\"id\\\": \\\"PMC-1310901-00-TIAB_T1\\\", \\\"type\\\": \\\"Protein...\", \"events\": \"[{\\\"id\\\": \\\"PMC-1310901-00-TIAB_E1\\\", \\\"type\\\": \\\"Negativ...\", \"coreferences\": \"[{\\\"id\\\": \\\"PMC-1310901-00-TIAB_1\\\", \\\"entity_ids\\\": [\\\"P...\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The BioNLP-ST GE task has been promoting development of fine-grained information extraction (IE) from biomedical\ndocuments, since 2009. Particularly, it has focused on the domain of NFkB as a model domain of Biomedical IE.\nThe GENIA task aims at extracting events occurring upon genes or gene products, which are typed as \"Protein\"\nwithout differentiating genes from gene products. Other types of physical entities, e.g. cells, cell components,\nare not differentiated from each other, and their type is given as \"Entity\".\n", "dataset_name": "bigbio/bionlp_st_2011_ge"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bionlp_st_2013_gro": {"dataset_name": "bigbio/bionlp_st_2013_gro", "description": "GRO Task: Populating the Gene Regulation Ontology with events and\nrelations. 
A data set from the bio NLP shared tasks competition from 2013", "downloads": 20, "configs": {"bionlp_st_2013_gro_source": {"config_name": "bionlp_st_2013_gro_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10025957\\\"\", \"text\": \"\\\"UCP4, a novel brain-specific mitochondrial protei...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[0, 4...\", \"events\": \"[{\\\"trigger\\\": \\\"T11\\\", \\\"id\\\": \\\"E1\\\", \\\"type\\\": \\\"CellularP...\", \"relations\": \"[{\\\"id\\\": \\\"R3\\\", \\\"type\\\": \\\"locatedIn\\\", \\\"head\\\": {\\\"role\\\"...\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "GRO Task: Populating the Gene Regulation Ontology with events and\nrelations. 
A data set from the bio NLP shared tasks competition from 2013\n", "dataset_name": "bigbio/bionlp_st_2013_gro"}, "bionlp_st_2013_gro_bigbio_kb": {"config_name": "bionlp_st_2013_gro_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10025957\\\"\", \"passages\": \"[{\\\"id\\\": \\\"PMID-10025957__text\\\", \\\"type\\\": \\\"abstract\\\",...\", \"entities\": \"[{\\\"id\\\": \\\"PMID-10025957_T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"of...\", \"events\": \"[{\\\"id\\\": \\\"PMID-10025957_E1\\\", \\\"type\\\": \\\"CellularProce...\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"arg1_id\\\": \\\"PMID-10025957_T4\\\", \\\"arg2_id\\\": \\\"PMID-...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "GRO Task: Populating the Gene Regulation Ontology with events and\nrelations. A data set from the bio NLP shared tasks competition from 2013\n", "dataset_name": "bigbio/bionlp_st_2013_gro"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bionlp_st_2013_pc": {"dataset_name": "bigbio/bionlp_st_2013_pc", "description": "the Pathway Curation (PC) task is a main event extraction task of the BioNLP shared task (ST) 2013.\nThe PC task concerns the automatic extraction of biomolecular reactions from text.\nThe task setting, representation and semantics are defined with respect to pathway\nmodel standards and ontologies (SBML, BioPAX, SBO) and documents selected by relevance\nto specific model reactions. Two BioNLP ST 2013 participants successfully completed\nthe PC task. 
The highest achieved F-score, 52.8%, indicates that event extraction is\na promising approach to supporting pathway curation efforts.", "downloads": 59, "configs": {"bionlp_st_2013_pc_source": {"config_name": "bionlp_st_2013_pc_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10085159\\\"\", \"text\": \"\\\"The Cdc6 protein is ubiquitinated in vivo for pro...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T1\\\", \\\"type\\\": \\\"Gene_or_gene_product\\\", \\\"off...\", \"events\": \"[{\\\"trigger\\\": \\\"T15\\\", \\\"id\\\": \\\"E1\\\", \\\"type\\\": \\\"Ubiquitin...\", \"relations\": \"[]\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "the Pathway Curation (PC) task is a main event extraction task of the BioNLP shared task (ST) 2013.\nThe PC task concerns the automatic extraction of biomolecular reactions from text.\nThe task setting, representation and semantics are defined with respect to pathway\nmodel standards and ontologies (SBML, BioPAX, SBO) and documents selected by relevance\nto specific model reactions. Two BioNLP ST 2013 participants successfully completed\nthe PC task. 
The highest achieved F-score, 52.8%, indicates that event extraction is\na promising approach to supporting pathway curation efforts.\n", "dataset_name": "bigbio/bionlp_st_2013_pc"}, "bionlp_st_2013_pc_bigbio_kb": {"config_name": "bionlp_st_2013_pc_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10085159\\\"\", \"passages\": \"[{\\\"id\\\": \\\"PMID-10085159__text\\\", \\\"type\\\": \\\"abstract\\\",...\", \"entities\": \"[{\\\"id\\\": \\\"PMID-10085159_T1\\\", \\\"type\\\": \\\"Gene_or_gene_...\", \"events\": \"[{\\\"id\\\": \\\"PMID-10085159_E1\\\", \\\"type\\\": \\\"Ubiquitinatio...\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "the Pathway Curation (PC) task is a main event extraction task of the BioNLP shared task (ST) 2013.\nThe PC task concerns the automatic extraction of biomolecular reactions from text.\nThe task setting, representation and semantics are defined with respect to pathway\nmodel standards and ontologies (SBML, BioPAX, SBO) and documents selected by relevance\nto specific model reactions. Two BioNLP ST 2013 participants successfully completed\nthe PC task. 
The highest achieved F-score, 52.8%, indicates that event extraction is\na promising approach to supporting pathway curation efforts.\n", "dataset_name": "bigbio/bionlp_st_2013_pc"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bionlp_st_2019_bb": {"dataset_name": "bigbio/bionlp_st_2019_bb", "description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.", "downloads": 107, "configs": {"bionlp_st_2019_bb_norm_source": {"config_name": "bionlp_st_2019_bb_norm_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-norm-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[]\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[{\\\"id\\\": \\\"N1\\\", \\\"ref_id\\\": \\\"T3\\\", \\\"resource_name\\\": \\\"On...\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources 
(NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_norm+ner_source": {"config_name": "bionlp_st_2019_bb_norm+ner_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-norm+ner-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[]\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[{\\\"id\\\": \\\"N1\\\", \\\"ref_id\\\": \\\"T3\\\", \\\"resource_name\\\": \\\"On...\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). 
The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_rel_source": {"config_name": "bionlp_st_2019_bb_rel_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-rel-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Lives_In\\\", \\\"head\\\": {\\\"role\\\":...\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). 
The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_rel+ner_source": {"config_name": "bionlp_st_2019_bb_rel+ner_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-rel+ner-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Lives_In\\\", \\\"head\\\": {\\\"role\\\":...\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). 
The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_kb_source": {"config_name": "bionlp_st_2019_bb_kb_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-kb-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Lives_In\\\", \\\"head\\\": {\\\"role\\\":...\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[{\\\"id\\\": \\\"N1\\\", \\\"ref_id\\\": \\\"T3\\\", \\\"resource_name\\\": \\\"On...\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). 
The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_kb+ner_source": {"config_name": "bionlp_st_2019_bb_kb+ner_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-kb+ner-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Lives_In\\\", \\\"head\\\": {\\\"role\\\":...\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[{\\\"id\\\": \\\"N1\\\", \\\"ref_id\\\": \\\"T3\\\", \\\"resource_name\\\": \\\"On...\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). 
The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_bigbio_kb": {"config_name": "bionlp_st_2019_bb_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-kb+ner-1016123\\\"\", \"passages\": \"[{\\\"id\\\": \\\"BB-kb+ner-1016123__text\\\", \\\"type\\\": \\\"abstra...\", \"entities\": \"[{\\\"id\\\": \\\"BB-kb+ner-1016123_T3\\\", \\\"type\\\": \\\"Habitat\\\",...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"arg1_id\\\": \\\"BB-kb+ner-1016123_T5\\\", \\\"arg2_id\\\": \\\"B...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/biorelex": {"dataset_name": "bigbio/biorelex", "description": "BioRelEx is a biological relation extraction dataset. Version 1.0 contains 2010\nannotated sentences that describe binding interactions between various\nbiological entities (proteins, chemicals, etc.). 1405 sentences are for\ntraining, another 201 sentences are for validation. They are publicly available\nat https://github.com/YerevaNN/BioRelEx/releases. 
Another 404 sentences are for\ntesting which are kept private for at this Codalab competition\nhttps://competitions.codalab.org/competitions/20468. All sentences contain words\n\"bind\", \"bound\" or \"binding\". For every sentence we provide: 1) Complete\nannotations of all biological entities that appear in the sentence 2) Entity\ntypes (32 types) and grounding information for most of the proteins and families\n(links to uniprot, interpro and other databases) 3) Coreference between entities\nin the same sentence (e.g. abbreviations and synonyms) 4) Binding interactions\nbetween the annotated entities 5) Binding interaction types: positive, negative\n(A does not bind B) and neutral (A may bind to B)", "downloads": 68, "configs": {"biorelex_source": {"config_name": "biorelex_source", "sample_row": "{\"paperid\": \"\\\"24813911\\\"\", \"interactions\": \"[{\\\"participants\\\": [0, 2], \\\"type\\\": \\\"bind\\\", \\\"implici...\", \"url\": \"\\\"http://molpharm.aspetjournals.org/content/53/6/10...\", \"text\": \"\\\"Moreover, the in vitro binding of NF-\\\\u03baB or S...\", \"entities\": \"[{\\\"is_state\\\": false, \\\"label\\\": \\\"DNA\\\", \\\"names\\\": [{\\\"i...\", \"_line_\": \"7\", \"id\": \"\\\"1.0alpha7.train.0\\\"\"}", "columns": ["paperid", "interactions", "url", "text", "entities", "_line_", "id"], "columns_mapping": {"paperid": "paperid", "interactions": "interactions", "url": "url", "text": "text", "entities": "entities", "_line_": "_line_", "id": "id"}, "dataset_description": "BioRelEx is a biological relation extraction dataset. Version 1.0 contains 2010\nannotated sentences that describe binding interactions between various\nbiological entities (proteins, chemicals, etc.). 1405 sentences are for\ntraining, another 201 sentences are for validation. They are publicly available\nat https://github.com/YerevaNN/BioRelEx/releases. 
Another 404 sentences are for\ntesting which are kept private for at this Codalab competition\nhttps://competitions.codalab.org/competitions/20468. All sentences contain words\n\"bind\", \"bound\" or \"binding\". For every sentence we provide: 1) Complete\nannotations of all biological entities that appear in the sentence 2) Entity\ntypes (32 types) and grounding information for most of the proteins and families\n(links to uniprot, interpro and other databases) 3) Coreference between entities\nin the same sentence (e.g. abbreviations and synonyms) 4) Binding interactions\nbetween the annotated entities 5) Binding interaction types: positive, negative\n(A does not bind B) and neutral (A may bind to B)", "dataset_name": "bigbio/biorelex"}, "biorelex_bigbio_kb": {"config_name": "biorelex_bigbio_kb", "sample_row": "{\"id\": \"\\\"1.0alpha7.train.0\\\"\", \"document_id\": \"\\\"24813911\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1.0alpha7.train.0.sent\\\", \\\"type\\\": \\\"sentenc...\", \"entities\": \"[{\\\"id\\\": \\\"1.0alpha7.train.0.ent0_0\\\", \\\"type\\\": \\\"DNA\\\",...\", \"events\": \"[]\", \"coreferences\": \"[{\\\"id\\\": \\\"1.0alpha7.train.0.coref0\\\", \\\"entity_ids\\\": ...\", \"relations\": \"[{\\\"id\\\": \\\"1.0alpha7.train.0.rel0s1.0alpha7.train.0....\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "BioRelEx is a biological relation extraction dataset. Version 1.0 contains 2010\nannotated sentences that describe binding interactions between various\nbiological entities (proteins, chemicals, etc.). 1405 sentences are for\ntraining, another 201 sentences are for validation. They are publicly available\nat https://github.com/YerevaNN/BioRelEx/releases. 
Another 404 sentences are for\ntesting which are kept private for at this Codalab competition\nhttps://competitions.codalab.org/competitions/20468. All sentences contain words\n\"bind\", \"bound\" or \"binding\". For every sentence we provide: 1) Complete\nannotations of all biological entities that appear in the sentence 2) Entity\ntypes (32 types) and grounding information for most of the proteins and families\n(links to uniprot, interpro and other databases) 3) Coreference between entities\nin the same sentence (e.g. abbreviations and synonyms) 4) Binding interactions\nbetween the annotated entities 5) Binding interaction types: positive, negative\n(A does not bind B) and neutral (A may bind to B)", "dataset_name": "bigbio/biorelex"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/chebi_nactem": {"dataset_name": "bigbio/chebi_nactem", "description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. 
In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.", "downloads": 120, "configs": {"chebi_nactem_abstr_ann1_source": {"config_name": "chebi_nactem_abstr_ann1_source", "sample_row": "{\"document_id\": \"\\\"10026165\\\"\", \"text\": \"\\\"3,4-Dihydroxyphenylalanine (Dopa) decarboxylase i...\", \"entities\": \"[{\\\"id\\\": \\\"T4\\\", \\\"type\\\": \\\"Protein\\\", \\\"text\\\": \\\"['3,4-Di...\", \"relations\": \"[{\\\"id\\\": \\\"R4\\\", \\\"type\\\": \\\"Associated_With\\\", \\\"arg1\\\": \\\"...\"}", "columns": ["document_id", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}, "chebi_nactem_abstr_ann1_bigbio_kb": {"config_name": "chebi_nactem_abstr_ann1_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"10026165\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"\\\", \\\"text\\\": [\\\"3,4-Dihydroxyph...\", \"entities\": \"[{\\\"id\\\": \\\"1_T4\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[0,...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"1_R4\\\", \\\"type\\\": \\\"Associated_With\\\", \\\"arg1_i...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 
annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}, "chebi_nactem_abstr_ann2_source": {"config_name": "chebi_nactem_abstr_ann2_source", "sample_row": "{\"document_id\": \"\\\"10026165\\\"\", \"text\": \"\\\"3,4-Dihydroxyphenylalanine (Dopa) decarboxylase i...\", \"entities\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Protein\\\", \\\"text\\\": \\\"['3,4-Di...\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Associated_With\\\", \\\"arg1\\\": \\\"...\"}", "columns": ["document_id", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. 
In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}, "chebi_nactem_abstr_ann2_bigbio_kb": {"config_name": "chebi_nactem_abstr_ann2_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"10026165\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"\\\", \\\"text\\\": [\\\"3,4-Dihydroxyph...\", \"entities\": \"[{\\\"id\\\": \\\"1_T3\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[0,...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"1_R1\\\", \\\"type\\\": \\\"Associated_With\\\", \\\"arg1_i...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. 
In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}, "chebi_nactem_fullpaper_source": {"config_name": "chebi_nactem_fullpaper_source", "sample_row": "{\"document_id\": \"\\\"10023770\\\"\", \"text\": \"\\\" The dogma of exclusive T cell recognition of pep...\", \"entities\": \"[{\\\"id\\\": \\\"T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"text\\\": \\\"['major ...\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Binds_With\\\", \\\"arg1\\\": \\\"T156\\\"...\"}", "columns": ["document_id", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}, "chebi_nactem_fullpaper_bigbio_kb": {"config_name": "chebi_nactem_fullpaper_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"10023770\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"\\\", \\\"text\\\": [\\\" The dogma of e...\", \"entities\": \"[{\\\"id\\\": \\\"1_T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[54...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"1_R1\\\", \\\"type\\\": \\\"Binds_With\\\", \\\"arg1_id\\\": \\\"...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated 
abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/chemprot": {"dataset_name": "bigbio/chemprot", "description": "The BioCreative VI Chemical-Protein interaction dataset identifies entities of\nchemicals and proteins and their likely relation to one other. Compounds are\ngenerally agonists (activators) or antagonists (inhibitors) of proteins.", "downloads": 454, "configs": {"chemprot_full_source": {"config_name": "chemprot_full_source", "sample_row": "{\"pmid\": \"\\\"16357751\\\"\", \"text\": \"\\\"Selective costimulation modulators: a novel appro...\", \"entities.id\": \"[\\\"T1\\\", \\\"T2\\\", \\\"T3\\\", \\\"T4\\\", \\\"T5\\\"]\", \"entities.type\": \"[\\\"CHEMICAL\\\", \\\"GENE-N\\\", \\\"GENE-Y\\\", \\\"GENE-Y\\\", \\\"GENE-N...\", \"entities.text\": \"[\\\"methotrexate\\\", \\\"tumor necrosis factor\\\", \\\"CD80\\\", ...\", \"entities.offsets\": \"[[1342, 1354], [1364, 1385], [805, 809], [810, 814...\", \"relations.type\": \"[]\", \"relations.arg1\": \"[]\", \"relations.arg2\": \"[]\"}", "columns": ["pmid", "text", "entities_id", "entities_type", "entities_text", "entities_offsets", "relations_type", "relations_arg1", "relations_arg2"], "columns_mapping": {"pmid": "pmid", "text": "text", "entities.id": "entities_id", "entities.type": "entities_type", "entities.text": "entities_text", "entities.offsets": "entities_offsets", "relations.type": "relations_type", "relations.arg1": "relations_arg1", "relations.arg2": "relations_arg2"}, "dataset_description": "The BioCreative VI Chemical-Protein interaction dataset identifies entities of\nchemicals and proteins and their likely relation to one other. 
Compounds are\ngenerally agonists (activators) or antagonists (inhibitors) of proteins.\n", "dataset_name": "bigbio/chemprot"}, "chemprot_shared_task_eval_source": {"config_name": "chemprot_shared_task_eval_source", "sample_row": "{\"pmid\": \"\\\"16357751\\\"\", \"text\": \"\\\"Selective costimulation modulators: a novel appro...\", \"entities.id\": \"[\\\"T1\\\", \\\"T2\\\", \\\"T3\\\", \\\"T4\\\", \\\"T5\\\"]\", \"entities.type\": \"[\\\"CHEMICAL\\\", \\\"GENE-N\\\", \\\"GENE-Y\\\", \\\"GENE-Y\\\", \\\"GENE-N...\", \"entities.text\": \"[\\\"methotrexate\\\", \\\"tumor necrosis factor\\\", \\\"CD80\\\", ...\", \"entities.offsets\": \"[[1342, 1354], [1364, 1385], [805, 809], [810, 814...\", \"relations.type\": \"[]\", \"relations.arg1\": \"[]\", \"relations.arg2\": \"[]\"}", "columns": ["pmid", "text", "entities_id", "entities_type", "entities_text", "entities_offsets", "relations_type", "relations_arg1", "relations_arg2"], "columns_mapping": {"pmid": "pmid", "text": "text", "entities.id": "entities_id", "entities.type": "entities_type", "entities.text": "entities_text", "entities.offsets": "entities_offsets", "relations.type": "relations_type", "relations.arg1": "relations_arg1", "relations.arg2": "relations_arg2"}, "dataset_description": "The BioCreative VI Chemical-Protein interaction dataset identifies entities of\nchemicals and proteins and their likely relation to one other. 
Compounds are\ngenerally agonists (activators) or antagonists (inhibitors) of proteins.\n", "dataset_name": "bigbio/chemprot"}, "chemprot_bigbio_kb": {"config_name": "chemprot_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"16357751\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"title and abstract\\\", \\\"text\\\":...\", \"entities\": \"[{\\\"offsets\\\": [[1342, 1354]], \\\"text\\\": [\\\"methotrexat...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The BioCreative VI Chemical-Protein interaction dataset identifies entities of\nchemicals and proteins and their likely relation to one other. Compounds are\ngenerally agonists (activators) or antagonists (inhibitors) of proteins.\n", "dataset_name": "bigbio/chemprot"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/chia": {"dataset_name": "bigbio/chia", "description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. 
This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.", "downloads": 91, "configs": {"chia_source": {"config_name": "chia_source", "sample_row": "{\"id\": \"\\\"NCT00050349_exc\\\"\", \"document_id\": \"\\\"NCT00050349\\\"\", \"text\": \"\\\"Patients with symptomatic CNS metastases or lepto...\", \"text_type\": \"\\\"exclusion\\\"\", \"entities\": \"[{\\\"id\\\": \\\"NCT00050349_exc_T1\\\", \\\"text\\\": [\\\"CNS metast...\", \"relations\": \"[{\\\"id\\\": \\\"NCT00050349_exc_R1\\\", \\\"type\\\": \\\"Has_tempora...\"}", "columns": ["id", "document_id", "text", "text_type", "entities", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_type": "text_type", "entities": "entities", "relations": "relations"}, "dataset_description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. 
This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.\n", "dataset_name": "bigbio/chia"}, "chia_fixed_source": {"config_name": "chia_fixed_source", "sample_row": "{\"id\": \"\\\"NCT00050349_exc\\\"\", \"document_id\": \"\\\"NCT00050349\\\"\", \"text\": \"\\\"Patients with symptomatic CNS metastases or lepto...\", \"text_type\": \"\\\"exclusion\\\"\", \"entities\": \"[{\\\"id\\\": \\\"NCT00050349_exc_T1\\\", \\\"text\\\": [\\\"CNS metast...\", \"relations\": \"[{\\\"id\\\": \\\"NCT00050349_exc_R1\\\", \\\"type\\\": \\\"Has_tempora...\"}", "columns": ["id", "document_id", "text", "text_type", "entities", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_type": "text_type", "entities": "entities", "relations": "relations"}, "dataset_description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. 
This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.\n", "dataset_name": "bigbio/chia"}, "chia_without_scope_source": {"config_name": "chia_without_scope_source", "sample_row": "{\"id\": \"\\\"NCT00050349_exc\\\"\", \"document_id\": \"\\\"NCT00050349\\\"\", \"text\": \"\\\"Patients with symptomatic CNS metastases or lepto...\", \"text_type\": \"\\\"exclusion\\\"\", \"entities\": \"[{\\\"id\\\": \\\"NCT00050349_exc_T1\\\", \\\"text\\\": [\\\"CNS metast...\", \"relations\": \"[{\\\"id\\\": \\\"NCT00050349_exc_R1\\\", \\\"type\\\": \\\"Has_tempora...\"}", "columns": ["id", "document_id", "text", "text_type", "entities", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_type": "text_type", "entities": "entities", "relations": "relations"}, "dataset_description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. 
This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.\n", "dataset_name": "bigbio/chia"}, "chia_without_scope_fixed_source": {"config_name": "chia_without_scope_fixed_source", "sample_row": "{\"id\": \"\\\"NCT00050349_exc\\\"\", \"document_id\": \"\\\"NCT00050349\\\"\", \"text\": \"\\\"Patients with symptomatic CNS metastases or lepto...\", \"text_type\": \"\\\"exclusion\\\"\", \"entities\": \"[{\\\"id\\\": \\\"NCT00050349_exc_T1\\\", \\\"text\\\": [\\\"CNS metast...\", \"relations\": \"[{\\\"id\\\": \\\"NCT00050349_exc_R1\\\", \\\"type\\\": \\\"Has_tempora...\"}", "columns": ["id", "document_id", "text", "text_type", "entities", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_type": "text_type", "entities": "entities", "relations": "relations"}, "dataset_description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. 
This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.\n", "dataset_name": "bigbio/chia"}, "chia_bigbio_kb": {"config_name": "chia_bigbio_kb", "sample_row": "{\"id\": \"\\\"NCT00050349_exc\\\"\", \"document_id\": \"\\\"NCT00050349\\\"\", \"passages\": \"[{\\\"id\\\": \\\"NCT00050349_exc_text\\\", \\\"type\\\": \\\"exclusion...\", \"entities\": \"[{\\\"id\\\": \\\"NCT00050349_exc_T1\\\", \\\"text\\\": [\\\"CNS metast...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"NCT00050349_exc_R1\\\", \\\"type\\\": \\\"Has_tempora...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.\n", "dataset_name": "bigbio/chia"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/ddi_corpus": {"dataset_name": "bigbio/ddi_corpus", "description": "The DDI corpus has been manually annotated with drugs and pharmacokinetics and pharmacodynamics interactions. 
It contains 1025 documents from two different sources: DrugBank database and MedLine.", "downloads": 357, "configs": {"ddi_corpus_source": {"config_name": "ddi_corpus_source", "sample_row": "{\"document_id\": \"\\\"19-norandrostenedione_ddi\\\"\", \"text\": \"\\\"No drug, nutritional supplement, food or herb int...\", \"entities\": \"[]\", \"relations\": \"[]\"}", "columns": ["document_id", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "The DDI corpus has been manually annotated with drugs and pharmacokinetics and pharmacodynamics interactions. It contains 1025 documents from two different sources: DrugBank database and MedLine.\n", "dataset_name": "bigbio/ddi_corpus"}, "ddi_corpus_bigbio_kb": {"config_name": "ddi_corpus_bigbio_kb", "sample_row": "{\"id\": \"\\\"19-norandrostenedione_ddi\\\"\", \"document_id\": \"\\\"19-norandrostenedione_ddi\\\"\", \"passages\": \"[{\\\"id\\\": \\\"19-norandrostenedione_ddi__text\\\", \\\"type\\\":...\", \"entities\": \"[]\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The DDI corpus has been manually annotated with drugs and pharmacokinetics and pharmacodynamics interactions. 
It contains 1025 documents from two different sources: DrugBank database and MedLine.\n", "dataset_name": "bigbio/ddi_corpus"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/genetag": {"dataset_name": "bigbio/genetag", "description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..", "downloads": 25, "configs": {"genetaggold_source": {"config_name": "genetaggold_source", "sample_row": "{\"doc_id\": \"\\\"@@95229799480\\\"\", \"text\": \"\\\"Cervicovaginal foetal fibronectin in the predicti...\", \"tokenized_text\": \"[\\\"Cervicovaginal\\\", \\\"foetal\\\", \\\"fibronectin\\\", \\\"in\\\", ...\", \"pos_tags\": \"[\\\"JJ\\\", \\\"NEWGENE\\\", \\\"NEWGENE\\\", \\\"IN\\\", \\\"DT\\\", \\\"NN\\\", \\\"IN...\", \"entities\": \"[{\\\"text\\\": \\\"foetal fibronectin\\\", \\\"type\\\": \\\"NEWGENE\\\",...\"}", "columns": ["doc_id", "text", "tokenized_text", "pos_tags", "entities"], "columns_mapping": {"doc_id": "doc_id", "text": "text", "tokenized_text": "tokenized_text", "pos_tags": "pos_tags", "entities": "entities"}, "dataset_description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. 
We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..\n", "dataset_name": "bigbio/genetag"}, "genetaggold_bigbio_kb": {"config_name": "genetaggold_bigbio_kb", "sample_row": "{\"id\": \"\\\"@@95229799480\\\"\", \"document_id\": \"\\\"@@95229799480\\\"\", \"passages\": \"[{\\\"id\\\": \\\"@@95229799480_text\\\", \\\"type\\\": \\\"sentence\\\", ...\", \"entities\": \"[{\\\"offsets\\\": [[15, 33]], \\\"text\\\": [\\\"foetal fibronec...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 
15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..\n", "dataset_name": "bigbio/genetag"}, "genetagcorrect_source": {"config_name": "genetagcorrect_source", "sample_row": "{\"doc_id\": \"\\\"@@95229799480\\\"\", \"text\": \"\\\"Cervicovaginal foetal fibronectin in the predicti...\", \"tokenized_text\": \"[\\\"Cervicovaginal\\\", \\\"foetal\\\", \\\"fibronectin\\\", \\\"in\\\", ...\", \"pos_tags\": \"[\\\"JJ\\\", \\\"NEWGENE\\\", \\\"NEWGENE\\\", \\\"IN\\\", \\\"DT\\\", \\\"NN\\\", \\\"IN...\", \"entities\": \"[{\\\"text\\\": \\\"fibronectin\\\", \\\"type\\\": \\\"NEWGENE\\\", \\\"token...\"}", "columns": ["doc_id", "text", "tokenized_text", "pos_tags", "entities"], "columns_mapping": {"doc_id": "doc_id", "text": "text", "tokenized_text": "tokenized_text", "pos_tags": "pos_tags", "entities": "entities"}, "dataset_description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 
15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..\n", "dataset_name": "bigbio/genetag"}, "genetagcorrect_bigbio_kb": {"config_name": "genetagcorrect_bigbio_kb", "sample_row": "{\"id\": \"\\\"@@95229799480\\\"\", \"document_id\": \"\\\"@@95229799480\\\"\", \"passages\": \"[{\\\"id\\\": \\\"@@95229799480_text\\\", \\\"type\\\": \\\"sentence\\\", ...\", \"entities\": \"[{\\\"offsets\\\": [[22, 33]], \\\"text\\\": [\\\"fibronectin\\\"], ...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..\n", "dataset_name": "bigbio/genetag"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/genia_term_corpus": {"dataset_name": "bigbio/genia_term_corpus", "description": "The identification of linguistic expressions referring to entities of interest in molecular biology such as proteins,\ngenes and cells is a fundamental task in biomolecular text mining. The GENIA technical term annotation covers the\nidentification of physical biological entities as well as other important terms. 
The corpus annotation covers the full\n1,999 abstracts of the primary GENIA corpus.", "downloads": 65, "configs": {"genia_term_corpus_source": {"config_name": "genia_term_corpus_source", "sample_row": "{\"document_id\": \"\\\"95369245\\\"\", \"title\": \"[{\\\"text\\\": \\\"IL-2 gene expression and NF-kappa B act...\", \"abstract\": \"[{\\\"text\\\": \\\"Activation of the CD28 surface receptor...\"}", "columns": ["document_id", "title", "abstract"], "columns_mapping": {"document_id": "document_id", "title": "title", "abstract": "abstract"}, "dataset_description": "The identification of linguistic expressions referring to entities of interest in molecular biology such as proteins,\ngenes and cells is a fundamental task in biomolecular text mining. The GENIA technical term annotation covers the\nidentification of physical biological entities as well as other important terms. The corpus annotation covers the full\n1,999 abstracts of the primary GENIA corpus.\n", "dataset_name": "bigbio/genia_term_corpus"}, "genia_term_corpus_bigbio_kb": {"config_name": "genia_term_corpus_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"95369245\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"IL-2 gene ...\", \"entities\": \"[{\\\"id\\\": \\\"3\\\", \\\"type\\\": \\\"other_name\\\", \\\"text\\\": [\\\"IL-2 ...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The identification of linguistic expressions referring to entities of interest in molecular biology such as proteins,\ngenes and cells is a fundamental task in biomolecular text mining. 
The GENIA technical term annotation covers the\nidentification of physical biological entities as well as other important terms. The corpus annotation covers the full\n1,999 abstracts of the primary GENIA corpus.\n", "dataset_name": "bigbio/genia_term_corpus"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/hprd50": {"dataset_name": "bigbio/hprd50", "description": "HPRD50 is a dataset of randomly selected, hand-annotated abstracts of biomedical papers\nreferenced by the Human Protein Reference Database (HPRD). It is parsed in XML format,\nsplitting each abstract into sentences, and in each sentence there may be entities and\ninteractions between those entities. In this particular dataset, entities are all\nproteins and interactions are thus protein-protein interactions.\n\nMoreover, all entities are normalized to the HPRD database. These normalized terms are\nstored in each entity's 'type' attribute in the source XML. This means the dataset can\ndetermine e.g. that \"Janus kinase 2\" and \"Jak2\" are referencing the same normalized\nentity.\n\nBecause the dataset contains entities and relations, it is suitable for Named Entity\nRecognition and Relation Extraction.", "downloads": 71, "configs": {"hprd50_source": {"config_name": "hprd50_source", "sample_row": "{\"id\": \"\\\"HPRD50.d0\\\"\", \"origId\": \"\\\"10373544\\\"\", \"set\": \"null\", \"sentences\": \"[{\\\"id\\\": \\\"HPRD50.d0.s0\\\", \\\"origId\\\": \\\"10373544.1.1\\\", ...\"}", "columns": ["id", "origId", "set", "sentences"], "columns_mapping": {"id": "id", "origId": "origId", "set": "set", "sentences": "sentences"}, "dataset_description": "HPRD50 is a dataset of randomly selected, hand-annotated abstracts of biomedical papers\nreferenced by the Human Protein Reference Database (HPRD). It is parsed in XML format,\nsplitting each abstract into sentences, and in each sentence there may be entities and\ninteractions between those entities. 
In this particular dataset, entities are all\nproteins and interactions are thus protein-protein interactions.\n\nMoreover, all entities are normalized to the HPRD database. These normalized terms are\nstored in each entity's 'type' attribute in the source XML. This means the dataset can\ndetermine e.g. that \"Janus kinase 2\" and \"Jak2\" are referencing the same normalized\nentity.\n\nBecause the dataset contains entities and relations, it is suitable for Named Entity\nRecognition and Relation Extraction.\n", "dataset_name": "bigbio/hprd50"}, "hprd50_bigbio_kb": {"config_name": "hprd50_bigbio_kb", "sample_row": "{\"id\": \"\\\"HPRD50.d0\\\"\", \"document_id\": \"\\\"10373544\\\"\", \"passages\": \"[{\\\"id\\\": \\\"HPRD50.d0.s0\\\", \\\"type\\\": \\\"sentence\\\", \\\"text\\\"...\", \"entities\": \"[{\\\"id\\\": \\\"HPRD50.d0.s0.e0\\\", \\\"text\\\": [\\\"TFIIIC102\\\"], ...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "HPRD50 is a dataset of randomly selected, hand-annotated abstracts of biomedical papers\nreferenced by the Human Protein Reference Database (HPRD). It is parsed in XML format,\nsplitting each abstract into sentences, and in each sentence there may be entities and\ninteractions between those entities. In this particular dataset, entities are all\nproteins and interactions are thus protein-protein interactions.\n\nMoreover, all entities are normalized to the HPRD database. These normalized terms are\nstored in each entity's 'type' attribute in the source XML. This means the dataset can\ndetermine e.g. 
that \"Janus kinase 2\" and \"Jak2\" are referencing the same normalized\nentity.\n\nBecause the dataset contains entities and relations, it is suitable for Named Entity\nRecognition and Relation Extraction.\n", "dataset_name": "bigbio/hprd50"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/iepa": {"dataset_name": "bigbio/iepa", "description": "The IEPA benchmark PPI corpus is designed for relation extraction. It was created from 303 PubMed abstracts, each of which contains a specific pair of co-occurring chemicals.", "downloads": 17, "configs": {"iepa_source": {"config_name": "iepa_source", "sample_row": "{\"id\": \"\\\"IEPA.d0\\\"\", \"PMID\": \"\\\"1645753\\\"\", \"origID\": \"\\\"258\\\"\", \"sentences\": \"[{\\\"id\\\": \\\"IEPA.d0.s0\\\", \\\"origID\\\": \\\"420\\\", \\\"offsets\\\": ...\"}", "columns": ["id", "PMID", "origID", "sentences"], "columns_mapping": {"id": "id", "PMID": "PMID", "origID": "origID", "sentences": "sentences"}, "dataset_description": "The IEPA benchmark PPI corpus is designed for relation extraction. 
It was created from 303 PubMed abstracts, each of which contains a specific pair of co-occurring chemicals.\n", "dataset_name": "bigbio/iepa"}, "iepa_bigbio_kb": {"config_name": "iepa_bigbio_kb", "sample_row": "{\"id\": \"\\\"IEPA.d0\\\"\", \"document_id\": \"\\\"1645753\\\"\", \"passages\": \"[{\\\"id\\\": \\\"IEPA.d0.s0\\\", \\\"type\\\": \\\"\\\", \\\"text\\\": [\\\"Oxytoc...\", \"entities\": \"[{\\\"id\\\": \\\"IEPA.d0.s0.e0\\\", \\\"text\\\": [\\\"Oxytocin\\\"], \\\"of...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"IEPA.d0.s0.i0\\\", \\\"type\\\": \\\"PPI\\\", \\\"arg1_id\\\":...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The IEPA benchmark PPI corpus is designed for relation extraction. It was created from 303 PubMed abstracts, each of which contains a specific pair of co-occurring chemicals.\n", "dataset_name": "bigbio/iepa"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/lll": {"dataset_name": "bigbio/lll", "description": "The LLL05 challenge task is to learn rules to extract protein/gene interactions from biology abstracts from the Medline\nbibliography database. The goal of the challenge is to test the ability of the participating IE systems to identify the\ninteractions and the gene/proteins that interact. The participants will test their IE patterns on a test set with the\naim of extracting the correct agent and target.The challenge focuses on information extraction of gene interactions in\nBacillus subtilis. Extracting gene interaction is the most popular event IE task in biology. 
Bacillus subtilis (Bs) is\na model bacterium and many papers have been published on direct gene interactions involved in sporulation. The gene\ninteractions are generally mentioned in the abstract and the full text of the paper is not needed. Extracting gene\ninteraction means, extracting the agent (proteins) and the target (genes) of all couples of genic interactions from\nsentences.", "downloads": 11, "configs": {"lll_source": {"config_name": "lll_source", "sample_row": "{\"id\": \"\\\"11069677-3\\\"\", \"sentence\": \"\\\"In vivo studies of the activity of four of the ki...\", \"words\": \"[{\\\"id\\\": \\\"0\\\", \\\"text\\\": \\\"In\\\", \\\"offsets\\\": [0, 2]}, {\\\"i...\", \"genic_interactions\": \"[{\\\"ref_id1\\\": \\\"29\\\", \\\"ref_id2\\\": \\\"35\\\"}, {\\\"ref_id1\\\": \\\"...\", \"agents\": \"[{\\\"ref_id\\\": \\\"29\\\"}, {\\\"ref_id\\\": \\\"31\\\"}]\", \"targets\": \"[{\\\"ref_id\\\": \\\"35\\\"}]\", \"lemmas\": \"[{\\\"ref_id\\\": \\\"0\\\", \\\"lemma\\\": \\\"in\\\"}, {\\\"ref_id\\\": \\\"1\\\", \\\"...\", \"syntactic_relations\": \"[{\\\"type\\\": \\\"comp_in:N-N\\\", \\\"ref_id1\\\": \\\"2\\\", \\\"ref_id2\\\"...\"}", "columns": ["id", "sentence", "words", "genic_interactions", "agents", "targets", "lemmas", "syntactic_relations"], "columns_mapping": {"id": "id", "sentence": "sentence", "words": "words", "genic_interactions": "genic_interactions", "agents": "agents", "targets": "targets", "lemmas": "lemmas", "syntactic_relations": "syntactic_relations"}, "dataset_description": "The LLL05 challenge task is to learn rules to extract protein/gene interactions from biology abstracts from the Medline\nbibliography database. The goal of the challenge is to test the ability of the participating IE systems to identify the\ninteractions and the gene/proteins that interact. 
The participants will test their IE patterns on a test set with the\naim of extracting the correct agent and target.The challenge focuses on information extraction of gene interactions in\nBacillus subtilis. Extracting gene interaction is the most popular event IE task in biology. Bacillus subtilis (Bs) is\na model bacterium and many papers have been published on direct gene interactions involved in sporulation. The gene\ninteractions are generally mentioned in the abstract and the full text of the paper is not needed. Extracting gene\ninteraction means, extracting the agent (proteins) and the target (genes) of all couples of genic interactions from\nsentences.\n", "dataset_name": "bigbio/lll"}, "lll_bigbio_kb": {"config_name": "lll_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"11069677-3\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"\\\", \\\"text\\\": [\\\"In...\", \"entities\": \"[{\\\"id\\\": \\\"0-agent-29\\\", \\\"type\\\": \\\"agent\\\", \\\"text\\\": [\\\"K...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"2\\\", \\\"type\\\": \\\"genic_interaction\\\", \\\"arg1_id...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The LLL05 challenge task is to learn rules to extract protein/gene interactions from biology abstracts from the Medline\nbibliography database. The goal of the challenge is to test the ability of the participating IE systems to identify the\ninteractions and the gene/proteins that interact. The participants will test their IE patterns on a test set with the\naim of extracting the correct agent and target.The challenge focuses on information extraction of gene interactions in\nBacillus subtilis. 
Extracting gene interaction is the most popular event IE task in biology. Bacillus subtilis (Bs) is\na model bacterium and many papers have been published on direct gene interactions involved in sporulation. The gene\ninteractions are generally mentioned in the abstract and the full text of the paper is not needed. Extracting gene\ninteraction means, extracting the agent (proteins) and the target (genes) of all couples of genic interactions from\nsentences.\n", "dataset_name": "bigbio/lll"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/med_qa": {"dataset_name": "bigbio/med_qa", "description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.", "downloads": 2239, "configs": {"med_qa_en_source": {"config_name": "med_qa_en_source", "sample_row": "{\"meta_info\": \"\\\"step2&3\\\"\", \"question\": \"\\\"A 23-year-old pregnant woman at 22 weeks gestatio...\", \"answer_idx\": \"\\\"E\\\"\", \"answer\": \"\\\"Nitrofurantoin\\\"\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"Ampicillin\\\"}, {\\\"key\\\": \\\"B\\\",...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. 
It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_en_bigbio_qa": {"config_name": "med_qa_en_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"A 23-year-old pregnant woman at 22 weeks gestatio...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"Ampicillin\\\", \\\"Ceftriaxone\\\", \\\"Ciprofloxacin\\\", \\\"Do...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"Nitrofurantoin\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_en_4options_source": {"config_name": "med_qa_en_4options_source", "sample_row": "{\"meta_info\": \"\\\"step2&3\\\"\", \"question\": \"\\\"A 23-year-old pregnant woman at 22 weeks gestatio...\", \"answer_idx\": \"\\\"D\\\"\", \"answer\": \"\\\"Nitrofurantoin\\\"\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"Ampicillin\\\"}, {\\\"key\\\": \\\"B\\\",...\", \"metamap_phrases\": \"[\\\"23 year old pregnant woman\\\", \\\"weeks presents\\\", \\\"...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options", "metamap_phrases"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options", "metamap_phrases": "metamap_phrases"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_en_4options_bigbio_qa": {"config_name": "med_qa_en_4options_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"A 23-year-old pregnant woman at 22 weeks gestatio...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"Ampicillin\\\", \\\"Ceftriaxone\\\", \\\"Doxycycline\\\", \\\"Nitr...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"Nitrofurantoin\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_zh_source": {"config_name": "med_qa_zh_source", "sample_row": "{\"meta_info\": \"\\\"\\\\u7b2c\\\\u4e09\\\\u90e8\\\\u5206\\\\u3000\\\\u7cbe\\\\u795e\\\\u795e\\\\...\", \"question\": \"\\\"\\\\u5367\\\\u4f4d\\\\u8170\\\\u690e\\\\u7a7f\\\\u523a\\\\uff0c\\\\u8111\\\\...\", \"answer_idx\": \"\\\"B\\\"\", \"answer\": \"\\\"80\\\\uff5e180mmH2O\\\\uff080.78\\\\uff5e1.76kPa\\\\uff09\\\"\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"190\\\\uff5e220mmH2O\\\\uff081.8...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_zh_bigbio_qa": {"config_name": "med_qa_zh_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"\\\\u5367\\\\u4f4d\\\\u8170\\\\u690e\\\\u7a7f\\\\u523a\\\\uff0c\\\\u8111\\\\...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"190\\\\uff5e220mmH2O\\\\uff081.86\\\\uff5e2.16kPa\\\\uff09\\\",...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"80\\\\uff5e180mmH2O\\\\uff080.78\\\\uff5e1.76kPa\\\\uff09\\\"]...\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_zh_4options_source": {"config_name": "med_qa_zh_4options_source", "sample_row": "{\"meta_info\": \"\\\"\\\\u7b2c\\\\u4e09\\\\u90e8\\\\u5206\\\\u3000\\\\u7cbe\\\\u795e\\\\u795e\\\\...\", \"question\": \"\\\"\\\\u5367\\\\u4f4d\\\\u8170\\\\u690e\\\\u7a7f\\\\u523a\\\\uff0c\\\\u8111\\\\...\", \"answer_idx\": \"\\\"A\\\"\", \"answer\": \"\\\"80\\\\uff5e180mmH2O\\\\uff080.78\\\\uff5e1.76kPa\\\\uff09\\\"\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"80\\\\uff5e180mmH2O\\\\uff080.78...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_zh_4options_bigbio_qa": {"config_name": "med_qa_zh_4options_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"\\\\u5367\\\\u4f4d\\\\u8170\\\\u690e\\\\u7a7f\\\\u523a\\\\uff0c\\\\u8111\\\\...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"80\\\\uff5e180mmH2O\\\\uff080.78\\\\uff5e1.76kPa\\\\uff09\\\", ...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"80\\\\uff5e180mmH2O\\\\uff080.78\\\\uff5e1.76kPa\\\\uff09\\\"]...\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_source": {"config_name": "med_qa_tw_source", "sample_row": "{\"meta_info\": \"\\\"taiwanese_test_Q\\\"\", \"question\": \"\\\"\\\\u4e0b\\\\u5217\\\\u4f55\\\\u8005\\\\u4e0d\\\\u662f\\\\u75c5\\\\u4eba\\\\...\", \"answer_idx\": \"\\\"C\\\"\", \"answer\": \"\\\"\\\\u97cc\\\\u5e36\\\\u9b06\\\\u5f1b\\\\uff0c\\\\u5ef6\\\\u5c55\\\\u6027\\\\...\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"\\\\u808c\\\\u8089\\\\u840e\\\\u7e2e\\\"}...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_bigbio_qa": {"config_name": "med_qa_tw_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"\\\\u4e0b\\\\u5217\\\\u4f55\\\\u8005\\\\u4e0d\\\\u662f\\\\u75c5\\\\u4eba\\\\...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"\\\\u808c\\\\u8089\\\\u840e\\\\u7e2e\\\", \\\"\\\\u808c\\\\u529b\\\\u6e1b\\\\u...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"\\\\u97cc\\\\u5e36\\\\u9b06\\\\u5f1b\\\\uff0c\\\\u5ef6\\\\u5c55\\\\u6027...\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_en_source": {"config_name": "med_qa_tw_en_source", "sample_row": "{\"meta_info\": \"\\\"taiwanese_test_Q\\\"\", \"question\": \"\\\"After the reaction physiology Which is not bedrid...\", \"answer_idx\": \"\\\"C\\\"\", \"answer\": \"\\\"Ligamentous laxity, increased ductility\\\"\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"Muscle atrophy\\\"}, {\\\"key\\\": ...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_en_bigbio_qa": {"config_name": "med_qa_tw_en_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"After the reaction physiology Which is not bedrid...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"Muscle atrophy\\\", \\\"Weakness\\\", \\\"Ligamentous laxity...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"Ligamentous laxity, increased ductility\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_zh_source": {"config_name": "med_qa_tw_zh_source", "sample_row": "{\"meta_info\": \"\\\"taiwanese_test_Q\\\"\", \"question\": \"\\\"\\\\u4e0b\\\\u5217\\\\u4f55\\\\u8005\\\\u4e0d\\\\u662f\\\\u75c5\\\\u4eba\\\\...\", \"answer_idx\": \"\\\"C\\\"\", \"answer\": \"\\\"\\\\u97e7\\\\u5e26\\\\u677e\\\\u5f1b\\\\uff0c\\\\u5ef6\\\\u5c55\\\\u6027\\\\...\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"\\\\u808c\\\\u8089\\\\u840e\\\\u7f29\\\"}...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_zh_bigbio_qa": {"config_name": "med_qa_tw_zh_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"\\\\u4e0b\\\\u5217\\\\u4f55\\\\u8005\\\\u4e0d\\\\u662f\\\\u75c5\\\\u4eba\\\\...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"\\\\u808c\\\\u8089\\\\u840e\\\\u7f29\\\", \\\"\\\\u808c\\\\u529b\\\\u51cf\\\\u...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"\\\\u97e7\\\\u5e26\\\\u677e\\\\u5f1b\\\\uff0c\\\\u5ef6\\\\u5c55\\\\u6027...\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. 
Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}}, "tags": ["multilinguality:multilingual", "language:en", "language:zh"], "is_gated": false}, "bigbio/medmentions": {"dataset_name": "bigbio/medmentions", "description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. 
The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.", "downloads": 367, "configs": {"medmentions_full_source": {"config_name": "medmentions_full_source", "sample_row": "{\"pmid\": \"\\\"25763772\\\"\", \"passages\": \"[{\\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"DCTN4 as a modifier o...\", \"entities\": \"[{\\\"offsets\\\": [[0, 5]], \\\"text\\\": [\\\"DCTN4\\\"], \\\"semanti...\"}", "columns": ["pmid", "passages", "entities"], "columns_mapping": {"pmid": "pmid", "passages": "passages", "entities": "entities"}, "dataset_description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. 
The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.\n", "dataset_name": "bigbio/medmentions"}, "medmentions_full_bigbio_kb": {"config_name": "medmentions_full_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"25763772\\\"\", \"passages\": \"[{\\\"id\\\": \\\"110\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"DCTN4 as...\", \"entities\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"T116\\\", \\\"text\\\": [\\\"DCTN4\\\"], \\\"o...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. 
Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.\n", "dataset_name": "bigbio/medmentions"}, "medmentions_st21pv_source": {"config_name": "medmentions_st21pv_source", "sample_row": "{\"pmid\": \"\\\"25763772\\\"\", \"passages\": \"[{\\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"DCTN4 as a modifier o...\", \"entities\": \"[{\\\"offsets\\\": [[0, 5]], \\\"text\\\": [\\\"DCTN4\\\"], \\\"semanti...\"}", "columns": ["pmid", "passages", "entities"], "columns_mapping": {"pmid": "pmid", "passages": "passages", "entities": "entities"}, "dataset_description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. 
The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.\n", "dataset_name": "bigbio/medmentions"}, "medmentions_st21pv_bigbio_kb": {"config_name": "medmentions_st21pv_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"25763772\\\"\", \"passages\": \"[{\\\"id\\\": \\\"67\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"DCTN4 as ...\", \"entities\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"T103\\\", \\\"text\\\": [\\\"DCTN4\\\"], \\\"o...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. 
Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.\n", "dataset_name": "bigbio/medmentions"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/meqsum": {"dataset_name": "bigbio/meqsum", "description": "Dataset for medical question summarization introduced in the ACL 2019 paper \"On the Summarization of Consumer Health\nQuestions\". Question understanding is one of the main challenges in question answering. In real world applications,\nusers often submit natural language questions that are longer than needed and include peripheral information that\nincreases the complexity of the question, leading to substantially more false positives in answer retrieval. In this\npaper, we study neural abstractive models for medical question summarization. We introduce the MeQSum corpus of 1,000\nsummarized consumer health questions.", "downloads": 24, "configs": {"meqsum_source": {"config_name": "meqsum_source", "sample_row": "{\"File\": \"\\\"1-131188152.xml.txt\\\"\", \"CHQ\": \"\\\"SUBJECT: who and where to get cetirizine - D\\\\nMES...\", \"Summary\": \"\\\"Who manufactures cetirizine?\\\"\"}", "columns": ["File", "CHQ", "Summary"], "columns_mapping": {"File": "File", "CHQ": "CHQ", "Summary": "Summary"}, "dataset_description": "Dataset for medical question summarization introduced in the ACL 2019 paper \"On the Summarization of Consumer Health\nQuestions\". Question understanding is one of the main challenges in question answering. In real world applications,\nusers often submit natural language questions that are longer than needed and include peripheral information that\nincreases the complexity of the question, leading to substantially more false positives in answer retrieval. In this\npaper, we study neural abstractive models for medical question summarization. 
We introduce the MeQSum corpus of 1,000\nsummarized consumer health questions.\n", "dataset_name": "bigbio/meqsum"}, "meqsum_bigbio_t2t": {"config_name": "meqsum_bigbio_t2t", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"1-131188152.xml.txt\\\"\", \"text_1\": \"\\\"SUBJECT: who and where to get cetirizine - D\\\\nMES...\", \"text_2\": \"\\\"Who manufactures cetirizine?\\\"\", \"text_1_name\": \"\\\"\\\"\", \"text_2_name\": \"\\\"\\\"\"}", "columns": ["id", "document_id", "text_1", "text_2", "text_1_name", "text_2_name"], "columns_mapping": {"id": "id", "document_id": "document_id", "text_1": "text_1", "text_2": "text_2", "text_1_name": "text_1_name", "text_2_name": "text_2_name"}, "dataset_description": "Dataset for medical question summarization introduced in the ACL 2019 paper \"On the Summarization of Consumer Health\nQuestions\". Question understanding is one of the main challenges in question answering. In real world applications,\nusers often submit natural language questions that are longer than needed and include peripheral information that\nincreases the complexity of the question, leading to substantially more false positives in answer retrieval. In this\npaper, we study neural abstractive models for medical question summarization. We introduce the MeQSum corpus of 1,000\nsummarized consumer health questions.\n", "dataset_name": "bigbio/meqsum"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/mirna": {"dataset_name": "bigbio/mirna", "description": "The corpus consists of 301 Medline citations. The documents were screened for\nmentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually\nannotated. 
The corpus comprises of two separate files, a train and a test set, coming\nfrom 201 and 100 documents respectively.", "downloads": 43, "configs": {"mirna_source": {"config_name": "mirna_source", "sample_row": "{\"passages\": \"[{\\\"document_id\\\": \\\"miRNA-corp.d0\\\", \\\"type\\\": \\\"title\\\",...\"}", "columns": ["passages"], "columns_mapping": {"passages": "passages"}, "dataset_description": "The corpus consists of 301 Medline citations. The documents were screened for\nmentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually\nannotated. The corpus comprises of two separate files, a train and a test set, coming\nfrom 201 and 100 documents respectively. \n", "dataset_name": "bigbio/mirna"}, "mirna_bigbio_kb": {"config_name": "mirna_bigbio_kb", "sample_row": "{\"id\": \"\\\"36\\\"\", \"document_id\": \"\\\"miRNA-corp.d0\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"Identifica...\", \"entities\": \"[{\\\"id\\\": \\\"7\\\", \\\"type\\\": \\\"Non-Specific_miRNAs\\\", \\\"text\\\"...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"27\\\", \\\"type\\\": \\\"Non-Specific_miRNAs-Disease...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The corpus consists of 301 Medline citations. The documents were screened for\nmentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually\nannotated. The corpus comprises of two separate files, a train and a test set, coming\nfrom 201 and 100 documents respectively. 
\n", "dataset_name": "bigbio/mirna"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/mqp": {"dataset_name": "bigbio/mqp", "description": "Medical Question Pairs dataset by McCreery et al (2020) contains pairs of medical questions and paraphrased versions of \nthe question prepared by medical professional. Paraphrased versions were labelled as similar (syntactically dissimilar \nbut contextually similar ) or dissimilar (syntactically may look similar but contextually dissimilar). Labels 1: similar, 0: dissimilar", "downloads": 140, "configs": {"mqp_source": {"config_name": "mqp_source", "sample_row": "{\"document_id\": \"\\\"1\\\"\", \"text_1\": \"\\\"After how many hour from drinking an antibiotic c...\", \"text_2\": \"\\\"I have a party tonight and I took my last dose of...\", \"label\": \"\\\"1\\\"\"}", "columns": ["document_id", "text_1", "text_2", "label"], "columns_mapping": {"document_id": "document_id", "text_1": "text_1", "text_2": "text_2", "label": "label"}, "dataset_description": "Medical Question Pairs dataset by McCreery et al (2020) contains pairs of medical questions and paraphrased versions of \nthe question prepared by medical professional. Paraphrased versions were labelled as similar (syntactically dissimilar \nbut contextually similar ) or dissimilar (syntactically may look similar but contextually dissimilar). 
Labels 1: similar, 0: dissimilar\n", "dataset_name": "bigbio/mqp"}, "mqp_bigbio_pairs": {"config_name": "mqp_bigbio_pairs", "sample_row": "{\"id\": \"\\\"1\\\"\", \"document_id\": \"\\\"1\\\"\", \"text_1\": \"\\\"After how many hour from drinking an antibiotic c...\", \"text_2\": \"\\\"I have a party tonight and I took my last dose of...\", \"label\": \"\\\"1\\\"\"}", "columns": ["id", "document_id", "text_1", "text_2", "label"], "columns_mapping": {"id": "id", "document_id": "document_id", "text_1": "text_1", "text_2": "text_2", "label": "label"}, "dataset_description": "Medical Question Pairs dataset by McCreery et al (2020) contains pairs of medical questions and paraphrased versions of \nthe question prepared by medical professional. Paraphrased versions were labelled as similar (syntactically dissimilar \nbut contextually similar ) or dissimilar (syntactically may look similar but contextually dissimilar). Labels 1: similar, 0: dissimilar\n", "dataset_name": "bigbio/mqp"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "greek_legal_code": {"dataset_name": "greek_legal_code", "description": "Greek_Legal_Code contains 47k classified legal resources from Greek Legislation. Its origin is \u201cPermanent Greek Legislation Code - Raptarchis\u201d,\na collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.", "downloads": 661, "configs": {"volume": {"config_name": "volume", "sample_row": "{\"text\": \"\\\"5. \\\\u0391\\\\u039d\\\\u0391\\\\u0393\\\\u039a. \\\\u039d\\\\u039f\\\\u...\", \"label\": \"41\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Greek_Legal_Code contains 47k classified legal resources from Greek Legislation. 
Its origin is \u201cPermanent Greek Legislation Code - Raptarchis\u201d,\na collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.\n", "dataset_name": "greek_legal_code"}, "chapter": {"config_name": "chapter", "sample_row": "{\"text\": \"\\\"5. \\\\u0391\\\\u039d\\\\u0391\\\\u0393\\\\u039a. \\\\u039d\\\\u039f\\\\u...\", \"label\": \"239\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Greek_Legal_Code contains 47k classified legal resources from Greek Legislation. Its origin is \u201cPermanent Greek Legislation Code - Raptarchis\u201d,\na collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.\n", "dataset_name": "greek_legal_code"}, "subject": {"config_name": "subject", "sample_row": "{\"text\": \"\\\"5. \\\\u0391\\\\u039d\\\\u0391\\\\u0393\\\\u039a. \\\\u039d\\\\u039f\\\\u...\", \"label\": \"1405\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Greek_Legal_Code contains 47k classified legal resources from Greek Legislation. Its origin is \u201cPermanent Greek Legislation Code - Raptarchis\u201d,\na collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.\n", "dataset_name": "greek_legal_code"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:topic-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:el"], "is_gated": false}, "harem": {"dataset_name": "harem", "description": "The HAREM is a Portuguese language corpus commonly used for Named Entity Recognition tasks. It includes about 93k words, from 129 different texts,\nfrom several genres, and language varieties. 
The split of this dataset version follows the division made by [1], where 7% HAREM\ndocuments are the validation set and the miniHAREM corpus (with about 65k words) is the test set. There are two versions of the dataset set,\na version that has a total of 10 different named entity classes (Person, Organization, Location, Value, Date, Title, Thing, Event,\nAbstraction, and Other) and a \"selective\" version with only 5 classes (Person, Organization, Location, Value, and Date).\n\nIt's important to note that the original version of the HAREM dataset has 2 levels of NER details, namely \"Category\" and \"Sub-type\".\nThe dataset version processed here ONLY USE the \"Category\" level of the original dataset.\n\n[1] Souza, F\u00e1bio, Rodrigo Nogueira, and Roberto Lotufo. \"BERTimbau: Pretrained BERT Models for Brazilian Portuguese.\" Brazilian Conference on Intelligent Systems. Springer, Cham, 2020.", "downloads": 518, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"HAREM-871-07800\\\"\", \"tokens\": \"[\\\"Abra\\\\u00e7o\\\", \\\"P\\\\u00e1gina\\\", \\\"Principal\\\", \\\"ASSOC...\", \"ner_tags\": \"[3, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 3, 0, 0, 3...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nThe HAREM is a Portuguese language corpus commonly used for Named Entity Recognition tasks. It includes about 93k words, from 129 different texts,\nfrom several genres, and language varieties. The split of this dataset version follows the division made by [1], where 7% HAREM\ndocuments are the validation set and the miniHAREM corpus (with about 65k words) is the test set. 
There are two versions of the dataset set,\na version that has a total of 10 different named entity classes (Person, Organization, Location, Value, Date, Title, Thing, Event,\nAbstraction, and Other) and a \"selective\" version with only 5 classes (Person, Organization, Location, Value, and Date).\n\nIt's important to note that the original version of the HAREM dataset has 2 levels of NER details, namely \"Category\" and \"Sub-type\".\nThe dataset version processed here ONLY USE the \"Category\" level of the original dataset.\n\n[1] Souza, F\u00e1bio, Rodrigo Nogueira, and Roberto Lotufo. \"BERTimbau: Pretrained BERT Models for Brazilian Portuguese.\" Brazilian Conference on Intelligent Systems. Springer, Cham, 2020.\n", "dataset_name": "harem"}, "selective": {"config_name": "selective", "sample_row": "{\"id\": \"\\\"HAREM-871-07800\\\"\", \"tokens\": \"[\\\"Abra\\\\u00e7o\\\", \\\"P\\\\u00e1gina\\\", \\\"Principal\\\", \\\"ASSOC...\", \"ner_tags\": \"[3, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 3, 0, 0, 3...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nThe HAREM is a Portuguese language corpus commonly used for Named Entity Recognition tasks. It includes about 93k words, from 129 different texts,\nfrom several genres, and language varieties. The split of this dataset version follows the division made by [1], where 7% HAREM\ndocuments are the validation set and the miniHAREM corpus (with about 65k words) is the test set. 
There are two versions of the dataset set,\na version that has a total of 10 different named entity classes (Person, Organization, Location, Value, Date, Title, Thing, Event,\nAbstraction, and Other) and a \"selective\" version with only 5 classes (Person, Organization, Location, Value, and Date).\n\nIt's important to note that the original version of the HAREM dataset has 2 levels of NER details, namely \"Category\" and \"Sub-type\".\nThe dataset version processed here ONLY USE the \"Category\" level of the original dataset.\n\n[1] Souza, F\u00e1bio, Rodrigo Nogueira, and Roberto Lotufo. \"BERTimbau: Pretrained BERT Models for Brazilian Portuguese.\" Brazilian Conference on Intelligent Systems. Springer, Cham, 2020.\n", "dataset_name": "harem"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt"], "is_gated": false}, "has_part": {"dataset_name": "has_part", "description": "This dataset is a new knowledge-base (KB) of hasPart relationships, extracted from a large corpus of generic statements. Complementary to other resources available, it is the first which is all three of: accurate (90% precision), salient (covers relationships a person may mention), and has high coverage of common terms (approximated as within a 10 year old\u2019s vocabulary), as well as having several times more hasPart entries than in the popular ontologies ConceptNet and WordNet. 
In addition, it contains information about quantifiers, argument modifiers, and links the entities to appropriate concepts in Wikipedia and WordNet.", "downloads": 318, "configs": {"default": {"config_name": "default", "sample_row": "{\"arg1\": \"\\\"snowdrop\\\"\", \"arg2\": \"\\\"carpel\\\"\", \"score\": \"0.9990746974945068\", \"wikipedia_primary_page\": \"[\\\"Galanthus\\\"]\", \"synset\": \"[\\\"wn.carpel.n.01\\\"]\"}", "columns": ["arg1", "arg2", "score", "wikipedia_primary_page", "synset"], "columns_mapping": {"arg1": "arg1", "arg2": "arg2", "score": "score", "wikipedia_primary_page": "wikipedia_primary_page", "synset": "synset"}, "dataset_description": "This dataset is a new knowledge-base (KB) of hasPart relationships, extracted from a large corpus of generic statements. Complementary to other resources available, it is the first which is all three of: accurate (90% precision), salient (covers relationships a person may mention), and has high coverage of common terms (approximated as within a 10 year old\u2019s vocabulary), as well as having several times more hasPart entries than in the popular ontologies ConceptNet and WordNet. In addition, it contains information about quantifiers, argument modifiers, and links the entities to appropriate concepts in Wikipedia and WordNet.\n", "dataset_name": "has_part"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:extended|other-Generics-KB", "language:en", "Meronym-Prediction"], "is_gated": false}, "hate_speech18": {"dataset_name": "hate_speech18", "description": "These files contain text extracted from Stormfront, a white supremacist forum. A random set of\nforums posts have been sampled from several subforums and split into sentences. 
Those sentences\nhave been manually labelled as containing hate speech or not, according to certain annotation guidelines.", "downloads": 12316, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"As of March 13th , 2014 , the booklet had been do...\", \"user_id\": \"572066\", \"subforum_id\": \"1346\", \"num_contexts\": \"0\", \"label\": \"0\"}", "columns": ["text", "user_id", "subforum_id", "num_contexts", "label"], "columns_mapping": {"text": "text", "user_id": "user_id", "subforum_id": "subforum_id", "num_contexts": "num_contexts", "label": "label"}, "dataset_description": "These files contain text extracted from Stormfront, a white supremacist forum. A random set of\nforums posts have been sampled from several subforums and split into sentences. Those sentences\nhave been manually labelled as containing hate speech or not, according to certain annotation guidelines.\n", "dataset_name": "hate_speech18"}}, "tags": ["task_categories:text-classification", "task_ids:intent-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "hate_speech_filipino": {"dataset_name": "hate_speech_filipino", "description": " Contains 10k tweets (training set) that are labeled as hate speech or non-hate speech. Released with 4,232 validation and 4,232 testing samples. Collected during the 2016 Philippine Presidential Elections.", "downloads": 314, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"Inaasahan na ni Vice President Jejomar Binay na m...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": " Contains 10k tweets (training set) that are labeled as hate speech or non-hate speech. Released with 4,232 validation and 4,232 testing samples. 
Collected during the 2016 Philippine Presidential Elections.\n", "dataset_name": "hate_speech_filipino"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-analysis", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:extended|other-twitter-data-philippine-election", "language:tl"], "is_gated": false}, "hate_speech_offensive": {"dataset_name": "hate_speech_offensive", "description": "An annotated dataset for hate speech and offensive language detection on tweets.", "downloads": 8425, "configs": {"default": {"config_name": "default", "sample_row": "{\"count\": \"3\", \"hate_speech_count\": \"0\", \"offensive_language_count\": \"0\", \"neither_count\": \"3\", \"class\": \"2\", \"tweet\": \"\\\"!!! RT @mayasolovely: As a woman you shouldn't co...\"}", "columns": ["count", "hate_speech_count", "offensive_language_count", "neither_count", "class", "tweet"], "columns_mapping": {"count": "count", "hate_speech_count": "hate_speech_count", "offensive_language_count": "offensive_language_count", "neither_count": "neither_count", "class": "class", "tweet": "tweet"}, "dataset_description": "An annotated dataset for hate speech and offensive language detection on tweets.\n", "dataset_name": "hate_speech_offensive"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "hate-speech-detection"], "is_gated": false}, "hate_speech_pl": {"dataset_name": "hate_speech_pl", "description": "HateSpeech corpus in the current version contains over 2000 posts crawled from public Polish web. They represent various types and degrees of offensive language, expressed toward minorities (eg. ethnical, racial). 
The data were annotated manually.", "downloads": 314, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"1\", \"text_id\": \"121713\", \"annotator_id\": \"1\", \"minority_id\": \"72\", \"negative_emotions\": \"true\", \"call_to_action\": \"true\", \"source_of_knowledge\": \"2\", \"irony_sarcasm\": \"true\", \"topic\": \"18\", \"text\": \"\\\" Niemiec m\\\\u00f3wi c...\", \"rating\": \"0\"}", "columns": ["id", "text_id", "annotator_id", "minority_id", "negative_emotions", "call_to_action", "source_of_knowledge", "irony_sarcasm", "topic", "text", "rating"], "columns_mapping": {"id": "id", "text_id": "text_id", "annotator_id": "annotator_id", "minority_id": "minority_id", "negative_emotions": "negative_emotions", "call_to_action": "call_to_action", "source_of_knowledge": "source_of_knowledge", "irony_sarcasm": "irony_sarcasm", "topic": "topic", "text": "text", "rating": "rating"}, "dataset_description": "HateSpeech corpus in the current version contains over 2000 posts crawled from public Polish web. They represent various types and degrees of offensive language, expressed toward minorities (eg. ethnical, racial). The data were annotated manually.\n", "dataset_name": "hate_speech_pl"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "task_ids:sentiment-classification", "task_ids:sentiment-scoring", "task_ids:topic-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl"], "is_gated": false}, "hate_speech_portuguese": {"dataset_name": "hate_speech_portuguese", "description": "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 
'no-hate').", "downloads": 322, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"@__andrea__b \\\\nO cara vive em outro mundo\\\\nN\\\\u00e...\", \"label\": \"1\", \"hatespeech_G1\": \"\\\"1\\\"\", \"annotator_G1\": \"\\\"A\\\"\", \"hatespeech_G2\": \"\\\"1\\\"\", \"annotator_G2\": \"\\\"V\\\"\", \"hatespeech_G3\": \"\\\"0\\\"\", \"annotator_G3\": \"\\\"E\\\"\"}", "columns": ["text", "label", "hatespeech_G1", "annotator_G1", "hatespeech_G2", "annotator_G2", "hatespeech_G3", "annotator_G3"], "columns_mapping": {"text": "text", "label": "label", "hatespeech_G1": "hatespeech_G1", "annotator_G1": "annotator_G1", "hatespeech_G2": "hatespeech_G2", "annotator_G2": "annotator_G2", "hatespeech_G3": "hatespeech_G3", "annotator_G3": "annotator_G3"}, "dataset_description": "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate').\n", "dataset_name": "hate_speech_portuguese"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt", "hate-speech-detection"], "is_gated": false}, "hatexplain": {"dataset_name": "hatexplain", "description": "Hatexplain is the first benchmark hate speech dataset covering multiple aspects of the issue. 
Each post in the dataset is annotated from three different perspectives: the basic, commonly used 3-class classification (i.e., hate, offensive or normal), the target community (i.e., the community that has been the victim of hate speech/offensive speech in the post), and the rationales, i.e., the portions of the post on which their labelling decision (as hate, offensive or normal) is based.", "downloads": 1393, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"23107796_gab\\\"\", \"annotators.label\": \"[0, 2, 2]\", \"annotators.annotator_id\": \"[203, 204, 233]\", \"annotators.target\": \"[[\\\"Hindu\\\", \\\"Islam\\\"], [\\\"Hindu\\\", \\\"Islam\\\"], [\\\"Hindu\\\",...\", \"rationales\": \"[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...\", \"post_tokens\": \"[\\\"u\\\", \\\"really\\\", \\\"think\\\", \\\"i\\\", \\\"would\\\", \\\"not\\\", \\\"hav...\"}", "columns": ["id", "annotators_label", "annotators_annotator_id", "annotators_target", "rationales", "post_tokens"], "columns_mapping": {"id": "id", "annotators.label": "annotators_label", "annotators.annotator_id": "annotators_annotator_id", "annotators.target": "annotators_target", "rationales": "rationales", "post_tokens": "post_tokens"}, "dataset_description": "Hatexplain is the first benchmark hate speech dataset covering multiple aspects of the issue. 
Each post in the dataset is annotated from three different perspectives: the basic, commonly used 3-class classification (i.e., hate, offensive or normal), the target community (i.e., the community that has been the victim of hate speech/offensive speech in the post), and the rationales, i.e., the portions of the post on which their labelling decision (as hate, offensive or normal) is based.\n", "dataset_name": "hatexplain"}}, "tags": ["task_categories:text-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "hate-speech-detection"], "is_gated": false}, "hausa_voa_ner": {"dataset_name": "hausa_voa_ner", "description": "The Hausa VOA NER dataset is a labeled dataset for named entity recognition in Hausa. The texts were obtained from\nHausa Voice of America News articles https://www.voahausa.com/ . We concentrate on\nfour types of named entities: persons [PER], locations [LOC], organizations [ORG], and dates & time [DATE].\n\nThe Hausa VOA NER data files contain 2 columns separated by a tab ('\\t'). Each word has been put on a separate line and\nthere is an empty line after each sentences i.e the CoNLL format. The first item on each line is a word, the second\nis the named entity tag. The named entity tags have the format I-TYPE which means that the word is inside a phrase\nof type TYPE. For every multi-word expression like 'New York', the first word gets a tag B-TYPE and the subsequent words\nhave tags I-TYPE, a word with tag O is not part of a phrase. 
The dataset is in the BIO tagging scheme.\n\nFor more details, see https://www.aclweb.org/anthology/2020.emnlp-main.204/", "downloads": 288, "configs": {"hausa_voa_ner": {"config_name": "hausa_voa_ner", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ya\\\", \\\"Kammala\\\", \\\"Ziyarar\\\", \\\"Yakin\\\", \\\"Neman\\\", \\\"Za...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 5]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The Hausa VOA NER dataset is a labeled dataset for named entity recognition in Hausa. The texts were obtained from\nHausa Voice of America News articles https://www.voahausa.com/ . We concentrate on\nfour types of named entities: persons [PER], locations [LOC], organizations [ORG], and dates & time [DATE].\n\nThe Hausa VOA NER data files contain 2 columns separated by a tab ('\t'). Each word has been put on a separate line and\nthere is an empty line after each sentences i.e the CoNLL format. The first item on each line is a word, the second\nis the named entity tag. The named entity tags have the format I-TYPE which means that the word is inside a phrase\nof type TYPE. For every multi-word expression like 'New York', the first word gets a tag B-TYPE and the subsequent words\nhave tags I-TYPE, a word with tag O is not part of a phrase. 
The dataset is in the BIO tagging scheme.\n\nFor more details, see https://www.aclweb.org/anthology/2020.emnlp-main.204/\n", "dataset_name": "hausa_voa_ner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ha"], "is_gated": false}, "hausa_voa_topics": {"dataset_name": "hausa_voa_topics", "description": "A collection of news article headlines in Hausa from VOA Hausa.\nEach headline is labeled with one of the following classes: Nigeria,\nAfrica, World, Health or Politics.\n\nThe dataset was presented in the paper:\nHedderich, Adelani, Zhu, Alabi, Markus, Klakow: Transfer Learning and\nDistant Supervision for Multilingual Transformer Models: A Study on\nAfrican Languages (EMNLP 2020).", "downloads": 311, "configs": {"default": {"config_name": "default", "sample_row": "{\"news_title\": \"\\\"Atiku Abubakar Ya Kada Kuri'arsa A Jimeta A Jihar...\", \"label\": \"3\"}", "columns": ["news_title", "label"], "columns_mapping": {"news_title": "news_title", "label": "label"}, "dataset_description": "A collection of news article headlines in Hausa from VOA Hausa.\nEach headline is labeled with one of the following classes: Nigeria,\nAfrica, World, Health or Politics.\n\nThe dataset was presented in the paper:\nHedderich, Adelani, Zhu, Alabi, Markus, Klakow: Transfer Learning and\nDistant Supervision for Multilingual Transformer Models: A Study on\nAfrican Languages (EMNLP 2020).\n", "dataset_name": "hausa_voa_topics"}}, "tags": ["task_categories:text-classification", "task_ids:topic-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ha"], "is_gated": false}, "head_qa": {"dataset_name": "head_qa", "description": "HEAD-QA is a multi-choice HEAlthcare Dataset. 
The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\n\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.", "downloads": 1148, "configs": {"es": {"config_name": "es", "sample_row": "{\"name\": \"\\\"Cuaderno_2013_1_B\\\"\", \"year\": \"\\\"2013\\\"\", \"category\": \"\\\"biology\\\"\", \"qid\": \"1\", \"qtext\": \"\\\"Los potenciales postsin\\\\u00e1pticos excitadores:\\\"...\", \"ra\": \"3\", \"image\": \"null\", \"answers\": \"[{\\\"aid\\\": 1, \\\"atext\\\": \\\"Son de tipo todo o nada.\\\"}, ...\"}", "columns": ["dataset_name", "year", "category", "qid", "qtext", "ra", "image", "answers"], "columns_mapping": {"dataset_name": "dataset_name", "year": "year", "category": "category", "qid": "qid", "qtext": "qtext", "ra": "ra", "image": "image", "answers": "answers"}, "dataset_description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. 
They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\n\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "dataset_name": "head_qa"}, "en": {"config_name": "en", "sample_row": "{\"name\": \"\\\"Cuaderno_2013_1_B\\\"\", \"year\": \"\\\"2013\\\"\", \"category\": \"\\\"biology\\\"\", \"qid\": \"1\", \"qtext\": \"\\\"The excitatory postsynaptic potentials:\\\"\", \"ra\": \"3\", \"image\": \"null\", \"answers\": \"[{\\\"aid\\\": 1, \\\"atext\\\": \\\"They are all or nothing.\\\"}, ...\"}", "columns": ["dataset_name", "year", "category", "qid", "qtext", "ra", "image", "answers"], "columns_mapping": {"dataset_name": "dataset_name", "year": "year", "category": "category", "qid": "qid", "qtext": "qtext", "ra": "ra", "image": "image", "answers": "answers"}, "dataset_description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\n\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "dataset_name": "head_qa"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en", "language:es"], "is_gated": false}, "hebrew_projectbenyehuda": {"dataset_name": "hebrew_projectbenyehuda", "description": "This repository contains a dump of thousands of public domain works in Hebrew, from Project Ben-Yehuda, in plaintext UTF-8 files, with and without diacritics (nikkud). 
The metadata (pseudocatalogue.csv) file is a list of titles, authors, genres, and file paths, to help you process the dump.\nAll these works are in the public domain, so you are free to make any use of them, and do not need to ask for permission.\nThere are 10078 files, 3181136 lines", "downloads": 290, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"10\", \"url\": \"\\\"https://raw.githubusercontent.com/projectbenyehud...\", \"title\": \"\\\"\\\\u05d7\\\\u05e6\\\\u05d9-\\\\u05e0\\\\u05d7\\\\u05de\\\\u05d4\\\"\", \"authors\": \"\\\"\\\\u05d0\\\\u05d7\\\\u05d3 \\\\u05d4\\\\u05e2\\\\u05dd\\\"\", \"translators\": \"\\\"\\\"\", \"original_language\": \"\\\"380425\\\"\", \"genre\": \"\\\"\\\"\", \"source_edition\": \"\\\"\\\"\", \"text\": \"\\\"\\\\n\\\\n\\\\n\\\\t\\\\n\\\\t\\\\u05d7\\\\u05e6\\\\u05d9-\\\\u05e0\\\\u05d7\\\\u05de...\"}", "columns": ["id", "url", "title", "authors", "translators", "original_language", "genre", "source_edition", "text"], "columns_mapping": {"id": "id", "url": "url", "title": "title", "authors": "authors", "translators": "translators", "original_language": "original_language", "genre": "genre", "source_edition": "source_edition", "text": "text"}, "dataset_description": "This repository contains a dump of thousands of public domain works in Hebrew, from Project Ben-Yehuda, in plaintext UTF-8 files, with and without diacritics (nikkud). 
The metadata (pseudocatalogue.csv) file is a list of titles, authors, genres, and file paths, to help you process the dump.\nAll these works are in the public domain, so you are free to make any use of them, and do not need to ask for permission.\nThere are 10078 files, 3181136 lines\n", "dataset_name": "hebrew_projectbenyehuda"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:he"], "is_gated": false}, "hebrew_sentiment": {"dataset_name": "hebrew_sentiment", "description": "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel\u2019s\npresident, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder,\n2013) to scrape all the comments to all of the president\u2019s posts in the period of June \u2013 August 2014,\nthe first three months of Rivlin\u2019s presidency.2 While the president\u2019s posts aimed at reconciling tensions\nand called for tolerance and empathy, the sentiment expressed in the comments to the president\u2019s posts\nwas polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his\npolicy. Of the 12,804 comments, 370 are neutral; 8,512 are positive, 3,922 negative.\n\nData Annotation: A trained researcher examined each comment and determined its sentiment value,\nwhere comments with an overall positive sentiment were assigned the value 1, comments with an overall\nnegative sentiment were assigned the value -1, and comments that are off-topic to the post\u2019s content\nwere assigned the value 0. We validated the coding scheme by asking a second trained researcher to\ncode the same data. 
There was substantial agreement between raters (N of agreements: 10623, N of\ndisagreements: 2105, Coehn\u2019s Kappa = 0.697, p = 0).", "downloads": 500, "configs": {"token": {"config_name": "token", "sample_row": "{\"text\": \"\\\"\\\\u05de\\\\u05de\\\\u05e9 \\\\u05db\\\\u05d5\\\\u05d0\\\\u05d1 ........\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel\u2019s\npresident, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder,\n2013) to scrape all the comments to all of the president\u2019s posts in the period of June \u2013 August 2014,\nthe first three months of Rivlin\u2019s presidency.2 While the president\u2019s posts aimed at reconciling tensions\nand called for tolerance and empathy, the sentiment expressed in the comments to the president\u2019s posts\nwas polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his\npolicy. Of the 12,804 comments, 370 are neutral; 8,512 are positive, 3,922 negative.\n\nData Annotation: A trained researcher examined each comment and determined its sentiment value,\nwhere comments with an overall positive sentiment were assigned the value 1, comments with an overall\nnegative sentiment were assigned the value -1, and comments that are off-topic to the post\u2019s content\nwere assigned the value 0. We validated the coding scheme by asking a second trained researcher to\ncode the same data. 
There was substantial agreement between raters (N of agreements: 10623, N of\ndisagreements: 2105, Coehn\u2019s Kappa = 0.697, p = 0).\n", "dataset_name": "hebrew_sentiment"}, "morph": {"config_name": "morph", "sample_row": "{\"text\": \"\\\"\\\\u05de\\\\u05de\\\\u05e9 \\\\u05db\\\\u05d5\\\\u05d0\\\\u05d1 ........\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel\u2019s\npresident, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder,\n2013) to scrape all the comments to all of the president\u2019s posts in the period of June \u2013 August 2014,\nthe first three months of Rivlin\u2019s presidency.2 While the president\u2019s posts aimed at reconciling tensions\nand called for tolerance and empathy, the sentiment expressed in the comments to the president\u2019s posts\nwas polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his\npolicy. Of the 12,804 comments, 370 are neutral; 8,512 are positive, 3,922 negative.\n\nData Annotation: A trained researcher examined each comment and determined its sentiment value,\nwhere comments with an overall positive sentiment were assigned the value 1, comments with an overall\nnegative sentiment were assigned the value -1, and comments that are off-topic to the post\u2019s content\nwere assigned the value 0. We validated the coding scheme by asking a second trained researcher to\ncode the same data. 
There was substantial agreement between raters (N of agreements: 10623, N of\ndisagreements: 2105, Coehn\u2019s Kappa = 0.697, p = 0).\n", "dataset_name": "hebrew_sentiment"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:he"], "is_gated": false}, "Rowan/hellaswag": {"dataset_name": "Rowan/hellaswag", "description": "HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.", "downloads": 2165, "configs": {"default": {"config_name": "default", "sample_row": "{\"ind\": \"4\", \"activity_label\": \"\\\"Removing ice from car\\\"\", \"ctx_a\": \"\\\"Then, the man writes over the snow covering the w...\", \"ctx_b\": \"\\\"then\\\"\", \"ctx\": \"\\\"Then, the man writes over the snow covering the w...\", \"endings\": \"[\\\", the man adds wax to the windshield and cuts it...\", \"source_id\": \"\\\"activitynet~v_-1IBHYS3L-Y\\\"\", \"split\": \"\\\"train\\\"\", \"split_type\": \"\\\"indomain\\\"\", \"label\": \"\\\"3\\\"\"}", "columns": ["ind", "activity_label", "ctx_a", "ctx_b", "ctx", "endings", "source_id", "split", "split_type", "label"], "columns_mapping": {"ind": "ind", "activity_label": "activity_label", "ctx_a": "ctx_a", "ctx_b": "ctx_b", "ctx": "ctx", "endings": "endings", "source_id": "source_id", "split": "split", "split_type": "split_type", "label": "label"}, "dataset_description": "\nHellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.\n", "dataset_name": "Rowan/hellaswag"}}, "tags": ["language:en"], "is_gated": false}, "hind_encorp": {"dataset_name": "hind_encorp", "description": "HindEnCorp parallel texts (sentence-aligned) come from the following sources:\nTides, which contains 50K sentence pairs taken mainly from news articles. 
This dataset was originally col- lected for the DARPA-TIDES surprise-language con- test in 2002, later refined at IIIT Hyderabad and provided for the NLP Tools Contest at ICON 2008 (Venkatapathy, 2008).\n\nCommentaries by Daniel Pipes contain 322 articles in English written by a journalist Daniel Pipes and translated into Hindi.\n\nEMILLE. This corpus (Baker et al., 2002) consists of three components: monolingual, parallel and annotated corpora. There are fourteen monolingual sub- corpora, including both written and (for some lan- guages) spoken data for fourteen South Asian lan- guages. The EMILLE monolingual corpora contain in total 92,799,000 words (including 2,627,000 words of transcribed spoken data for Bengali, Gujarati, Hindi, Punjabi and Urdu). The parallel corpus consists of 200,000 words of text in English and its accompanying translations into Hindi and other languages.\n\nSmaller datasets as collected by Bojar et al. (2010) include the corpus used at ACL 2005 (a subcorpus of EMILLE), a corpus of named entities from Wikipedia (crawled in 2009), and Agriculture domain parallel corpus.\n\ufffc\nFor the current release, we are extending the parallel corpus using these sources:\nIntercorp (\u010cerm\u00e1k and Rosen,2012) is a large multilingual parallel corpus of 32 languages including Hindi. The central language used for alignment is Czech. Intercorp\u2019s core texts amount to 202 million words. These core texts are most suitable for us because their sentence alignment is manually checked and therefore very reliable. They cover predominately short sto- ries and novels. There are seven Hindi texts in Inter- corp. Unfortunately, only for three of them the English translation is available; the other four are aligned only with Czech texts. The Hindi subcorpus of Intercorp contains 118,000 words in Hindi.\n\nTED talks 3 held in various languages, primarily English, are equipped with transcripts and these are translated into 102 languages. 
There are 179 talks for which Hindi translation is available.\n\nThe Indic multi-parallel corpus (Birch et al., 2011; Post et al., 2012) is a corpus of texts from Wikipedia translated from the respective Indian language into English by non-expert translators hired over Mechanical Turk. The quality is thus somewhat mixed in many respects starting from typesetting and punctuation over capi- talization, spelling, word choice to sentence structure. A little bit of control could be in principle obtained from the fact that every input sentence was translated 4 times. We used the 2012 release of the corpus.\n\nLaunchpad.net is a software collaboration platform that hosts many open-source projects and facilitates also collaborative localization of the tools. We downloaded all revisions of all the hosted projects and extracted the localization (.po) files.\n\nOther smaller datasets. This time, we added Wikipedia entities as crawled in 2013 (including any morphological variants of the named entitity that appears on the Hindi variant of the Wikipedia page) and words, word examples and quotes from the Shabdkosh online dictionary.", "downloads": 301, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"source\": \"\\\"wikiner2013inflected\\\"\", \"alignment_type\": \"\\\"1-1\\\"\", \"alignment_quality\": \"\\\"1.000\\\"\", \"translation.en\": \"\\\"Sharaabi\\\"\", \"translation.hi\": \"\\\"\\\\u0936\\\\u0930\\\\u093e\\\\u092c\\\\u0940\\\"\"}", "columns": ["id", "source", "alignment_type", "alignment_quality", "translation_en", "translation_hi"], "columns_mapping": {"id": "id", "source": "source", "alignment_type": "alignment_type", "alignment_quality": "alignment_quality", "translation.en": "translation_en", "translation.hi": "translation_hi"}, "dataset_description": "HindEnCorp parallel texts (sentence-aligned) come from the following sources:\nTides, which contains 50K sentence pairs taken mainly from news articles. 
This dataset was originally col- lected for the DARPA-TIDES surprise-language con- test in 2002, later refined at IIIT Hyderabad and provided for the NLP Tools Contest at ICON 2008 (Venkatapathy, 2008).\n\nCommentaries by Daniel Pipes contain 322 articles in English written by a journalist Daniel Pipes and translated into Hindi.\n\nEMILLE. This corpus (Baker et al., 2002) consists of three components: monolingual, parallel and annotated corpora. There are fourteen monolingual sub- corpora, including both written and (for some lan- guages) spoken data for fourteen South Asian lan- guages. The EMILLE monolingual corpora contain in total 92,799,000 words (including 2,627,000 words of transcribed spoken data for Bengali, Gujarati, Hindi, Punjabi and Urdu). The parallel corpus consists of 200,000 words of text in English and its accompanying translations into Hindi and other languages.\n\nSmaller datasets as collected by Bojar et al. (2010) include the corpus used at ACL 2005 (a subcorpus of EMILLE), a corpus of named entities from Wikipedia (crawled in 2009), and Agriculture domain parallel corpus.\n\ufffc\nFor the current release, we are extending the parallel corpus using these sources:\nIntercorp (\u010cerm\u00e1k and Rosen,2012) is a large multilingual parallel corpus of 32 languages including Hindi. The central language used for alignment is Czech. Intercorp\u2019s core texts amount to 202 million words. These core texts are most suitable for us because their sentence alignment is manually checked and therefore very reliable. They cover predominately short sto- ries and novels. There are seven Hindi texts in Inter- corp. Unfortunately, only for three of them the English translation is available; the other four are aligned only with Czech texts. The Hindi subcorpus of Intercorp contains 118,000 words in Hindi.\n\nTED talks 3 held in various languages, primarily English, are equipped with transcripts and these are translated into 102 languages. 
There are 179 talks for which Hindi translation is available.\n\nThe Indic multi-parallel corpus (Birch et al., 2011; Post et al., 2012) is a corpus of texts from Wikipedia translated from the respective Indian language into English by non-expert translators hired over Mechanical Turk. The quality is thus somewhat mixed in many respects starting from typesetting and punctuation over capi- talization, spelling, word choice to sentence structure. A little bit of control could be in principle obtained from the fact that every input sentence was translated 4 times. We used the 2012 release of the corpus.\n\nLaunchpad.net is a software collaboration platform that hosts many open-source projects and facilitates also collaborative localization of the tools. We downloaded all revisions of all the hosted projects and extracted the localization (.po) files.\n\nOther smaller datasets. This time, we added Wikipedia entities as crawled in 2013 (including any morphological variants of the named entitity that appears on the Hindi variant of the Wikipedia page) and words, word examples and quotes from the Shabdkosh online dictionary.\n", "dataset_name": "hind_encorp"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:en", "language:hi"], "is_gated": false}, "hindi_discourse": {"dataset_name": "hindi_discourse", "description": "The Hindi Discourse Analysis dataset is a corpus for analyzing discourse modes present in its sentences.\nIt contains sentences from stories written by 11 famous authors from the 20th Century.\n4-5 stories by each author have been selected which were available in the public domain resulting\nin a collection of 53 stories. 
Most of these short stories were originally written in Hindi\nbut some of them were written in other Indian languages and later translated to Hindi.", "downloads": 297, "configs": {"default": {"config_name": "default", "sample_row": "{\"Story_no\": \"0\", \"Sentence\": \"\\\"\\\\u091a\\\\u0947\\\\u0939\\\\u0930\\\\u0947 \\\\u092a\\\\u0930 \\\\u090...\", \"Discourse Mode\": \"1\"}", "columns": ["Story_no", "Sentence", "Discourse Mode"], "columns_mapping": {"Story_no": "Story_no", "Sentence": "Sentence", "Discourse Mode": "Discourse Mode"}, "dataset_description": "The Hindi Discourse Analysis dataset is a corpus for analyzing discourse modes present in its sentences.\nIt contains sentences from stories written by 11 famous authors from the 20th Century.\n4-5 stories by each author have been selected which were available in the public domain resulting\nin a collection of 53 stories. Most of these short stories were originally written in Hindi\nbut some of them were written in other Indian languages and later translated to Hindi.\n", "dataset_name": "hindi_discourse"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "annotations_creators:other", "multilinguality:monolingual", "source_datasets:original", "language:hi", "discourse-analysis"], "is_gated": false}, "hkcancor": {"dataset_name": "hkcancor", "description": "The Hong Kong Cantonese Corpus (HKCanCor) comprise transcribed conversations\nrecorded between March 1997 and August 1998. 
It contains recordings of\nspontaneous speech (51 texts) and radio programmes (42 texts),\nwhich involve 2 to 4 speakers, with 1 text of monologue.\n\nIn total, the corpus contains around 230,000 Chinese words.\nThe text is word-segmented, annotated with part-of-speech (POS) tags and\nromanised Cantonese pronunciation.\n\nRomanisation scheme - Linguistic Society of Hong Kong (LSHK)\nPOS scheme - Peita-Fujitsu-Renmin Ribao (PRF) corpus (Duan et al., 2000),\n with extended tags for Cantonese-specific phenomena added by\n Luke and Wang (see original paper for details).", "downloads": 316, "configs": {"default": {"config_name": "default", "sample_row": "{\"conversation_id\": \"\\\"TN001-DR300497-WAI3C\\\"\", \"speaker\": \"\\\"A\\\"\", \"turn_number\": \"0\", \"tokens\": \"[\\\"\\\\u5582\\\", \\\"\\\\u9072\\\", \\\"\\\\u5572\\\", \\\"\\\\u53bb\\\", \\\"\\\\u5514\\\",...\", \"transcriptions\": \"[\\\"wai3\\\", \\\"ci4\\\", \\\"di1\\\", \\\"heoi3\\\", \\\"m4\\\", \\\"heoi3\\\", \\\"le...\", \"pos_tags_prf\": \"[24, 9, 72, 75, 21, 75, 80, 116, 83, 64, 50, 76, 9...\", \"pos_tags_ud\": \"[15, 0, 6, 11, 10, 11, 8, 6, 2, 14, 8, 11, 0, 8, 6...\"}", "columns": ["conversation_id", "speaker", "turn_number", "tokens", "transcriptions", "pos_tags_prf", "pos_tags_ud"], "columns_mapping": {"conversation_id": "conversation_id", "speaker": "speaker", "turn_number": "turn_number", "tokens": "tokens", "transcriptions": "transcriptions", "pos_tags_prf": "pos_tags_prf", "pos_tags_ud": "pos_tags_ud"}, "dataset_description": "The Hong Kong Cantonese Corpus (HKCanCor) comprise transcribed conversations\nrecorded between March 1997 and August 1998. 
It contains recordings of\nspontaneous speech (51 texts) and radio programmes (42 texts),\nwhich involve 2 to 4 speakers, with 1 text of monologue.\n\nIn total, the corpus contains around 230,000 Chinese words.\nThe text is word-segmented, annotated with part-of-speech (POS) tags and\nromanised Cantonese pronunciation.\n\nRomanisation scheme - Linguistic Society of Hong Kong (LSHK)\nPOS scheme - Peita-Fujitsu-Renmin Ribao (PRF) corpus (Duan et al., 2000),\n with extended tags for Cantonese-specific phenomena added by\n Luke and Wang (see original paper for details).\n", "dataset_name": "hkcancor"}}, "tags": ["task_categories:translation", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:yue"], "is_gated": false}, "hlgd": {"dataset_name": "hlgd", "description": "HLGD is a binary classification dataset consisting of 20,056 labeled news headlines pairs indicating\nwhether the two headlines describe the same underlying world event or not.", "downloads": 365, "configs": {"default": {"config_name": "default", "sample_row": "{\"timeline_id\": \"9\", \"headline_a\": \"\\\"Seven bodies found after dam burst at Brazil mine...\", \"headline_b\": \"\\\"Fears rise for 300 missing in Brazil dam disaster...\", \"date_a\": \"\\\"2019-01-25\\\"\", \"date_b\": \"\\\"2019-01-26\\\"\", \"url_a\": \"\\\"https://www.reuters.com/article/us-brazil-vale-di...\", \"url_b\": \"\\\"https://timesofindia.indiatimes.com/world/rest-of...\", \"label\": \"0\"}", "columns": ["timeline_id", "headline_a", "headline_b", "date_a", "date_b", "url_a", "url_b", "label"], "columns_mapping": {"timeline_id": "timeline_id", "headline_a": "headline_a", "headline_b": "headline_b", "date_a": "date_a", "date_b": "date_b", "url_a": "url_a", "url_b": "url_b", "label": "label"}, "dataset_description": "HLGD is a binary classification dataset consisting of 20,056 
labeled news headlines pairs indicating\nwhether the two headlines describe the same underlying world event or not.\n", "dataset_name": "hlgd"}}, "tags": ["task_categories:text-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "headline-grouping"], "is_gated": false}, "hover": {"dataset_name": "hover", "description": "HoVer is an open-domain, many-hop fact extraction and claim verification dataset built upon the Wikipedia corpus. The original 2-hop claims are adapted from question-answer pairs from HotpotQA. It is collected by a team of NLP researchers at UNC Chapel Hill and Verisk Analytics.", "downloads": 303, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"uid\": \"\\\"330ca632-e83f-4011-b11b-0d0158145036\\\"\", \"claim\": \"\\\"Skagen Painter Peder Severin Kr\\\\u00f8yer favored ...\", \"supporting_facts\": \"[{\\\"key\\\": \\\"Kristian Zahrtmann\\\", \\\"value\\\": 0}, {\\\"key\\\"...\", \"label\": \"1\", \"num_hops\": \"3\", \"hpqa_id\": \"\\\"5ab7a86d5542995dae37e986\\\"\"}", "columns": ["id", "uid", "claim", "supporting_facts", "label", "num_hops", "hpqa_id"], "columns_mapping": {"id": "id", "uid": "uid", "claim": "claim", "supporting_facts": "supporting_facts", "label": "label", "num_hops": "num_hops", "hpqa_id": "hpqa_id"}, "dataset_description": "HoVer is an open-domain, many-hop fact extraction and claim verification dataset built upon the Wikipedia corpus. The original 2-hop claims are adapted from question-answer pairs from HotpotQA. 
It is collected by a team of NLP researchers at UNC Chapel Hill and Verisk Analytics.\n", "dataset_name": "hover"}}, "tags": ["task_categories:text-retrieval", "task_ids:fact-checking-retrieval", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "hrenwac_para": {"dataset_name": "hrenwac_para", "description": "The hrenWaC corpus version 2.0 consists of parallel Croatian-English texts crawled from the .hr top-level domain for Croatia.\nThe corpus was built with Spidextor (https://github.com/abumatran/spidextor), a tool that glues together the output of SpiderLing used for crawling and Bitextor used for bitext extraction. The accuracy of the extracted bitext on the segment level is around 80% and on the word level around 84%.", "downloads": 288, "configs": {"hrenWaC": {"config_name": "hrenWaC", "sample_row": "{\"translation.en\": \"\\\"There is probably no person in the world that the...\", \"translation.hr\": \"\\\"Vjerojatno ne postoji osoba na svijetu koja na vi...\"}", "columns": ["translation_en", "translation_hr"], "columns_mapping": {"translation.en": "translation_en", "translation.hr": "translation_hr"}, "dataset_description": "\nThe hrenWaC corpus version 2.0 consists of parallel Croatian-English texts crawled from the .hr top-level domain for Croatia.\nThe corpus was built with Spidextor (https://github.com/abumatran/spidextor), a tool that glues together the output of SpiderLing used for crawling and Bitextor used for bitext extraction. 
The accuracy of the extracted bitext on the segment level is around 80% and on the word level around 84%.\n", "dataset_name": "hrenwac_para"}}, "tags": ["task_categories:translation", "annotations_creators:no-annotation", "multilinguality:translation", "source_datasets:original", "language:en", "language:hr"], "is_gated": false}, "humicroedit": {"dataset_name": "humicroedit", "description": "This new dataset is designed to assess the funniness of edited news headlines.", "downloads": 595, "configs": {"subtask-1": {"config_name": "subtask-1", "sample_row": "{\"id\": \"\\\"14530\\\"\", \"original\": \"\\\"France is \\\\u2018 hunting down its citizens who jo...\", \"edit\": \"\\\"twins\\\"\", \"grades\": \"\\\"10000\\\"\", \"meanGrade\": \"0.2\"}", "columns": ["id", "original", "edit", "grades", "meanGrade"], "columns_mapping": {"id": "id", "original": "original", "edit": "edit", "grades": "grades", "meanGrade": "meanGrade"}, "dataset_description": "This new dataset is designed to assess the funniness of edited news headlines.\n", "dataset_name": "humicroedit"}, "subtask-2": {"config_name": "subtask-2", "sample_row": "{\"id\": \"\\\"10920-9866\\\"\", \"original1\": \"\\\"\\\\\\\" Gene Cernan , Last on the Moon , ...\", \"edit1\": \"\\\"Dancer\\\"\", \"grades1\": \"\\\"01113\\\"\", \"meanGrade1\": \"1.2\", \"original2\": \"\\\"\\\\\\\" Gene Cernan , Last Astronaut on the Moon , \\\"\", \"paraphrased_question\": \"\\\"What is Delta Air Line's periodical literature mo...\"}", "columns": ["NNQT_question", "uid", "subgraph", "template_index", "question", "sparql_wikidata", "sparql_dbpedia18", "template", "paraphrased_question"], "columns_mapping": {"NNQT_question": "NNQT_question", "uid": "uid", "subgraph": "subgraph", "template_index": "template_index", "question": "question", "sparql_wikidata": "sparql_wikidata", "sparql_dbpedia18": "sparql_dbpedia18", "template": "template", "paraphrased_question": "paraphrased_question"}, "dataset_description": "LC-QuAD 2.0 is a Large 
Question Answering dataset with 30,000 pairs of question and its corresponding SPARQL query. The target knowledge base is Wikidata and DBpedia, specifically the 2018 version. Please see our paper for details about the dataset creation process and framework.\n", "dataset_name": "lc_quad"}}, "tags": ["task_categories:question-answering", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "knowledge-base-qa"], "is_gated": false}, "lener_br": {"dataset_name": "lener_br", "description": "LeNER-Br is a Portuguese language dataset for named entity recognition\napplied to legal documents. LeNER-Br consists entirely of manually annotated\nlegislation and legal cases texts and contains tags for persons, locations,\ntime entities, organizations, legislation and legal cases.\nTo compose the dataset, 66 legal documents from several Brazilian Courts were\ncollected. Courts of superior and state levels were considered, such as Supremo\nTribunal Federal, Superior Tribunal de Justi\u00e7a, Tribunal de Justi\u00e7a de Minas\nGerais and Tribunal de Contas da Uni\u00e3o. In addition, four legislation documents\nwere collected, such as \"Lei Maria da Penha\", giving a total of 70 documents", "downloads": 383, "configs": {"lener_br": {"config_name": "lener_br", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"EMENTA\\\", \\\":\\\", \\\"APELA\\\\u00c7\\\\u00c3O\\\", \\\"C\\\\u00cdVEL\\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nLeNER-Br is a Portuguese language dataset for named entity recognition\napplied to legal documents. 
LeNER-Br consists entirely of manually annotated\nlegislation and legal cases texts and contains tags for persons, locations,\ntime entities, organizations, legislation and legal cases.\nTo compose the dataset, 66 legal documents from several Brazilian Courts were\ncollected. Courts of superior and state levels were considered, such as Supremo\nTribunal Federal, Superior Tribunal de Justi\u00e7a, Tribunal de Justi\u00e7a de Minas\nGerais and Tribunal de Contas da Uni\u00e3o. In addition, four legislation documents\nwere collected, such as \"Lei Maria da Penha\", giving a total of 70 documents\n", "dataset_name": "lener_br"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt", "legal"], "is_gated": false}, "liar": {"dataset_name": "liar", "description": "LIAR is a dataset for fake news detection with 12.8K human labeled short statements from politifact.com's API, and each statement is evaluated by a politifact.com editor for its truthfulness. The distribution of labels in the LIAR dataset is relatively well-balanced: except for 1,050 pants-fire cases, the instances for all other labels range from 2,063 to 2,638. 
In each case, the labeler provides a lengthy analysis report to ground each judgment.", "downloads": 1140, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"2635.json\\\"\", \"label\": \"0\", \"statement\": \"\\\"Says the Annies List political group supports thi...\", \"subject\": \"\\\"abortion\\\"\", \"speaker\": \"\\\"dwayne-bohac\\\"\", \"job_title\": \"\\\"State representative\\\"\", \"state_info\": \"\\\"Texas\\\"\", \"party_affiliation\": \"\\\"republican\\\"\", \"barely_true_counts\": \"0.0\", \"false_counts\": \"1.0\", \"half_true_counts\": \"0.0\", \"mostly_true_counts\": \"0.0\", \"pants_on_fire_counts\": \"0.0\", \"context\": \"\\\"a mailer\\\"\"}", "columns": ["id", "label", "statement", "subject", "speaker", "job_title", "state_info", "party_affiliation", "barely_true_counts", "false_counts", "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"], "columns_mapping": {"id": "id", "label": "label", "statement": "statement", "subject": "subject", "speaker": "speaker", "job_title": "job_title", "state_info": "state_info", "party_affiliation": "party_affiliation", "barely_true_counts": "barely_true_counts", "false_counts": "false_counts", "half_true_counts": "half_true_counts", "mostly_true_counts": "mostly_true_counts", "pants_on_fire_counts": "pants_on_fire_counts", "context": "context"}, "dataset_description": "LIAR is a dataset for fake news detection with 12.8K human labeled short statements from politifact.com's API, and each statement is evaluated by a politifact.com editor for its truthfulness. The distribution of labels in the LIAR dataset is relatively well-balanced: except for 1,050 pants-fire cases, the instances for all other labels range from 2,063 to 2,638. 
In each case, the labeler provides a lengthy analysis report to ground each judgment.\n", "dataset_name": "liar"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "fake-news-detection"], "is_gated": false}, "librispeech_lm": {"dataset_name": "librispeech_lm", "description": "Language modeling resources to be used in conjunction with the LibriSpeech ASR corpus.", "downloads": 306, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"A\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "Language modeling resources to be used in conjunction with the LibriSpeech ASR corpus.\n", "dataset_name": "librispeech_lm"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "limit": {"dataset_name": "limit", "description": "Motion recognition is one of the basic cognitive capabilities of many life forms, yet identifying motion of physical entities in natural language have not been explored extensively and empirically. 
Literal-Motion-in-Text (LiMiT) dataset, is a large human-annotated collection of English text sentences describing physical occurrence of motion, with annotated physical entities in motion.", "downloads": 392, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"sentence\": \"\\\" A little boy holding a yellow ball walks by.\\\"\", \"motion\": \"\\\"yes\\\"\", \"motion_entities\": \"[{\\\"entity\\\": \\\"little boy\\\", \\\"start_index\\\": 2}, {\\\"ent...\"}", "columns": ["id", "sentence", "motion", "motion_entities"], "columns_mapping": {"id": "id", "sentence": "sentence", "motion": "motion", "motion_entities": "motion_entities"}, "dataset_description": "Motion recognition is one of the basic cognitive capabilities of many life forms, yet identifying motion of physical entities in natural language have not been explored extensively and empirically. Literal-Motion-in-Text (LiMiT) dataset, is a large human-annotated collection of English text sentences describing physical occurrence of motion, with annotated physical entities in motion.\n", "dataset_name": "limit"}}, "tags": ["task_categories:token-classification", "task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:named-entity-recognition", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|net-activities-captions", "source_datasets:original", "language:en"], "is_gated": false}, "linnaeus": {"dataset_name": "linnaeus", "description": "A novel corpus of full-text documents manually annotated for species mentions.", "downloads": 305, "configs": {"linnaeus": {"config_name": "linnaeus", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Scp160p\\\", \\\",\\\", \\\"a\\\", \\\"multiple\\\", \\\"KH\\\", \\\"-\\\", \\\"doma...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": 
"ner_tags"}, "dataset_description": "A novel corpus of full-text documents manually annotated for species mentions.\n", "dataset_name": "linnaeus"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "lm1b": {"dataset_name": "lm1b", "description": "A benchmark corpus to be used for measuring progress in statistical language modeling. This has almost one billion words in the training data.", "downloads": 1029, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\"While athletes in different professions dealt wit...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "A benchmark corpus to be used for measuring progress in statistical language modeling. This has almost one billion words in the training data.\n", "dataset_name": "lm1b"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "language:en"], "is_gated": false}, "mac_morpho": {"dataset_name": "mac_morpho", "description": "Mac-Morpho is a corpus of Brazilian Portuguese texts annotated with part-of-speech tags.\nIts first version was released in 2003 [1], and since then, two revisions have been made in order\nto improve the quality of the resource [2, 3].\nThe corpus is available for download split into train, development and test sections.\nThese are 76%, 4% and 20% of the corpus total, respectively (the reason for the unusual numbers\nis that the corpus was first split into 80%/20% train/test, and then 5% of the train section was\nset aside for development). 
This split was used in [3], and new POS tagging research with Mac-Morpho\nis encouraged to follow it in order to make consistent comparisons possible.\n\n\n[1] Alu\u00edsio, S., Pelizzoni, J., Marchi, A.R., de Oliveira, L., Manenti, R., Marquiaf\u00e1vel, V. 2003.\nAn account of the challenge of tagging a reference corpus for brazilian portuguese.\nIn: Proceedings of the 6th International Conference on Computational Processing of the Portuguese Language. PROPOR 2003\n\n[2] Fonseca, E.R., Rosa, J.L.G. 2013. Mac-morpho revisited: Towards robust part-of-speech.\nIn: Proceedings of the 9th Brazilian Symposium in Information and Human Language Technology \u2013 STIL\n\n[3] Fonseca, E.R., Alu\u00edsio, Sandra Maria, Rosa, J.L.G. 2015.\nEvaluating word embeddings and a revised corpus for part-of-speech tagging in Portuguese.\nJournal of the Brazilian Computer Society.", "downloads": 286, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Jersei\\\", \\\"atinge\\\", \\\"m\\\\u00e9dia\\\", \\\"de\\\", \\\"Cr$\\\", \\\"1...\", \"pos_tags\": \"[14, 19, 14, 15, 22, 7, 14, 9, 14, 9, 3, 15, 3, 3,...\"}", "columns": ["id", "tokens", "pos_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags"}, "dataset_description": "\nMac-Morpho is a corpus of Brazilian Portuguese texts annotated with part-of-speech tags.\nIts first version was released in 2003 [1], and since then, two revisions have been made in order\nto improve the quality of the resource [2, 3].\nThe corpus is available for download split into train, development and test sections.\nThese are 76%, 4% and 20% of the corpus total, respectively (the reason for the unusual numbers\nis that the corpus was first split into 80%/20% train/test, and then 5% of the train section was\nset aside for development). 
This split was used in [3], and new POS tagging research with Mac-Morpho\nis encouraged to follow it in order to make consistent comparisons possible.\n\n\n[1] Alu\u00edsio, S., Pelizzoni, J., Marchi, A.R., de Oliveira, L., Manenti, R., Marquiaf\u00e1vel, V. 2003.\nAn account of the challenge of tagging a reference corpus for brazilian portuguese.\nIn: Proceedings of the 6th International Conference on Computational Processing of the Portuguese Language. PROPOR 2003\n\n[2] Fonseca, E.R., Rosa, J.L.G. 2013. Mac-morpho revisited: Towards robust part-of-speech.\nIn: Proceedings of the 9th Brazilian Symposium in Information and Human Language Technology \u2013 STIL\n\n[3] Fonseca, E.R., Alu\u00edsio, Sandra Maria, Rosa, J.L.G. 2015.\nEvaluating word embeddings and a revised corpus for part-of-speech tagging in Portuguese.\nJournal of the Brazilian Computer Society.\n", "dataset_name": "mac_morpho"}}, "tags": ["task_categories:token-classification", "task_ids:part-of-speech", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt"], "is_gated": false}, "masakhaner": {"dataset_name": "masakhaner", "description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811", "downloads": 2048, "configs": 
{"amh": {"config_name": "amh", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u1240\\\\u12f3\\\\u121a\\\\u12cd\\\", \\\"\\\\u12e8\\\\u1236\\\\u121b\\\\u...\", \"ner_tags\": \"[0, 5, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "hau": {"config_name": "hau", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"A\\\", \\\"saurari\\\", \\\"cikakken\\\", \\\"rahoton\\\", \\\"wakilin\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 3, 4, 1, 2]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity 
dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "ibo": {"config_name": "ibo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ike\\\", \\\"\\\\u1ecbda\\\", \\\"j\\\\u1ee5\\\\u1ee5\\\", \\\"ot\\\\u1ee5\\\", \\\"...\", \"ner_tags\": \"[0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "kin": {"config_name": "kin", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ambasaderi\\\", \\\"wa\\\", \\\"EU\\\", \\\"mu\\\", \\\"Rwanda\\\", \\\",\\\", \\\"N...\", \"ner_tags\": \"[0, 0, 3, 0, 5, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the 
first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "lug": {"config_name": "lug", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Empaka\\\", \\\"zaakubeera\\\", \\\"mu\\\", \\\"kibuga\\\", \\\"Liverpoo...\", \"ner_tags\": \"[0, 0, 0, 0, 5, 0, 5, 0, 0, 0, 7, 8, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "luo": 
{"config_name": "luo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\ufeffJii\\\", \\\"2\\\", \\\"moko\\\", \\\"jowito\\\", \\\"ngimagi\\\", \\\"k...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "pcm": {"config_name": "pcm", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Mixed\\\", \\\"Martial\\\", \\\"Arts\\\", \\\"joinbodi\\\", \\\",\\\", \\\"Ult...\", \"ner_tags\": \"[3, 4, 4, 0, 0, 3, 4, 4, 0, 3, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] 
.\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "swa": {"config_name": "swa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Wizara\\\", \\\"ya\\\", \\\"afya\\\", \\\"ya\\\", \\\"Tanzania\\\", \\\"imerip...\", \"ner_tags\": \"[3, 4, 4, 4, 4, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "wol": {"config_name": "wol", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"SAFIYETU\\\", \\\"B\\\\u00c9EY\\\", \\\"C\\\\u00e9y\\\", \\\"Koronaa\\\", \\\"...\", \"ner_tags\": \"[1, 2, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first 
large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "yor": {"config_name": "yor", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"K\\\\u00f2\\\", \\\"s\\\\u00ed\\\", \\\"\\\\u1eb9\\\\u0300r\\\\u00ed\\\", \\\"t\\\\u...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}}, "tags": 
["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:am", "language:ha", "language:ig", "language:lg", "language:luo", "language:pcm", "language:rw", "language:sw", "language:wo", "language:yo"], "is_gated": false}, "math_qa": {"dataset_name": "math_qa", "description": "Our dataset is gathered by using a new representation language to annotate over the AQuA-RAT dataset. AQuA-RAT has provided the questions, options, rationale, and the correct options.", "downloads": 31555, "configs": {"default": {"config_name": "default", "sample_row": "{\"Problem\": \"\\\"the banker ' s gain of a certain sum due 3 years ...\", \"Rationale\": \"\\\"\\\\\\\"explanation : t = 3 years r = 10 % td = ( bg \\\\u...\", \"options\": \"\\\"a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d ) ...\", \"correct\": \"\\\"a\\\"\", \"annotated_formula\": \"\\\"divide(multiply(const_100, divide(multiply(36, co...\", \"linear_formula\": \"\\\"multiply(n2,const_100)|multiply(n0,n1)|divide(#0,...\", \"category\": \"\\\"gain\\\"\"}", "columns": ["Problem", "Rationale", "options", "correct", "annotated_formula", "linear_formula", "category"], "columns_mapping": {"Problem": "Problem", "Rationale": "Rationale", "options": "options", "correct": "correct", "annotated_formula": "annotated_formula", "linear_formula": "linear_formula", "category": "category"}, "dataset_description": "\nOur dataset is gathered by using a new representation language to annotate over the AQuA-RAT dataset. 
AQuA-RAT has provided the questions, options, rationale, and the correct options.\n", "dataset_name": "math_qa"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|aqua_rat", "language:en"], "is_gated": false}, "mbpp": {"dataset_name": "mbpp", "description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been\nhand-verified by the authors.", "downloads": 12392, "configs": {"full": {"config_name": "full", "sample_row": "{\"task_id\": \"601\", \"text\": \"\\\"Write a function to find the longest chain which ...\", \"code\": \"\\\"class Pair(object): \\\\r\\\\n\\\\tdef __init__(self, a, b...\", \"test_list\": \"[\\\"assert max_chain_length([Pair(5, 24), Pair(15, 2...\", \"test_setup_code\": \"\\\"\\\"\", \"challenge_test_list\": \"[]\"}", "columns": ["task_id", "text", "code", "test_list", "test_setup_code", "challenge_test_list"], "columns_mapping": {"task_id": "task_id", "text": "text", "code": "code", "test_list": "test_list", "test_setup_code": "test_setup_code", "challenge_test_list": "challenge_test_list"}, "dataset_description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. 
The sanitized subset of the data has been\nhand-verified by the authors.\n", "dataset_name": "mbpp"}, "sanitized": {"config_name": "sanitized", "sample_row": "{\"source_file\": \"\\\"Benchmark Questions Verification V2.ipynb\\\"\", \"task_id\": \"602\", \"prompt\": \"\\\"Write a python function to find the first repeate...\", \"code\": \"\\\"def first_repeated_char(str1):\\\\n for index,c in ...\", \"test_imports\": \"[]\", \"test_list\": \"[\\\"assert first_repeated_char(\\\\\\\"abcabc\\\\\\\") == \\\\\\\"a\\\\\\\"\\\"...\"}", "columns": ["source_file", "task_id", "prompt", "code", "test_imports", "test_list"], "columns_mapping": {"source_file": "source_file", "task_id": "task_id", "prompt": "prompt", "code": "code", "test_imports": "test_imports", "test_list": "test_list"}, "dataset_description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. 
The sanitized subset of the data has been\nhand-verified by the authors.\n", "dataset_name": "mbpp"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "code-generation"], "is_gated": false}, "mc4": {"dataset_name": "mc4", "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.", "downloads": 20390, "configs": {"af": {"config_name": "af", "sample_row": "{\"text\": \"\\\"Toe was daar nie plek vir telling teen Ikeys | Ne...\", \"timestamp\": \"\\\"2018-11-19T07:24:51Z\\\"\", \"url\": \"\\\"https://www.netwerk24.com/Sport/Rugby/toe-was-daa...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "am": {"config_name": "am", "sample_row": "{\"text\": \"\\\"\\\\u1260\\\\u1309\\\\u122d\\\\u121d\\\\u1235\\\\u1293 \\\\u12d5\\\\u12f5...\", \"timestamp\": \"\\\"2019-06-20T13:32:25Z\\\"\", \"url\": \"\\\"https://malvorlagen-seite.de/am/pubertaet-bei-jug...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ar": {"config_name": "ar", "sample_row": "{\"text\": \"\\\"\\\\\\\"\\\\u062e\\\\u0644\\\\u064a \\\\u0648\\\\u0631\\\\u0642\\\\u062a\\\\u06...\", 
\"timestamp\": \"\\\"2018-11-14T08:51:59Z\\\"\", \"url\": \"\\\"http://www.ghadinews.net/newsdet.aspx?id=8909&id2...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "az": {"config_name": "az", "sample_row": "{\"text\": \"\\\"M\\\\u00fchacir\\\\u0259t m\\\\u00f6vzusunun \\\\u00f6yr\\\\u025...\", \"timestamp\": \"\\\"2019-01-23T08:22:09Z\\\"\", \"url\": \"\\\"https://azertag.az/xeber/Muhaciret_movzusunun_oyr...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "be": {"config_name": "be", "sample_row": "{\"text\": \"\\\"\\\\u0410\\\\u0434\\\\u0437\\\\u0456\\\\u043d \\\\u043c\\\\u0430\\\\u043b...\", \"timestamp\": \"\\\"2019-02-20T00:21:49Z\\\"\", \"url\": \"\\\"http://uzv.by/adzin-malenki-uspamin-z-dzyacinstva...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "bg": {"config_name": "bg", "sample_row": "{\"text\": \"\\\"\\\\u0410\\\\u043c\\\\u0435\\\\u0440\\\\u0438\\\\u043a\\\\u0430\\\\u043d\\\\...\", \"timestamp\": \"\\\"2020-05-31T09:29:33Z\\\"\", \"url\": \"\\\"http://www.spacenewsbg.com/news/29/April/2020/537...\"}", 
"columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "bg-Latn": {"config_name": "bg-Latn", "sample_row": "{\"text\": \"\\\"Prezzi e Quotazioni Aggiornate 2017 Opel Astra-5-...\", \"timestamp\": \"\\\"2017-10-17T20:44:56Z\\\"\", \"url\": \"\\\"http://listino.infomotori.com/quotazione_usato/op...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "bn": {"config_name": "bn", "sample_row": "{\"text\": \"\\\"\\\\u09a6\\\\u09cd\\\\u09ac\\\\u09bf\\\\u09a4\\\\u09c0\\\\u09df \\\\u09ae...\", \"timestamp\": \"\\\"2019-09-19T00:07:47Z\\\"\", \"url\": \"\\\"http://dailysylhet.com/details/419696\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ca": {"config_name": "ca", "sample_row": "{\"text\": \"\\\"Les croades by Oriol_ins_front_m... 
999 views\\\\nVi...\", \"timestamp\": \"\\\"2019-07-20T05:40:39Z\\\"\", \"url\": \"\\\"https://www.slideshare.net/quarteso/el-reino-delo...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ceb": {"config_name": "ceb", "sample_row": "{\"text\": \"\\\"Khet Rat Burana - Wikipedia\\\\nTiganos: 13\\\\u00b040\\\\...\", \"timestamp\": \"\\\"2020-08-07T05:31:55Z\\\"\", \"url\": \"\\\"https://ceb.wikipedia.org/wiki/Khet_Rat_Burana\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "co": {"config_name": "co", "sample_row": "{\"text\": \"\\\"Prima pagina FEMINA CLUB Vin rosu cu banane 27 Ju...\", \"timestamp\": \"\\\"2017-07-27T14:49:10Z\\\"\", \"url\": \"\\\"http://www.revistamagazin.ro/content/view/5247/5/...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "cs": {"config_name": "cs", "sample_row": "{\"text\": \"\\\"Kempy & Soust\\\\u0159ed\\\\u011bn\\\\u00ed Aerobik klub O...\", \"timestamp\": \"\\\"2020-01-29T18:58:35Z\\\"\", \"url\": \"\\\"http://www.aerobikolomouc.cz/ak-olomouc/kempy-201...\"}", "columns": ["text", 
"timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "cy": {"config_name": "cy", "sample_row": "{\"text\": \"\\\"Red River: \\\\u039f\\\\u03c1\\\\u03b9\\\\u03c3\\\\u03bc\\\\u03ad\\\\u...\", \"timestamp\": \"\\\"2018-03-22T15:56:46Z\\\"\", \"url\": \"\\\"http://followtheredriver.blogspot.com/2012/06/blo...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "da": {"config_name": "da", "sample_row": "{\"text\": \"\\\"Om\\\\u00f8 - Wikipedia's Om\\\\u00f8 as translated by ...\", \"timestamp\": \"\\\"2020-08-11T10:15:26Z\\\"\", \"url\": \"\\\"https://dan.wikitrans.net/Om%C3%B8\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "de": {"config_name": "de", "sample_row": "{\"text\": \"\\\"Home - Homepage des Kunstvereins Pro Ars Lausitz ...\", \"timestamp\": \"\\\"2018-01-20T18:56:35Z\\\"\", \"url\": \"\\\"http://proarslausitz.de/1.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl 
corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "el": {"config_name": "el", "sample_row": "{\"text\": \"\\\"\\\\u03a4\\\\u03b1 \\\\u03ba\\\\u03b1\\\\u03bb\\\\u03cd\\\\u03c4\\\\u03b5...\", \"timestamp\": \"\\\"2017-07-21T19:11:04Z\\\"\", \"url\": \"\\\"https://www.tripadvisor.com.gr/Restaurants-g61240...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "el-Latn": {"config_name": "el-Latn", "sample_row": "{\"text\": \"\\\"Art.No.: VB-200108-10-H\\\\nUrsula writes: 06.08.201...\", \"timestamp\": \"\\\"2019-08-19T12:28:32Z\\\"\", \"url\": \"\\\"https://www.vivobarefoot.de/en/ladies/vivobarefoo...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "en": {"config_name": "en", "sample_row": "{\"text\": \"\\\"Posts 4,362\\\\tMore Info\\\\nOkay so to those of you t...\", \"timestamp\": \"\\\"2014-03-09T04:06:28Z\\\"\", \"url\": \"\\\"http://www.polkaudio.com/forums/showthread.php?58...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", 
"dataset_name": "mc4"}, "eo": {"config_name": "eo", "sample_row": "{\"text\": \"\\\"Oberiu - Wikipedia's Oberiu as translated by Gram...\", \"timestamp\": \"\\\"2019-06-16T09:35:01Z\\\"\", \"url\": \"\\\"https://epo.wikitrans.net/Oberiu\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "es": {"config_name": "es", "sample_row": "{\"text\": \"\\\"Comprar Zapatillas para ni\\\\u00f1a en chancla con ...\", \"timestamp\": \"\\\"2019-01-18T17:11:30Z\\\"\", \"url\": \"\\\"https://www.calzadoslabalear.com/es/zapatillas-mu...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "et": {"config_name": "et", "sample_row": "{\"text\": \"\\\"EUROPAELi avatud andmete portaal Andmed Andmete a...\", \"timestamp\": \"\\\"2018-03-23T04:56:35Z\\\"\", \"url\": \"\\\"http://data.europa.eu/euodp/et/data/dataset/secto...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "eu": {"config_name": "eu", "sample_row": "{\"text\": \"\\\"Liverpool: The Beatles-en jaioterria eta ametsen ...\", \"timestamp\": \"\\\"2018-07-19T13:36:42Z\\\"\", \"url\": 
\"\\\"http://www.durangojesuitak.org/liverpool-the-beat...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "fa": {"config_name": "fa", "sample_row": "{\"text\": \"\\\"\\\\u0642\\\\u06cc\\\\u0645\\\\u062a \\\\u062f\\\\u0648\\\\u0631\\\\u0628...\", \"timestamp\": \"\\\"2018-10-23T17:29:51Z\\\"\", \"url\": \"\\\"http://sib7.com/%D9%85%D8%B1%D8%A7%D9%82%D8%A8%D8...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "fi": {"config_name": "fi", "sample_row": "{\"text\": \"\\\"Kontiolahti - Ihmisen pelastaminen - Kelkkailijat...\", \"timestamp\": \"\\\"2020-02-28T14:21:24Z\\\"\", \"url\": \"\\\"https://www.pkpelastuslaitos.fi/onnettomuustiedot...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "fil": {"config_name": "fil", "sample_row": "{\"text\": \"\\\"\\\\ud83d\\\\ude00 Halimbawa ng thesis sa filipino tung...\", \"timestamp\": \"\\\"2019-04-19T17:14:36Z\\\"\", \"url\": \"\\\"http://talisman-intl.com/halimbawa-ng-thesis-sa-f...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": 
"timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "fr": {"config_name": "fr", "sample_row": "{\"text\": \"\\\"Le sacre de philippe ier, 23 mai 1059 - Compte Re...\", \"timestamp\": \"\\\"2017-12-15T04:37:34Z\\\"\", \"url\": \"\\\"http://www.etudier.com/dissertations/Le-Sacre-De-...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "fy": {"config_name": "fy", "sample_row": "{\"text\": \"\\\"Business Park Mas Blau II Place Pla de L\\\\u2019Est...\", \"timestamp\": \"\\\"2019-07-20T12:10:15Z\\\"\", \"url\": \"\\\"https://1worldirectory.com/28th-euro-global-neuro...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ga": {"config_name": "ga", "sample_row": "{\"text\": \"\\\"Smaointe F\\\\u00e1nacha Aonghusa: Comhl\\\\u00e1n\\\\u00f...\", \"timestamp\": \"\\\"2018-01-18T18:03:55Z\\\"\", \"url\": \"\\\"https://aonghus.blogspot.com/2014/08/comhlanu-le-...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: 
\"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "gd": {"config_name": "gd", "sample_row": "{\"text\": \"\\\"Caol Reatha - Uicipeid\\\\nCo-chomharran: 57\\\\u00b013...\", \"timestamp\": \"\\\"2020-08-03T21:31:47Z\\\"\", \"url\": \"\\\"https://gd.m.wikipedia.org/wiki/Caol_Reatha\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "gl": {"config_name": "gl", "sample_row": "{\"text\": \"\\\"Niza (San Sebasti\\\\u00e1n - Donostia, Espa\\\\u00f1a)...\", \"timestamp\": \"\\\"2017-09-25T14:11:41Z\\\"\", \"url\": \"\\\"https://www.tripadvisor.es/VacationRentalReview-g...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "gu": {"config_name": "gu", "sample_row": "{\"text\": \"\\\"\\\\u0aee\\\\u0ae6% \\\\u0aa8\\\\u0abf\\\\u0a95\\\\u0abe\\\\u0ab8\\\\u0a9...\", \"timestamp\": \"\\\"2018-12-10T08:21:40Z\\\"\", \"url\": \"\\\"http://sandesh.com/80-exporters-gst-refund-7-m/\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ha": {"config_name": "ha", 
"sample_row": "{\"text\": \"\\\"Ma\\\\u0257aukaki Matsala, Magana, Magani bayyana: M...\", \"timestamp\": \"\\\"2019-07-22T01:13:39Z\\\"\", \"url\": \"\\\"https://www.martinvrijland.nl/ha/nazarin-labarai/...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "haw": {"config_name": "haw", "sample_row": "{\"text\": \"\\\"houses Kampala - Fashion - Fashion Accessories - ...\", \"timestamp\": \"\\\"2018-06-23T23:17:39Z\\\"\", \"url\": \"\\\"https://www.afribaba.ug/ads/houses+Kampala.htm?ci...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "hi": {"config_name": "hi", "sample_row": "{\"text\": \"\\\"6 \\\\u0938\\\\u093e\\\\u0932 \\\\u0915\\\\u0940 \\\\u092c\\\\u091a\\\\u0...\", \"timestamp\": \"\\\"2018-12-15T16:31:15Z\\\"\", \"url\": \"\\\"http://www.upuklive.com/2018/11/6_20.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "hi-Latn": {"config_name": "hi-Latn", "sample_row": "{\"text\": \"\\\"Total de visitas: 24089\\\\nHindi Book Free Download...\", \"timestamp\": \"\\\"2019-08-26T01:02:16Z\\\"\", \"url\": 
\"\\\"http://thylmotopbtovict.comunidades.net/hindi-boo...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "hmn": {"config_name": "hmn", "sample_row": "{\"text\": \"\\\"Yuav Ua Li Cas Cov Nkag Tawm Ntawm Cov Pob Taws U...\", \"timestamp\": \"\\\"2020-04-02T03:42:06Z\\\"\", \"url\": \"\\\"https://hmn.phanthanhgianfoundation.com/how-to-re...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ht": {"config_name": "ht", "sample_row": "{\"text\": \"\\\"Gwo pouvwa 40W ki ap dirije lari lanp segond\\\\u00e...\", \"timestamp\": \"\\\"2020-06-06T17:00:07Z\\\"\", \"url\": \"\\\"https://www.ledlightinside.com/ht/sword-series-le...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "hu": {"config_name": "hu", "sample_row": "{\"text\": \"\\\"Gy\\\\u00e1ri 5X112 7X17 ET54 57.1 HA1618 SEAT Gy\\\\u0...\", \"timestamp\": \"\\\"2020-07-06T09:13:31Z\\\"\", \"url\": \"\\\"https://weltgumi.hu/termek/imp_13_ha1618.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": 
"url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "hy": {"config_name": "hy", "sample_row": "{\"text\": \"\\\"\\\\u0556\\\\u0580\\\\u0561\\\\u0576\\\\u057d\\\\u056b\\\\u0561\\\\u0575\\\\...\", \"timestamp\": \"\\\"2020-06-06T20:17:50Z\\\"\", \"url\": \"\\\"https://www.1in.am/2729354.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "id": {"config_name": "id", "sample_row": "{\"text\": \"\\\"thapki full serial | Cinta Sinopsis\\\\nHome \\\\u00bb ...\", \"timestamp\": \"\\\"2017-12-11T17:34:24Z\\\"\", \"url\": \"\\\"http://cintasinopsis2.com/search/thapki-full-seri...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ig": {"config_name": "ig", "sample_row": "{\"text\": \"\\\"Hoka One One Arahi 3 Women's Allure/Mood Indigo [...\", \"timestamp\": \"\\\"2020-07-02T18:27:05Z\\\"\", \"url\": \"\\\"https://www.hoka-shoes.com/hoka-one-one-arahi-3-w...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is 
the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "is": {"config_name": "is", "sample_row": "{\"text\": \"\\\"Omegle Ifugao. Besta val Omegle Ifugao. Inn og ha...\", \"timestamp\": \"\\\"2017-07-22T16:44:11Z\\\"\", \"url\": \"\\\"http://is.theomegle.com/filippseyjar/ifugao\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "it": {"config_name": "it", "sample_row": "{\"text\": \"\\\"Porcate Da Fare Con Il Partner Video Flirt Online...\", \"timestamp\": \"\\\"2017-08-18T23:57:08Z\\\"\", \"url\": \"\\\"http://gerebe.eu/porcate-da-fare-con-il-partner-v...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "iw": {"config_name": "iw", "sample_row": "{\"text\": \"\\\"\\\\u05d6\\\\u05db\\\\u05d5\\\\u05ea \\\\u05d4\\\\u05e9\\\\u05d1\\\\u05d9...\", \"timestamp\": \"\\\"2019-03-18T19:44:40Z\\\"\", \"url\": \"\\\"https://www.yeshiva.org.il/midrash/14790\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ja": {"config_name": "ja", "sample_row": "{\"text\": 
\"\\\"\\\\u751f\\\\u516b\\\\u3064\\\\u6a4b\\\\u306e\\\\u30bf\\\\u30b0\\\\u307e\\\\...\", \"timestamp\": \"\\\"2020-05-27T07:31:25Z\\\"\", \"url\": \"\\\"https://www.exblog.jp/tag/keyword/%E7%94%9F%E5%85...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ja-Latn": {"config_name": "ja-Latn", "sample_row": "{\"text\": \"\\\"Yuria Ashina - Pics & Movies Galleries - Teenax\\\\n...\", \"timestamp\": \"\\\"2017-09-25T10:18:12Z\\\"\", \"url\": \"\\\"http://www.teenax.com/free/pics-movies/yuria/ashi...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "jv": {"config_name": "jv", "sample_row": "{\"text\": \"\\\"Parcel Baby Born | IklanBarisMassal.com | sebar i...\", \"timestamp\": \"\\\"2018-11-15T23:20:00Z\\\"\", \"url\": \"\\\"http://iklanbarismassal.iklanbaris.org/tag/parcel...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ka": {"config_name": "ka", "sample_row": "{\"text\": \"\\\"\\\\u10e0\\\\u10e3\\\\u10e1\\\\u10d4\\\\u10d7\\\\u10d8\\\\u10e1 \\\\u10de...\", \"timestamp\": \"\\\"2018-07-17T16:07:58Z\\\"\", \"url\": 
\"\\\"https://www.radiotavisupleba.ge/a/rusetis-presis-...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "kk": {"config_name": "kk", "sample_row": "{\"text\": \"\\\"\\\\u0422\\\\u0430\\\\u049b\\\\u044b\\\\u0440\\\\u044b\\\\u043f: \\\\u041...\", \"timestamp\": \"\\\"2017-11-21T08:13:58Z\\\"\", \"url\": \"\\\"http://www.tarbie.kz/1205\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "km": {"config_name": "km", "sample_row": "{\"text\": \"\\\"\\\\u1794\\\\u17d2\\\\u179b\\\\u17c2\\\\u1780\\\\u17d7 \\\\u17d6\\\\u1796...\", \"timestamp\": \"\\\"2019-02-16T12:30:53Z\\\"\", \"url\": \"\\\"http://youfeed.net/archives/27\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "kn": {"config_name": "kn", "sample_row": "{\"text\": \"\\\"\\\\u0cb8\\\\u0ca6\\\\u0ccd\\\\u0caf\\\\u0ca6\\\\u0cb2\\\\u0ccd\\\\u0cb2\\\\...\", \"timestamp\": \"\\\"2020-07-15T18:51:35Z\\\"\", \"url\": \"\\\"https://kannada.goodreturns.in/news/new-one-rupee...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": 
"timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ko": {"config_name": "ko", "sample_row": "{\"text\": \"\\\"\\\\uc6c0\\\\uc9e4 - 19 \\\\uc774\\\\uc0c1\\\\ub9cc | \\\\ub2e4\\\\uc6...\", \"timestamp\": \"\\\"2020-07-13T03:51:37Z\\\"\", \"url\": \"\\\"https://leesangman.com/tag/%EC%9B%80%EC%A7%A4/\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ku": {"config_name": "ku", "sample_row": "{\"text\": \"\\\"\\\\ufffd\\\\u06b3\\\\ufffd\\\\u0177\\\\ufffd\\\\ufffd\\\\ufffd\\\\ufffd\\\\...\", \"timestamp\": \"\\\"2016-10-27T19:37:29Z\\\"\", \"url\": \"\\\"http://dl.rakuten.co.jp/prod/800822116.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ky": {"config_name": "ky", "sample_row": "{\"text\": \"\\\"\\\\u042f\\\\u043b\\\\u0433\\\\u044b\\\\u0448\\\\u043b\\\\u0430\\\\u0440\\\\...\", \"timestamp\": \"\\\"2019-07-19T03:43:32Z\\\"\", \"url\": \"\\\"http://atnya-rt.ru/news/mgyiyat/ialgislarni-buldi...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl 
corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "la": {"config_name": "la", "sample_row": "{\"text\": \"\\\"OUDDORP - Huisartsenpraktijk Kop van 't Eiland | ...\", \"timestamp\": \"\\\"2019-08-18T07:52:35Z\\\"\", \"url\": \"\\\"https://www.vanderschootarchitecten.nl/projecten/...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "lb": {"config_name": "lb", "sample_row": "{\"text\": \"\\\"Truck Driving Jobs At Decker Truck Line | TruckDr...\", \"timestamp\": \"\\\"2019-11-21T09:17:59Z\\\"\", \"url\": \"\\\"https://truckdriverjobsingreatfallsmt.com/truck-d...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "lo": {"config_name": "lo", "sample_row": "{\"text\": \"\\\"\\\\u0e99\\\\u0eb3\\\\u200b\\\\u0e9e\\\\u0ea3\\\\u0eb0\\\\u200b\\\\u0e81\\\\...\", \"timestamp\": \"\\\"2014-03-16T04:29:14Z\\\"\", \"url\": \"\\\"https://www.lds.org/general-conference/2013/04/th...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": 
"mc4"}, "lt": {"config_name": "lt", "sample_row": "{\"text\": \"\\\"VALERIJA VILKAUSKIEN\\\\u0116\\\\nGid\\\\u0117 VALERIJA VI...\", \"timestamp\": \"\\\"2017-06-29T12:32:45Z\\\"\", \"url\": \"\\\"http://turizmokatalogas.lt/gidas/vilkauskiene-val...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "lv": {"config_name": "lv", "sample_row": "{\"text\": \"\\\"R\\\\u012bg\\\\u0101 2004. gada 26. mart\\\\u0101\\\\napstipr...\", \"timestamp\": \"\\\"2019-12-08T00:39:13Z\\\"\", \"url\": \"\\\"https://likumi.lv/doc.php?id=86335\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mg": {"config_name": "mg", "sample_row": "{\"text\": \"\\\"Find the best CPA or Tax Accountant in Papaaloa, ...\", \"timestamp\": \"\\\"2016-12-09T19:15:55Z\\\"\", \"url\": \"\\\"http://www.taxbuzz.com/find-the-best-tax-accounta...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mi": {"config_name": "mi", "sample_row": "{\"text\": \"\\\"\\\\u0411\\\\u0435\\\\u0442\\\\u043e\\\\u043d\\\\u0438, \\\\u0441\\\\u044...\", \"timestamp\": 
\"\\\"2017-07-20T16:43:12Z\\\"\", \"url\": \"\\\"https://ukrreferat.com/chapters/avtoref/betoni-st...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mk": {"config_name": "mk", "sample_row": "{\"text\": \"\\\"\\\\u0422\\\\u0420\\\\u0413\\\\u041d\\\\u0410\\\\u0410 \\\\u0421\\\\u041e...\", \"timestamp\": \"\\\"2018-11-20T14:56:13Z\\\"\", \"url\": \"\\\"http://sport.com.mk/megjunaroden-fudbal/uefa-evro...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ml": {"config_name": "ml", "sample_row": "{\"text\": \"\\\"\\\\u0d12\\\\u0d30\\\\u0d41 Ketogenic \\\\u0d21\\\\u0d2f\\\\u0d31\\\\u...\", \"timestamp\": \"\\\"2020-08-15T04:29:47Z\\\"\", \"url\": \"\\\"https://ml.elpasobackclinic.com/what-is-a-ketogen...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mn": {"config_name": "mn", "sample_row": "{\"text\": \"\\\"\\\\u0421\\\\u043f\\\\u043e\\\\u0440\\\\u0442\\\\t25 \\\\u041d\\\\u043e\\\\u...\", \"timestamp\": \"\\\"2020-06-03T22:37:43Z\\\"\", \"url\": \"\\\"https://vtinform.com/news/145/150979/\\\"\"}", "columns": ["text", 
"timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mr": {"config_name": "mr", "sample_row": "{\"text\": \"\\\"uedbet\\\\u7b2c\\\\u4e94\\\\u5341\\\\u4e5d\\\\u7ae0 \\\\u5c01\\\\u4faf...\", \"timestamp\": \"\\\"2019-10-16T04:53:12Z\\\"\", \"url\": \"\\\"http://www.oybx.cn/ddk1597/1662712.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ms": {"config_name": "ms", "sample_row": "{\"text\": \"\\\"Suzana Mustafa: Bunga Ros Camellia\\\\nBunga Ros Cam...\", \"timestamp\": \"\\\"2018-09-19T21:23:16Z\\\"\", \"url\": \"\\\"http://diariann.blogspot.com/2012/05/bunga-ros-ca...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mt": {"config_name": "mt", "sample_row": "{\"text\": \"\\\"Tastaturi \\\\u00een Iasi - OLX.ro\\\\nAnunturi Iasi - ...\", \"timestamp\": \"\\\"2017-12-16T15:20:24Z\\\"\", \"url\": \"\\\"https://www.olx.ro/iasi_39939/q-tastaturi/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web 
crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "my": {"config_name": "my", "sample_row": "{\"text\": \"\\\"\\\\u1042\\\\u1040\\\\u1041\\\\u1040 \\\\u1001\\\\u102f\\\\u108f\\\\u103d...\", \"timestamp\": \"\\\"2018-08-19T11:14:40Z\\\"\", \"url\": \"\\\"http://thevoicemyanmar.com/about-us/18711-lkl\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ne": {"config_name": "ne", "sample_row": "{\"text\": \"\\\"\\\\u092a\\\\u094b\\\\u0930\\\\u094d\\\\u091a\\\\u0941\\\\u0917\\\\u0932\\\\...\", \"timestamp\": \"\\\"2019-04-22T00:04:13Z\\\"\", \"url\": \"\\\"http://vishwanews.com/Articles/view/4012\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "nl": {"config_name": "nl", "sample_row": "{\"text\": \"\\\"Vijf gouden tips voor succesvol zaken doen met Ja...\", \"timestamp\": \"\\\"2019-02-22T15:37:25Z\\\"\", \"url\": \"\\\"https://ondernemingen.bnpparibasfortis.be/nl/arti...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by 
AllenAI.\n", "dataset_name": "mc4"}, "no": {"config_name": "no", "sample_row": "{\"text\": \"\\\"Alf-tande petersen: - Jeg klarte ikke \\\\u00e5 beve...\", \"timestamp\": \"\\\"2020-06-03T22:36:11Z\\\"\", \"url\": \"\\\"https://www.seher.no/kjendis/jeg-klarte-ikke-a-be...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ny": {"config_name": "ny", "sample_row": "{\"text\": \"\\\"Date Latino Women In Ikawa, Shizuoka - Chat To La...\", \"timestamp\": \"\\\"2020-03-29T17:45:30Z\\\"\", \"url\": \"\\\"https://www.afroromance.com/members/Japan/Shizuok...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "pa": {"config_name": "pa", "sample_row": "{\"text\": \"\\\"\\\\u0a07\\\\u0a38\\\\u0a32\\\\u0a3e\\\\u0a2e\\\\u0a3e\\\\u0a2c\\\\u0a3e\\\\...\", \"timestamp\": \"\\\"2020-08-06T13:06:11Z\\\"\", \"url\": \"\\\"https://jagbani.punjabkesari.in/international/new...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "pl": {"config_name": "pl", "sample_row": "{\"text\": \"\\\"Author: Dorothy Celeste\\\\nISBN: 
779-8-61280-301-9\\\\...\", \"timestamp\": \"\\\"2018-06-20T22:33:57Z\\\"\", \"url\": \"\\\"http://downloadallstuffs.club/best/gran-canaria-p...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ps": {"config_name": "ps", "sample_row": "{\"text\": \"\\\"\\\\u062f \\\\u0648\\\\u0644\\\\u0633\\\\u0645\\\\u0634\\\\u0631 \\\\u062...\", \"timestamp\": \"\\\"2018-06-25T12:00:13Z\\\"\", \"url\": \"\\\"https://kabull.com/%D8%AF-%D9%88%D9%84%D8%B3%D9%8...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "pt": {"config_name": "pt", "sample_row": "{\"text\": \"\\\"Nova atra\\\\u00e7\\\\u00e3o de corredeiras do Sea Worl...\", \"timestamp\": \"\\\"2017-12-12T02:42:12Z\\\"\", \"url\": \"\\\"http://malucasepiradas.com.br/orlando/infinityfal...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ro": {"config_name": "ro", "sample_row": "{\"text\": \"\\\"Download Darone Feat. 
Amanda Wilson - Believe In ...\", \"timestamp\": \"\\\"2017-01-22T18:13:35Z\\\"\", \"url\": \"\\\"http://www.muzicanet.net/descarca-romaneasca/Daro...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ru": {"config_name": "ru", "sample_row": "{\"text\": \"\\\"\\\\u2714\\\\ud83d\\\\udc4d\\\\ud83c\\\\udfff \\\\u041a\\\\u0443\\\\u043f...\", \"timestamp\": \"\\\"2017-09-23T16:16:52Z\\\"\", \"url\": \"\\\"https://needhack.ru/zakazat-organic-mask-v-novosi...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ru-Latn": {"config_name": "ru-Latn", "sample_row": "{\"text\": \"\\\"Page 1977 of 3320.\\\\n39521 of 66398. 
51471-Issledo...\", \"timestamp\": \"\\\"2017-08-20T23:15:02Z\\\"\", \"url\": \"\\\"http://writer5.ru/prompter/page1976.php\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sd": {"config_name": "sd", "sample_row": "{\"text\": \"\\\"\\\\u062f\\\\u0645\\\\u0627\\\\u0633\\\\u0646\\\\u062c \\\\u0648 \\\\u063...\", \"timestamp\": \"\\\"2019-09-20T12:23:42Z\\\"\", \"url\": \"\\\"https://rabinseh.com/%D8%AF%D9%85%D8%A7%D8%B3%D9%...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "si": {"config_name": "si", "sample_row": "{\"text\": \"\\\"\\\\u0d85\\\\u0db1\\\\u0dd4\\\\u0dc4\\\\u0dc3\\\\u0dca \\\\u0dbd\\\\u0db6...\", \"timestamp\": \"\\\"2018-06-22T17:19:49Z\\\"\", \"url\": \"\\\"http://tharunie.lk/component/k2/item/2907-%E0%B6%...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sk": {"config_name": "sk", "sample_row": "{\"text\": \"\\\"V\\\\u00fdsledok vyh\\\\u013ead\\\\u00e1vania pre \\\\u201e\\\\u...\", \"timestamp\": \"\\\"2020-02-25T17:24:35Z\\\"\", \"url\": \"\\\"https://ladasvetom.dennikn.sk/page/4/?s\\\"\"}", 
"columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sl": {"config_name": "sl", "sample_row": "{\"text\": \"\\\"Zakon o pomorski in notranji plovbi /ZPNP/\\\\nZakon...\", \"timestamp\": \"\\\"2013-05-25T15:20:01Z\\\"\", \"url\": \"\\\"http://zakonodaja.gov.si/rpsi/r08/predpis_ZAKO121...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sm": {"config_name": "sm", "sample_row": "{\"text\": \"\\\"Samoa Observer | Manatu O Le Fa\\\\u2019atonu - O se...\", \"timestamp\": \"\\\"2020-08-10T16:08:09Z\\\"\", \"url\": \"\\\"https://www.samoaobserver.ws/category/article/244...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sn": {"config_name": "sn", "sample_row": "{\"text\": \"\\\"Heren Nike Air Max 95 Blauw Wit Schoenen Online,n...\", \"timestamp\": \"\\\"2017-12-14T12:58:56Z\\\"\", \"url\": \"\\\"http://www.pensacolamower.com/heren-nike-air-max-...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common 
Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "so": {"config_name": "so", "sample_row": "{\"text\": \"\\\"Kiiskii labaad ee fayruska coronavirus oo laga he...\", \"timestamp\": \"\\\"2020-03-31T19:09:40Z\\\"\", \"url\": \"\\\"http://puntlandmirror.net/kiiskii-labaad-ee-fayru...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sq": {"config_name": "sq", "sample_row": "{\"text\": \"\\\"Arritja e grupit t\\\\u00eb par\\\\u00eb t\\\\u00eb migran...\", \"timestamp\": \"\\\"2017-08-24T01:22:38Z\\\"\", \"url\": \"\\\"https://www.evropaelire.org/a/27652834.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sr": {"config_name": "sr", "sample_row": "{\"text\": \"\\\"\\\\ufeff \\\\u041d\\\\u0430\\\\u0442\\\\u0438\\\\u043e\\\\u043d\\\\u0430...\", \"timestamp\": \"\\\"2020-04-06T07:41:55Z\\\"\", \"url\": \"\\\"https://sr.time4invest.com/life-benefits-of-hikin...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by 
AllenAI.\n", "dataset_name": "mc4"}, "st": {"config_name": "st", "sample_row": "{\"text\": \"\\\"LES COMBATTANTS DE PARIS TOUJOURS EN FORCE POUR L...\", \"timestamp\": \"\\\"2018-03-24T04:14:21Z\\\"\", \"url\": \"\\\"http://drigombaki.skyrock.com/3038956371-LES-COMB...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "su": {"config_name": "su", "sample_row": "{\"text\": \"\\\"abditrass aplikator Oktober 10, 2019 New Google S...\", \"timestamp\": \"\\\"2019-10-24T05:40:49Z\\\"\", \"url\": \"\\\"http://www.abditrass.com/2019/10/jasa-geolistrik-...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sv": {"config_name": "sv", "sample_row": "{\"text\": \"\\\"Zara's Custom Tailor (Pattaya, Thailand) - omd\\\\u0...\", \"timestamp\": \"\\\"2018-11-14T05:15:44Z\\\"\", \"url\": \"\\\"https://www.tripadvisor.se/Attraction_Review-g293...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sw": {"config_name": "sw", "sample_row": "{\"text\": \"\\\"2016 - 75 Miaka Meiringen Air Base - AviaSpotter....\", \"timestamp\": 
\"\\\"2019-10-16T05:11:38Z\\\"\", \"url\": \"\\\"https://www.aviaspotter.it/75-jahre-militarflugpl...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ta": {"config_name": "ta", "sample_row": "{\"text\": \"\\\"\\\\u0b95\\\\u0bc1\\\\u0bb4\\\\u0ba8\\\\u0bcd\\\\u0ba4\\\\u0bc8 \\\\u0baa...\", \"timestamp\": \"\\\"2020-07-06T20:32:10Z\\\"\", \"url\": \"\\\"http://www.thinakaran.lk/2019/11/16/%E0%AE%95%E0%...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "te": {"config_name": "te", "sample_row": "{\"text\": \"\\\"\\\\u0c2e\\\\u0c3f\\\\u0c15\\\\u0c4d\\\\u0c30\\\\u0c4b\\\\u0c2e\\\\u0c3e\\\\...\", \"timestamp\": \"\\\"2018-11-17T19:38:19Z\\\"\", \"url\": \"\\\"https://www.pricedekho.com/te/tablets/micromax-fu...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "tg": {"config_name": "tg", "sample_row": "{\"text\": \"\\\"\\\\u0412\\\\u0430\\\\u0437\\\\u0438\\\\u0440\\\\u0438 \\\\u043a\\\\u043e...\", \"timestamp\": \"\\\"2019-11-13T12:05:58Z\\\"\", \"url\": \"\\\"https://www.ozodi.org/a/609049.html\\\"\"}", "columns": 
["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "th": {"config_name": "th", "sample_row": "{\"text\": \"\\\"\\\\u0e1d\\\\u0e32\\\\u0e01\\\\u0e40\\\\u0e07\\\\u0e34\\\\u0e19 \\\\u0e01...\", \"timestamp\": \"\\\"2019-06-26T03:50:41Z\\\"\", \"url\": \"\\\"http://luatthanhnien.com/%E0%B8%81%E0%B8%B2%E0%B8...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "tr": {"config_name": "tr", "sample_row": "{\"text\": \"\\\"Herhangi bir konuda \\\\u015feyhini aldatmamal\\\\u0131...\", \"timestamp\": \"\\\"2018-10-19T12:18:48Z\\\"\", \"url\": \"\\\"http://kalb-iselim.net/component/content/article/...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "uk": {"config_name": "uk", "sample_row": "{\"text\": \"\\\"\\\\u042f\\\\u043a \\\\u043e\\\\u0431\\\\u043c\\\\u0435\\\\u0436\\\\u0438...\", \"timestamp\": \"\\\"2017-09-22T06:25:33Z\\\"\", \"url\": \"\\\"http://vidpoviday.com/yak-obmezhiti-shvidkist-int...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A 
colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "und": {"config_name": "und", "sample_row": "{\"text\": \"\\\"Semi-Detached House for Sale - [40x70] 3200sqft 2...\", \"timestamp\": \"\\\"2019-12-09T20:49:49Z\\\"\", \"url\": \"\\\"http://www.hweeprop.com/25309733\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ur": {"config_name": "ur", "sample_row": "{\"text\": \"\\\"\\\\u0645\\\\u0641\\\\u062a\\\\u06cc \\\\u0645\\\\u062d\\\\u0645\\\\u062f...\", \"timestamp\": \"\\\"2017-09-19T11:38:03Z\\\"\", \"url\": \"\\\"http://www.geourdu.com/mufti-mohammad-naeem-pakis...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "uz": {"config_name": "uz", "sample_row": "{\"text\": \"\\\"Q056 Al-Waqiah - Qaari Usman Birnin Kebbi | dawah...\", \"timestamp\": \"\\\"2020-02-21T16:40:29Z\\\"\", \"url\": \"\\\"https://dawahnigeria.com/dawahcast/l/140302\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset 
by AllenAI.\n", "dataset_name": "mc4"}, "vi": {"config_name": "vi", "sample_row": "{\"text\": \"\\\"Th\\\\u1ee9 hai, 24/12/2018, 07:53 (GMT+7)\\\\nPh\\\\u1ea1...\", \"timestamp\": \"\\\"2020-07-15T00:59:05Z\\\"\", \"url\": \"\\\"https://ndh.vn/vi-mo/kinh-te-xa-hoi-2018-nhung-du...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "xh": {"config_name": "xh", "sample_row": "{\"text\": \"\\\"Iindlela Zakuqala Awayeshumayela Ngazo AmaNgqina ...\", \"timestamp\": \"\\\"2017-11-22T13:53:24Z\\\"\", \"url\": \"\\\"https://www.jw.org/xh/iimpapasho/iincwadi/Incwadi...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "yi": {"config_name": "yi", "sample_row": "{\"text\": \"\\\"\\\\u05d2\\\\u05d5\\\\u05d8\\\\u05e7\\\\u05e1 VOGUE \\\\u05ea\\\\u05d7...\", \"timestamp\": \"\\\"2020-07-06T16:45:27Z\\\"\", \"url\": \"\\\"https://gottex.co.il/collections/gottex/products/...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "yo": {"config_name": "yo", "sample_row": "{\"text\": \"\\\"Linda Merrin | The Jewish Week\\\\nSearch this 
site:...\", \"timestamp\": \"\\\"2016-07-25T10:48:02Z\\\"\", \"url\": \"\\\"http://www.thejewishweek.com/category/person/lind...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "zh": {"config_name": "zh", "sample_row": "{\"text\": \"\\\"\\\\u9518\\\\ufffd \\\\u6d5c\\\\ufffd\\\\ufffd\\\\ufffd88\\\\u6d93\\\\uff...\", \"timestamp\": \"\\\"2020-01-29T07:21:51Z\\\"\", \"url\": \"\\\"https://311016.cn/safe/2019/1025/14480.htm\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "zh-Latn": {"config_name": "zh-Latn", "sample_row": "{\"text\": \"\\\"Search results for author \\\\\\\"Pan, Y.\\\\\\\"\\\\nSectoral a...\", \"timestamp\": \"\\\"2013-05-21T22:30:14Z\\\"\", \"url\": \"\\\"http://www.cifor.org/online-library/search/public...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "zu": {"config_name": "zu", "sample_row": "{\"text\": \"\\\"Battle Hardened \\\\u00b7 TheJournal.ie\\\\n#battle har...\", \"timestamp\": \"\\\"2019-08-18T18:56:56Z\\\"\", \"url\": \"\\\"https://www.thejournal.ie/battle-hardened/news/\\\"...\"}", 
"columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original", "language:af", "language:am", "language:ar", "language:az", "language:be", "language:bg", "language:bn", "language:ca", "language:ceb", "language:co", "language:cs", "language:cy", "language:da", "language:de", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:fi", "language:fil", "language:fr", "language:fy", "language:ga", "language:gd", "language:gl", "language:gu", "language:ha", "language:haw", "language:he", "language:hi", "language:hmn", "language:ht", "language:hu", "language:hy", "language:id", "language:ig", "language:is", "language:it", "language:iw", "language:ja", "language:jv", "language:ka", "language:kk", "language:km", "language:kn", "language:ko", "language:ku", "language:ky", "language:la", "language:lb", "language:lo", "language:lt", "language:lv", "language:mg", "language:mi", "language:mk", "language:ml", "language:mn", "language:mr", "language:ms", "language:mt", "language:my", "language:ne", "language:nl", "language:no", "language:ny", "language:pa", "language:pl", "language:ps", "language:pt", "language:ro", "language:ru", "language:sd", "language:si", "language:sk", "language:sl", "language:sm", "language:sn", "language:so", "language:sq", "language:sr", "language:st", "language:su", "language:sv", "language:sw", "language:ta", "language:te", "language:tg", "language:th", 
"language:tr", "language:uk", "language:und", "language:ur", "language:uz", "language:vi", "language:xh", "language:yi", "language:yo", "language:zh", "language:zu"], "is_gated": false}, "med_hop": {"dataset_name": "med_hop", "description": "MedHop is based on research paper abstracts from PubMed, and the queries are about interactions between pairs of drugs. The correct answer has to be inferred by combining information from a chain of reactions of drugs and proteins.", "downloads": 507, "configs": {"original": {"config_name": "original", "sample_row": "{\"id\": \"\\\"MH_train_0\\\"\", \"question\": \"\\\"interacts_with DB00773?\\\"\", \"answer\": \"\\\"DB00072\\\"\", \"candidates\": \"[\\\"DB00072\\\", \\\"DB00294\\\", \\\"DB00338\\\", \\\"DB00341\\\", \\\"DB00...\", \"supports\": \"[\\\"Induction of apoptosis of Beta cells of the panc...\"}", "columns": ["id", "question", "answer", "candidates", "supports"], "columns_mapping": {"id": "id", "question": "question", "answer": "answer", "candidates": "candidates", "supports": "supports"}, "dataset_description": "MedHop is based on research paper abstracts from PubMed, and the queries are about interactions between pairs of drugs. 
The correct answer has to be inferred by combining information from a chain of reactions of drugs and proteins.\n", "dataset_name": "med_hop"}, "masked": {"config_name": "masked", "sample_row": "{\"id\": \"\\\"MH_train_0\\\"\", \"question\": \"\\\"interacts_with DB00773?\\\"\", \"answer\": \"\\\"___MASK51___\\\"\", \"candidates\": \"[\\\"___MASK10___\\\", \\\"___MASK16___\\\", \\\"___MASK2___\\\", \\\"_...\", \"supports\": \"[\\\"Induction of apoptosis of Beta cells of the panc...\"}", "columns": ["id", "question", "answer", "candidates", "supports"], "columns_mapping": {"id": "id", "question": "question", "answer": "answer", "candidates": "candidates", "supports": "supports"}, "dataset_description": "MedHop is based on research paper abstracts from PubMed, and the queries are about interactions between pairs of drugs. The correct answer has to be inferred by combining information from a chain of reactions of drugs and proteins.\n", "dataset_name": "med_hop"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "multi-hop"], "is_gated": false}, "medal": {"dataset_name": "medal", "description": "A large medical text dataset (14Go) curated to 4Go for abbreviation disambiguation, designed for natural language understanding pre-training in the medical domain. 
For example, DHF can be disambiguated to dihydrofolate, diastolic heart failure, dengue hemorragic fever or dihydroxyfumarate", "downloads": 382, "configs": {"default": {"config_name": "default", "sample_row": "{\"abstract_id\": \"14145090\", \"text\": \"\\\"velvet antlers vas are commonly used in tradition...\", \"location\": \"[63]\", \"label\": \"[\\\"transverse aortic constriction\\\"]\"}", "columns": ["abstract_id", "text", "location", "label"], "columns_mapping": {"abstract_id": "abstract_id", "text": "text", "location": "location", "label": "label"}, "dataset_description": "A large medical text dataset (14Go) curated to 4Go for abbreviation disambiguation, designed for natural language understanding pre-training in the medical domain. For example, DHF can be disambiguated to dihydrofolate, diastolic heart failure, dengue hemorragic fever or dihydroxyfumarate\n", "dataset_name": "medal"}}, "tags": ["task_categories:other", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "disambiguation"], "is_gated": false}, "medical_questions_pairs": {"dataset_name": "medical_questions_pairs", "description": "This dataset consists of 3048 similar and dissimilar medical question pairs hand-generated and labeled by Curai's doctors.", "downloads": 5754, "configs": {"default": {"config_name": "default", "sample_row": "{\"dr_id\": \"1\", \"question_1\": \"\\\"After how many hour from drinking an antibiotic c...\", \"question_2\": \"\\\"I have a party tonight and I took my last dose of...\", \"label\": \"1\"}", "columns": ["dr_id", "question_1", "question_2", "label"], "columns_mapping": {"dr_id": "dr_id", "question_1": "question_1", "question_2": "question_2", "label": "label"}, "dataset_description": "This dataset consists of 3048 similar and dissimilar medical question pairs hand-generated and labeled by Curai's doctors.\n", "dataset_name": "medical_questions_pairs"}}, "tags": 
["task_categories:text-classification", "task_ids:semantic-similarity-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "menyo20k_mt": {"dataset_name": "menyo20k_mt", "description": "MENYO-20k is a multi-domain parallel dataset with texts obtained from news articles, ted talks, movie transcripts, radio transcripts, science and technology texts, and other short articles curated from the web and professional translators. The dataset has 20,100 parallel sentences split into 10,070 training sentences, 3,397 development sentences, and 6,633 test sentences (3,419 multi-domain, 1,714 news domain, and 1,500 ted talks speech transcript domain). The development and test sets are available upon request.", "downloads": 291, "configs": {"menyo20k_mt": {"config_name": "menyo20k_mt", "sample_row": "{\"translation.en\": \"\\\"Unit 1: What is Creative Commons?\\\"\", \"translation.yo\": \"\\\"\\\\ufeff\\\\u00ccd\\\\u00e1 1: K\\\\u00edn ni Creative Commo...\"}", "columns": ["translation_en", "translation_yo"], "columns_mapping": {"translation.en": "translation_en", "translation.yo": "translation_yo"}, "dataset_description": "MENYO-20k is a multi-domain parallel dataset with texts obtained from news articles, ted talks, movie transcripts, radio transcripts, science and technology texts, and other short articles curated from the web and professional translators. The dataset has 20,100 parallel sentences split into 10,070 training sentences, 3,397 development sentences, and 6,633 test sentences (3,419 multi-domain, 1,714 news domain, and 1,500 ted talks speech transcript domain). 
The development and test sets are available upon request.\n", "dataset_name": "menyo20k_mt"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:en", "language:yo"], "is_gated": false}, "meta_woz": {"dataset_name": "meta_woz", "description": "MetaLWOz: A Dataset of Multi-Domain Dialogues for the Fast Adaptation of Conversation Models. We introduce the Meta-Learning Wizard of Oz (MetaLWOz) dialogue dataset for developing fast adaptation methods for conversation models. This data can be used to train task-oriented dialogue models, specifically to develop methods to quickly simulate user responses with a small amount of data. Such fast-adaptation models fall into the research areas of transfer learning and meta learning. The dataset consists of 37,884 crowdsourced dialogues recorded between two human users in a Wizard of Oz setup, in which one was instructed to behave like a bot, and the other a true human user. The users are assigned a task belonging to a particular domain, for example booking a reservation at a particular restaurant, and work together to complete the task. Our dataset spans 47 domains having 227 tasks total. Dialogues are a minimum of 10 turns long.", "downloads": 549, "configs": {"dialogues": {"config_name": "dialogues", "sample_row": "{\"id\": \"\\\"c399a493\\\"\", \"user_id\": \"\\\"c05f0462\\\"\", \"bot_id\": \"\\\"c96edf42\\\"\", \"domain\": \"\\\"AGREEMENT_BOT\\\"\", \"task_id\": \"\\\"a9203a2c\\\"\", \"turns\": \"[\\\"Hello how may I help you?\\\", \\\"i am awesome\\\", \\\"of ...\"}", "columns": ["id", "user_id", "bot_id", "domain", "task_id", "turns"], "columns_mapping": {"id": "id", "user_id": "user_id", "bot_id": "bot_id", "domain": "domain", "task_id": "task_id", "turns": "turns"}, "dataset_description": "MetaLWOz: A Dataset of Multi-Domain Dialogues for the Fast Adaptation of Conversation Models. 
We introduce the Meta-Learning Wizard of Oz (MetaLWOz) dialogue dataset for developing fast adaptation methods for conversation models. This data can be used to train task-oriented dialogue models, specifically to develop methods to quickly simulate user responses with a small amount of data. Such fast-adaptation models fall into the research areas of transfer learning and meta learning. The dataset consists of 37,884 crowdsourced dialogues recorded between two human users in a Wizard of Oz setup, in which one was instructed to behave like a bot, and the other a true human user. The users are assigned a task belonging to a particular domain, for example booking a reservation at a particular restaurant, and work together to complete the task. Our dataset spans 47 domains having 227 tasks total. Dialogues are a minimum of 10 turns long.\n", "dataset_name": "meta_woz"}, "tasks": {"config_name": "tasks", "sample_row": "{\"task_id\": \"\\\"4a06139e\\\"\", \"domain\": \"\\\"UPDATE_CALENDAR\\\"\", \"bot_prompt\": \"\\\"Schedule the user's meeting request\\\"\", \"bot_role\": \"\\\"You are a bot designed to help schedule meetings ...\", \"user_prompt\": \"\\\" You have a meeting saved for March 24th. Ask the...\", \"user_role\": \"\\\"You are interacting with a meeting scheduling bot...\"}", "columns": ["task_id", "domain", "bot_prompt", "bot_role", "user_prompt", "user_role"], "columns_mapping": {"task_id": "task_id", "domain": "domain", "bot_prompt": "bot_prompt", "bot_role": "bot_role", "user_prompt": "user_prompt", "user_role": "user_role"}, "dataset_description": "MetaLWOz: A Dataset of Multi-Domain Dialogues for the Fast Adaptation of Conversation Models. We introduce the Meta-Learning Wizard of Oz (MetaLWOz) dialogue dataset for developing fast adaptation methods for conversation models. This data can be used to train task-oriented dialogue models, specifically to develop methods to quickly simulate user responses with a small amount of data. 
Such fast-adaptation models fall into the research areas of transfer learning and meta learning. The dataset consists of 37,884 crowdsourced dialogues recorded between two human users in a Wizard of Oz setup, in which one was instructed to behave like a bot, and the other a true human user. The users are assigned a task belonging to a particular domain, for example booking a reservation at a particular restaurant, and work together to complete the task. Our dataset spans 47 domains having 227 tasks total. Dialogues are a minimum of 10 turns long.\n", "dataset_name": "meta_woz"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "metooma": {"dataset_name": "metooma", "description": "The dataset consists of tweets belonging to #MeToo movement on Twitter, labelled into different categories.\nDue to Twitter's development policies, we only provide the tweet ID's and corresponding labels,\nother data can be fetched via Twitter API.\nThe data has been labelled by experts, with the majority taken into the account for deciding the final label.\nWe provide these labels for each of the tweets. 
The labels provided for each data point\nincludes -- Relevance, Directed Hate, Generalized Hate,\nSarcasm, Allegation, Justification, Refutation, Support, Oppose", "downloads": 329, "configs": {"default": {"config_name": "default", "sample_row": "{\"TweetId\": \"\\\"1052237153789390853\\\"\", \"Text_Only_Informative\": \"1\", \"Image_Only_Informative\": \"1\", \"Directed_Hate\": \"0\", \"Generalized_Hate\": \"0\", \"Sarcasm\": \"0\", \"Allegation\": \"0\", \"Justification\": \"1\", \"Refutation\": \"0\", \"Support\": \"1\", \"Oppose\": \"0\"}", "columns": ["TweetId", "Text_Only_Informative", "Image_Only_Informative", "Directed_Hate", "Generalized_Hate", "Sarcasm", "Allegation", "Justification", "Refutation", "Support", "Oppose"], "columns_mapping": {"TweetId": "TweetId", "Text_Only_Informative": "Text_Only_Informative", "Image_Only_Informative": "Image_Only_Informative", "Directed_Hate": "Directed_Hate", "Generalized_Hate": "Generalized_Hate", "Sarcasm": "Sarcasm", "Allegation": "Allegation", "Justification": "Justification", "Refutation": "Refutation", "Support": "Support", "Oppose": "Oppose"}, "dataset_description": "The dataset consists of tweets belonging to #MeToo movement on Twitter, labelled into different categories.\nDue to Twitter's development policies, we only provide the tweet ID's and corresponding labels,\nother data can be fetched via Twitter API.\nThe data has been labelled by experts, with the majority taken into the account for deciding the final label.\nWe provide these labels for each of the tweets. 
The labels provided for each data point\nincludes -- Relevance, Directed Hate, Generalized Hate,\nSarcasm, Allegation, Justification, Refutation, Support, Oppose\n", "dataset_name": "metooma"}}, "tags": ["task_categories:text-classification", "task_categories:text-retrieval", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "miam": {"dataset_name": "miam", "description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.", "downloads": 1070, "configs": {"dihana": {"config_name": "dihana", "sample_row": "{\"Speaker\": \"\\\"M\\\"\", \"Utterance\": \"\\\"Bienvenido al servicio de informaci\\\\u00f3n de tre...\", \"Dialogue_Act\": \"\\\"Apertura\\\"\", \"Dialogue_ID\": \"\\\"1\\\"\", \"File_ID\": \"\\\"B209_BB2a0\\\"\", \"Label\": \"1\", \"Idx\": \"0\"}", "columns": ["Speaker", "Utterance", "Dialogue_Act", "Dialogue_ID", "File_ID", "Label", "Idx"], "columns_mapping": {"Speaker": "Speaker", "Utterance": "Utterance", "Dialogue_Act": "Dialogue_Act", "Dialogue_ID": "Dialogue_ID", "File_ID": "File_ID", "Label": "Label", "Idx": "Idx"}, "dataset_description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. 
Some datasets additionally include\nemotion and/or sentimant labels.\n", "dataset_name": "miam"}, "ilisten": {"config_name": "ilisten", "sample_row": "{\"Speaker\": \"\\\"S_29_S1\\\"\", \"Utterance\": \"\\\"Ciao, il mio nome e' Valentina. Sono qui per dart...\", \"Dialogue_Act\": \"\\\"OPENING\\\"\", \"Dialogue_ID\": \"\\\"0\\\"\", \"Label\": \"8\", \"Idx\": \"0\"}", "columns": ["Speaker", "Utterance", "Dialogue_Act", "Dialogue_ID", "Label", "Idx"], "columns_mapping": {"Speaker": "Speaker", "Utterance": "Utterance", "Dialogue_Act": "Dialogue_Act", "Dialogue_ID": "Dialogue_ID", "Label": "Label", "Idx": "Idx"}, "dataset_description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.\n", "dataset_name": "miam"}, "loria": {"config_name": "loria", "sample_row": "{\"Speaker\": \"\\\"Lucas\\\"\", \"Utterance\": \"\\\"Alors!\\\"\", \"Dialogue_Act\": \"\\\"greet\\\"\", \"Dialogue_ID\": \"\\\"0\\\"\", \"File_ID\": \"\\\"Dial_20110615_105040\\\"\", \"Label\": \"5\", \"Idx\": \"0\"}", "columns": ["Speaker", "Utterance", "Dialogue_Act", "Dialogue_ID", "File_ID", "Label", "Idx"], "columns_mapping": {"Speaker": "Speaker", "Utterance": "Utterance", "Dialogue_Act": "Dialogue_Act", "Dialogue_ID": "Dialogue_ID", "File_ID": "File_ID", "Label": "Label", "Idx": "Idx"}, "dataset_description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. 
They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.\n", "dataset_name": "miam"}, "maptask": {"config_name": "maptask", "sample_row": "{\"Speaker\": \"\\\"g\\\"\", \"Utterance\": \"\\\"okay the start part is at the top left-hand corne...\", \"Dialogue_Act\": \"\\\"instruct\\\"\", \"Dialogue_ID\": \"\\\"0\\\"\", \"File_ID\": \"\\\"q7nc7\\\"\", \"Label\": \"5\", \"Idx\": \"0\"}", "columns": ["Speaker", "Utterance", "Dialogue_Act", "Dialogue_ID", "File_ID", "Label", "Idx"], "columns_mapping": {"Speaker": "Speaker", "Utterance": "Utterance", "Dialogue_Act": "Dialogue_Act", "Dialogue_ID": "Dialogue_ID", "File_ID": "File_ID", "Label": "Label", "Idx": "Idx"}, "dataset_description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.\n", "dataset_name": "miam"}, "vm2": {"config_name": "vm2", "sample_row": "{\"Utterance\": \"\\\"mein Name ist Keller $K $E Doppel-$L $E $R\\\"\", \"Dialogue_Act\": \"\\\"INTRODUCE\\\"\", \"Speaker\": \"\\\"A\\\"\", \"Dialogue_ID\": \"\\\"1\\\"\", \"Label\": \"19\", \"Idx\": \"0\"}", "columns": ["Utterance", "Dialogue_Act", "Speaker", "Dialogue_ID", "Label", "Idx"], "columns_mapping": {"Utterance": "Utterance", "Dialogue_Act": "Dialogue_Act", "Speaker": "Speaker", "Dialogue_ID": "Dialogue_ID", "Label": "Label", "Idx": "Idx"}, "dataset_description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. 
Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.\n", "dataset_name": "miam"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_categories:text-classification", "task_ids:dialogue-modeling", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:de", "language:en", "language:es", "language:fr", "language:it", "dialogue-act-classification"], "is_gated": false}, "mlsum": {"dataset_name": "mlsum", "description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.", "downloads": 2326, "configs": {"de": {"config_name": "de", "sample_row": "{\"text\": \"\\\"Transport im Viehwaggon, Fleischgeruch in der Luf...\", \"summary\": \"\\\"Transport im Viehwaggon, Fleischgeruch in der Luf...\", \"topic\": \"\\\"politik\\\"\", \"url\": \"\\\"https://www.sueddeutsche.de/politik/kz-auschwitz-...\", \"title\": \"\\\"So war Auschwitz: Erinnerungen einer Holocaust-\\\\u...\", \"date\": \"\\\"00/01/2010\\\"\"}", "columns": ["text", "summary", "topic", "url", "title", "date"], "columns_mapping": {"text": "text", "summary": "summary", "topic": "topic", "url": "url", "title": "title", "date": "date"}, 
"dataset_description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.\n", "dataset_name": "mlsum"}, "es": {"config_name": "es", "sample_row": "{\"text\": \"\\\"De momento, no podemos responder a la pregunta fr...\", \"summary\": \"\\\"Sofres no ofrece datos por ser festivo.- Telecinc...\", \"topic\": \"\\\"elpais actualidad\\\"\", \"url\": \"\\\"http://elpais.com/elpais/2010/01/01/actualidad/12...\", \"title\": \"\\\"\\\\u00bfQui\\\\u00e9n gan\\\\u00f3 en las campanadas?\\\"\", \"date\": \"\\\"01/01/2010\\\"\"}", "columns": ["text", "summary", "topic", "url", "title", "date"], "columns_mapping": {"text": "text", "summary": "summary", "topic": "topic", "url": "url", "title": "title", "date": "date"}, "dataset_description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.\n", "dataset_name": "mlsum"}, "fr": {"config_name": "fr", "sample_row": "{\"text\": 
\"\\\"Jean-Jacques Schuhl, Gilles Leroy, Christian Gail...\", \"summary\": \"\\\"Jean-Jacques Schuhl, Gilles Leroy, Christian Gail...\", \"topic\": \"\\\"livres\\\"\", \"url\": \"\\\"https://www.lemonde.fr/livres/article/2010/01/01/...\", \"title\": \"\\\"La rentr\\\\u00e9e litt\\\\u00e9raire promet un program...\", \"date\": \"\\\"01/01/2010\\\"\"}", "columns": ["text", "summary", "topic", "url", "title", "date"], "columns_mapping": {"text": "text", "summary": "summary", "topic": "topic", "url": "url", "title": "title", "date": "date"}, "dataset_description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.\n", "dataset_name": "mlsum"}, "ru": {"config_name": "ru", "sample_row": "{\"text\": \"\\\"\\\\u0421\\\\u043b\\\\u0430\\\\u0434\\\\u043e\\\\u0441\\\\u0442\\\\u0440\\\\...\", \"summary\": \"\\\"\\\\u0421\\\\u0442\\\\u0430\\\\u0440\\\\u0448\\\\u0438\\\\u0439 \\\\u043f...\", \"topic\": \"\\\"incident\\\"\", \"url\": \"\\\"https://www.mk.ru/incident/article/2010/01/05/409...\", \"title\": \"\\\"\\\\u041f\\\\u0435\\\\u0434\\\\u043e\\\\u0444\\\\u0438\\\\u043b \\\\u043f...\", \"date\": \"\\\"06/01/2010\\\"\"}", "columns": ["text", "summary", "topic", "url", "title", "date"], "columns_mapping": {"text": "text", "summary": "summary", "topic": "topic", "url": "url", "title": "title", "date": "date"}, "dataset_description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online 
newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.\n", "dataset_name": "mlsum"}, "tu": {"config_name": "tu", "sample_row": "{\"text\": \"\\\"Ara\\\\u00e7 sahipleri i\\\\u00e7in pahal\\\\u0131 benzine...\", \"summary\": \"\\\"Benzinin litresi 4 liraya yakla\\\\u015ft\\\\u0131. Asl...\", \"topic\": \"\\\"unknown\\\"\", \"url\": \"\\\"https://www.internethaber.com/aracinizda-yuzde-30...\", \"title\": \"\\\"Arac\\\\u0131n\\\\u0131zda y\\\\u00fczde 30 tarassuf edin\\\"...\", \"date\": \"\\\"00/01/2010\\\"\"}", "columns": ["text", "summary", "topic", "url", "title", "date"], "columns_mapping": {"text": "text", "summary": "summary", "topic": "topic", "url": "url", "title": "title", "date": "date"}, "dataset_description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.\n", "dataset_name": "mlsum"}}, "tags": ["task_categories:summarization", "task_categories:translation", "task_categories:text-classification", "task_ids:news-articles-summarization", 
"task_ids:multi-class-classification", "task_ids:multi-label-classification", "task_ids:topic-classification", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:extended|cnn_dailymail", "source_datasets:original", "language:de", "language:es", "language:fr", "language:ru", "language:tr"], "is_gated": false}, "moroco": {"dataset_name": "moroco", "description": "The MOROCO (Moldavian and Romanian Dialectal Corpus) dataset contains 33564 samples of text collected from the news domain.\nThe samples belong to one of the following six topics:\n - culture\n - finance\n - politics\n - science\n - sports\n - tech", "downloads": 298, "configs": {"moroco": {"config_name": "moroco", "sample_row": "{\"id\": \"\\\"48482\\\"\", \"category\": \"2\", \"sample\": \"\\\"\\\\u201c$NE$ cum am spus, nu este un sf\\\\u00e2r\\\\u015...\"}", "columns": ["id", "category", "sample"], "columns_mapping": {"id": "id", "category": "category", "sample": "sample"}, "dataset_description": "The MOROCO (Moldavian and Romanian Dialectal Corpus) dataset contains 33564 samples of text collected from the news domain.\nThe samples belong to one of the following six topics:\n - culture\n - finance\n - politics\n - science\n - sports\n - tech\n", "dataset_name": "moroco"}}, "tags": ["task_categories:text-classification", "task_ids:topic-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:ro"], "is_gated": false}, "movie_rationales": {"dataset_name": "movie_rationales", "description": "The movie rationale dataset contains human annotated rationales for movie\nreviews.", "downloads": 809, "configs": {"default": {"config_name": "default", "sample_row": "{\"review\": \"\\\"plot : two teen couples go to a church party , dr...\", \"label\": \"0\", \"evidences\": \"[\\\"mind - fuck movie\\\", \\\"the sad part is\\\", \\\"downshif...\"}", "columns": ["review", "label", "evidences"], "columns_mapping": {"review": "review", 
"label": "label", "evidences": "evidences"}, "dataset_description": "\nThe movie rationale dataset contains human annotated rationales for movie\nreviews.\n", "dataset_name": "movie_rationales"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "mrqa": {"dataset_name": "mrqa", "description": "The MRQA 2019 Shared Task focuses on generalization in question answering.\nAn effective question answering system should do more than merely\ninterpolate from the training set to answer test examples drawn\nfrom the same distribution: it should also be able to extrapolate\nto out-of-distribution examples \u2014 a significantly harder challenge.\n\nThe dataset is a collection of 18 existing QA dataset (carefully selected\nsubset of them) and converted to the same format (SQuAD format). Among\nthese 18 datasets, six datasets were made available for training,\nsix datasets were made available for development, and the final six\nfor testing. 
The dataset is released as part of the MRQA 2019 Shared Task.", "downloads": 1286, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"subset\": \"\\\"SQuAD\\\"\", \"context\": \"\\\"Architecturally, the school has a Catholic charac...\", \"context_tokens.tokens\": \"[\\\"Architecturally\\\", \\\",\\\", \\\"the\\\", \\\"school\\\", \\\"has\\\", \\\"...\", \"context_tokens.offsets\": \"[0, 15, 17, 21, 28, 32, 34, 43, 52, 54, 59, 63, 68...\", \"qid\": \"\\\"38cc2597b6624bd8af1e8ba7f693096f\\\"\", \"question\": \"\\\"To whom did the Virgin Mary allegedly appear in 1...\", \"question_tokens.tokens\": \"[\\\"To\\\", \\\"whom\\\", \\\"did\\\", \\\"the\\\", \\\"Virgin\\\", \\\"Mary\\\", \\\"al...\", \"question_tokens.offsets\": \"[0, 3, 8, 12, 16, 23, 28, 38, 45, 48, 53, 56, 64, ...\", \"detected_answers.text\": \"[\\\"Saint Bernadette Soubirous\\\"]\", \"detected_answers.char_spans\": \"[{\\\"start\\\": [515], \\\"end\\\": [540]}]\", \"detected_answers.token_spans\": \"[{\\\"start\\\": [102], \\\"end\\\": [104]}]\", \"answers\": \"[\\\"Saint Bernadette Soubirous\\\"]\"}", "columns": ["subset", "context", "context_tokens_tokens", "context_tokens_offsets", "qid", "question", "question_tokens_tokens", "question_tokens_offsets", "detected_answers_text", "detected_answers_char_spans", "detected_answers_token_spans", "answers"], "columns_mapping": {"subset": "subset", "context": "context", "context_tokens.tokens": "context_tokens_tokens", "context_tokens.offsets": "context_tokens_offsets", "qid": "qid", "question": "question", "question_tokens.tokens": "question_tokens_tokens", "question_tokens.offsets": "question_tokens_offsets", "detected_answers.text": "detected_answers_text", "detected_answers.char_spans": "detected_answers_char_spans", "detected_answers.token_spans": "detected_answers_token_spans", "answers": "answers"}, "dataset_description": "The MRQA 2019 Shared Task focuses on generalization in question answering.\nAn effective question answering 
system should do more than merely\ninterpolate from the training set to answer test examples drawn\nfrom the same distribution: it should also be able to extrapolate\nto out-of-distribution examples \u2014 a significantly harder challenge.\n\nThe dataset is a collection of 18 existing QA dataset (carefully selected\nsubset of them) and converted to the same format (SQuAD format). Among\nthese 18 datasets, six datasets were made available for training,\nsix datasets were made available for development, and the final six\nfor testing. The dataset is released as part of the MRQA 2019 Shared Task.\n", "dataset_name": "mrqa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:extended|drop", "source_datasets:extended|hotpot_qa", "source_datasets:extended|natural_questions", "source_datasets:extended|race", "source_datasets:extended|search_qa", "source_datasets:extended|squad", "source_datasets:extended|trivia_qa", "language:en"], "is_gated": false}, "msr_sqa": {"dataset_name": "msr_sqa", "description": "Recent work in semantic parsing for question answering has focused on long and complicated questions, many of which would seem unnatural if asked in a normal conversation between two humans. In an effort to explore a conversational QA setting, we present a more realistic task: answering sequences of simple but inter-related questions. We created SQA by asking crowdsourced workers to decompose 2,022 questions from WikiTableQuestions (WTQ), which contains highly-compositional questions about tables from Wikipedia. We had three workers decompose each WTQ question, resulting in a dataset of 6,066 sequences that contain 17,553 questions in total. 
Each question is also associated with answers in the form of cell locations in the tables.", "downloads": 476, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"nt-639\\\"\", \"annotator\": \"0\", \"position\": \"0\", \"question\": \"\\\"where are the players from?\\\"\", \"question_and_history\": \"[\\\"where are the players from?\\\"]\", \"table_file\": \"\\\"table_csv/203_149.csv\\\"\", \"table_header\": \"[\\\"Pick\\\", \\\"Player\\\", \\\"Team\\\", \\\"Position\\\", \\\"School\\\"]...\", \"table_data\": \"[[\\\"1\\\", \\\"Ben McDonald\\\", \\\"Baltimore Orioles\\\", \\\"RHP\\\",...\", \"answer_coordinates.row_index\": \"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...\", \"answer_coordinates.column_index\": \"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...\", \"answer_text\": \"[\\\"Louisiana State University\\\", \\\"Valley HS (Las Veg...\"}", "columns": ["id", "annotator", "position", "question", "question_and_history", "table_file", "table_header", "table_data", "answer_coordinates_row_index", "answer_coordinates_column_index", "answer_text"], "columns_mapping": {"id": "id", "annotator": "annotator", "position": "position", "question": "question", "question_and_history": "question_and_history", "table_file": "table_file", "table_header": "table_header", "table_data": "table_data", "answer_coordinates.row_index": "answer_coordinates_row_index", "answer_coordinates.column_index": "answer_coordinates_column_index", "answer_text": "answer_text"}, "dataset_description": "Recent work in semantic parsing for question answering has focused on long and complicated questions, many of which would seem unnatural if asked in a normal conversation between two humans. In an effort to explore a conversational QA setting, we present a more realistic task: answering sequences of simple but inter-related questions. 
We created SQA by asking crowdsourced workers to decompose 2,022 questions from WikiTableQuestions (WTQ), which contains highly-compositional questions about tables from Wikipedia. We had three workers decompose each WTQ question, resulting in a dataset of 6,066 sequences that contain 17,553 questions in total. Each question is also associated with answers in the form of cell locations in the tables.\n", "dataset_name": "msr_sqa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "msra_ner": {"dataset_name": "msra_ner", "description": "The Third International Chinese Language\nProcessing Bakeoff was held in Spring\n2006 to assess the state of the art in two\nimportant tasks: word segmentation and\nnamed entity recognition. Twenty-nine\ngroups submitted result sets in the two\ntasks across two tracks and a total of five\ncorpora. We found strong results in both\ntasks as well as continuing challenges.\n\nMSRA NER is one of the provided dataset.\nThere are three types of NE, PER (person),\nORG (organization) and LOC (location).\nThe dataset is in the BIO scheme.\n\nFor more details see https://faculty.washington.edu/levow/papers/sighan06.pdf", "downloads": 636, "configs": {"msra_ner": {"config_name": "msra_ner", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u5f53\\\", \\\"\\\\u5e0c\\\", \\\"\\\\u671b\\\", \\\"\\\\u5de5\\\", \\\"\\\\u7a0b\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The Third International Chinese Language\nProcessing Bakeoff was held in Spring\n2006 to assess the state of the art in two\nimportant tasks: word segmentation and\nnamed entity recognition. 
Twenty-nine\ngroups submitted result sets in the two\ntasks across two tracks and a total of five\ncorpora. We found strong results in both\ntasks as well as continuing challenges.\n\nMSRA NER is one of the provided dataset.\nThere are three types of NE, PER (person),\nORG (organization) and LOC (location).\nThe dataset is in the BIO scheme.\n\nFor more details see https://faculty.washington.edu/levow/papers/sighan06.pdf\n", "dataset_name": "msra_ner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:zh"], "is_gated": false}, "multi_news": {"dataset_name": "multi_news", "description": "Multi-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"|||||\".\n - summary: news summary.", "downloads": 4950, "configs": {"default": {"config_name": "default", "sample_row": "{\"document\": \"\\\"National Archives \\\\n \\\\n Yes, it\\\\u2019s that time ...\", \"summary\": \"\\\"\\\\u2013 The unemployment rate dropped to 8.2% last...\"}", "columns": ["document", "summary"], "columns_mapping": {"document": "document", "summary": "summary"}, "dataset_description": "\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"|||||\".\n - summary: news summary.\n", "dataset_name": "multi_news"}}, "tags": ["task_categories:summarization", "task_ids:news-articles-summarization", "annotations_creators:expert-generated", "multilinguality:monolingual", 
"source_datasets:original", "language:en"], "is_gated": false}, "multi_nli": {"dataset_name": "multi_nli", "description": "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a\ncrowd-sourced collection of 433k sentence pairs annotated with textual\nentailment information. The corpus is modeled on the SNLI corpus, but differs in\nthat covers a range of genres of spoken and written text, and supports a\ndistinctive cross-genre generalization evaluation. The corpus served as the\nbasis for the shared task of the RepEval 2017 Workshop at EMNLP in Copenhagen.", "downloads": 8551, "configs": {"default": {"config_name": "default", "sample_row": "{\"promptID\": \"31193\", \"pairID\": \"\\\"31193n\\\"\", \"premise\": \"\\\"Conceptually cream skimming has two basic dimensi...\", \"premise_binary_parse\": \"\\\"( ( Conceptually ( cream skimming ) ) ( ( has ( (...\", \"premise_parse\": \"\\\"(ROOT (S (NP (JJ Conceptually) (NN cream) (NN ski...\", \"hypothesis\": \"\\\"Product and geography are what make cream skimmin...\", \"hypothesis_binary_parse\": \"\\\"( ( ( Product and ) geography ) ( ( are ( what ( ...\", \"hypothesis_parse\": \"\\\"(ROOT (S (NP (NN Product) (CC and) (NN geography)...\", \"genre\": \"\\\"government\\\"\", \"label\": \"1\"}", "columns": ["promptID", "pairID", "premise", "premise_binary_parse", "premise_parse", "hypothesis", "hypothesis_binary_parse", "hypothesis_parse", "genre", "label"], "columns_mapping": {"promptID": "promptID", "pairID": "pairID", "premise": "premise", "premise_binary_parse": "premise_binary_parse", "premise_parse": "premise_parse", "hypothesis": "hypothesis", "hypothesis_binary_parse": "hypothesis_binary_parse", "hypothesis_parse": "hypothesis_parse", "genre": "genre", "label": "label"}, "dataset_description": "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a\ncrowd-sourced collection of 433k sentence pairs annotated with textual\nentailment information. 
The corpus is modeled on the SNLI corpus, but differs in\nthat covers a range of genres of spoken and written text, and supports a\ndistinctive cross-genre generalization evaluation. The corpus served as the\nbasis for the shared task of the RepEval 2017 Workshop at EMNLP in Copenhagen.\n", "dataset_name": "multi_nli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "task_ids:multi-input-text-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "multi_para_crawl": {"dataset_name": "multi_para_crawl", "description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.", "downloads": 848, "configs": {"cs-is": {"config_name": "cs-is", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.cs\": \"\\\"barva kv\\\\u011btina vinn\\\\u00fd, \\\\u0161e\\\\u0159\\\\u00e...\", \"translation.is\": \"\\\"bl\\\\u00f3m lit burgundy, lilac, bleikur, gr\\\\u00e6n...\"}", "columns": ["id", "translation_cs", "translation_is"], "columns_mapping": {"id": "id", "translation.cs": "translation_cs", "translation.is": "translation_is"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. 
Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.\n", "dataset_name": "multi_para_crawl"}, "ga-sk": {"config_name": "ga-sk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ga\": \"\\\"T\\\\u00e1 na deities go l\\\\u00e9ir ceangailte go bea...\", \"translation.sk\": \"\\\"V\\\\u0161etky bo\\\\u017estv\\\\u00e1 s\\\\u00fa s nimi neja...\"}", "columns": ["id", "translation_ga", "translation_sk"], "columns_mapping": {"id": "id", "translation.ga": "translation_ga", "translation.sk": "translation_sk"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. 
Please, acknowledge OPUS as well for this service.\n", "dataset_name": "multi_para_crawl"}, "lv-mt": {"config_name": "lv-mt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.lv\": \"\\\"Pirmais satur izaicin\\\\u0101jumu Kor\\\\u0101na, kas ...\", \"translation.mt\": \"\\\"L-ewwel jinkludi l-isfida ta 'l-Koran li hija l-K...\"}", "columns": ["id", "translation_lv", "translation_mt"], "columns_mapping": {"id": "id", "translation.lv": "translation_lv", "translation.mt": "translation_mt"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.\n", "dataset_name": "multi_para_crawl"}, "nb-ru": {"config_name": "nb-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.nb\": \"\\\"-gir beskyttelse mot handlingen av solstr\\\\u00e5li...\", \"translation.ru\": \"\\\"-\\\\u0434\\\\u0430\\\\u0435\\\\u0442 \\\\u0437\\\\u0430\\\\u0449\\\\u043...\"}", "columns": ["id", "translation_nb", "translation_ru"], "columns_mapping": {"id": "id", "translation.nb": "translation_nb", "translation.ru": "translation_ru"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. 
The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.\n", "dataset_name": "multi_para_crawl"}, "de-tl": {"config_name": "de-tl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Weil Polizei nicht Verst\\\\u00e4rkersysteme im Bere...\", \"translation.tl\": \"\\\"Dahil ang pulis hindi papayagan ang paglaki mga s...\"}", "columns": ["id", "translation_de", "translation_tl"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.tl": "translation_tl"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. 
Please, acknowledge OPUS as well for this service.\n", "dataset_name": "multi_para_crawl"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:bg", "language:ca", "language:cs", "language:da", "language:de", "language:el", "language:es", "language:et", "language:eu", "language:fi", "language:fr", "language:ga", "language:gl", "language:ha", "language:hr", "language:hu", "language:ig", "language:is", "language:it", "language:km", "language:lt", "language:lv", "language:mt", "language:my", "language:nb", "language:ne", "language:nl", "language:nn", "language:pl", "language:ps", "language:pt", "language:ro", "language:ru", "language:si", "language:sk", "language:sl", "language:so", "language:sv", "language:sw", "language:tl"], "is_gated": false}, "multi_x_science_sum": {"dataset_name": "multi_x_science_sum", "description": "Multi-XScience, a large-scale multi-document summarization dataset created from scientific articles. Multi-XScience introduces a challenging multi-document summarization task: writing the related-work section of a paper based on its abstract and the articles it references.", "downloads": 793, "configs": {"default": {"config_name": "default", "sample_row": "{\"aid\": \"\\\"math9912167\\\"\", \"mid\": \"\\\"1631980677\\\"\", \"abstract\": \"\\\"Author(s): Kuperberg, Greg; Thurston, Dylan P. 
| ...\", \"related_work\": \"\\\"Two other generalizations that can be considered ...\", \"ref_abstract.cite_N\": \"[\\\"@cite_16\\\", \\\"@cite_26\\\"]\", \"ref_abstract.mid\": \"[\\\"1481005306\\\", \\\"1641082372\\\"]\", \"ref_abstract.abstract\": \"[\\\"This note is a sequel to our earlier paper of th...\"}", "columns": ["aid", "mid", "abstract", "related_work", "ref_abstract_cite_N", "ref_abstract_mid", "ref_abstract_abstract"], "columns_mapping": {"aid": "aid", "mid": "mid", "abstract": "abstract", "related_work": "related_work", "ref_abstract.cite_N": "ref_abstract_cite_N", "ref_abstract.mid": "ref_abstract_mid", "ref_abstract.abstract": "ref_abstract_abstract"}, "dataset_description": "\nMulti-XScience, a large-scale multi-document summarization dataset created from scientific articles. Multi-XScience introduces a challenging multi-document summarization task: writing the related-work section of a paper based on its abstract and the articles it references.\n", "dataset_name": "multi_x_science_sum"}}, "tags": ["task_categories:summarization", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "paper-abstract-generation"], "is_gated": false}, "multidoc2dial": {"dataset_name": "multidoc2dial", "description": "MultiDoc2Dial is a new task and dataset on modeling goal-oriented dialogues grounded in multiple documents. Most previous works treat document-grounded dialogue modeling as a machine reading comprehension task based on a single given document or passage. 
We aim to address more realistic scenarios where a goal-oriented information-seeking conversation involves multiple topics, and hence is grounded on different documents.", "downloads": 1007, "configs": {"dialogue_domain": {"config_name": "dialogue_domain", "sample_row": "{\"dial_id\": \"\\\"8df07b7a98990db27c395cb1f68a962e\\\"\", \"domain\": \"\\\"dmv\\\"\", \"turns\": \"[{\\\"da\\\": \\\"query_condition\\\", \\\"references\\\": [{\\\"label\\\"...\"}", "columns": ["dial_id", "domain", "turns"], "columns_mapping": {"dial_id": "dial_id", "domain": "domain", "turns": "turns"}, "dataset_description": "MultiDoc2Dial is a new task and dataset on modeling goal-oriented dialogues grounded in multiple documents. Most previous works treat document-grounded dialogue modeling as a machine reading comprehension task based on a single given document or passage. We aim to address more realistic scenarios where a goal-oriented information-seeking conversation involves multiple topics, and hence is grounded on different documents.\n", "dataset_name": "multidoc2dial"}, "document_domain": {"config_name": "document_domain", "sample_row": "{\"domain\": \"\\\"ssa\\\"\", \"doc_id\": \"\\\"Benefits Planner: Survivors | Planning For Your S...\", \"title\": \"\\\"Benefits Planner: Survivors | Planning For Your S...\", \"doc_text\": \"\\\"\\\\n\\\\nBenefits Planner: Survivors | Planning For Yo...\", \"spans\": \"[{\\\"id_sp\\\": \\\"1\\\", \\\"tag\\\": \\\"h2\\\", \\\"start_sp\\\": 0, \\\"end_s...\", \"doc_html_ts\": \"\\\"

\\\\nSubject: Alt....\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_comp.graphics": {"config_name": "bydate_comp.graphics", "sample_row": "{\"text\": \"\\\"From: lipman@oasys.dt.navy.mil (Robert Lipman)\\\\nS...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_comp.os.ms-windows.misc": {"config_name": "bydate_comp.os.ms-windows.misc", "sample_row": "{\"text\": \"\\\"From: lipman@oasys.dt.navy.mil (Robert Lipman)\\\\nS...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. 
The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_comp.sys.ibm.pc.hardware": {"config_name": "bydate_comp.sys.ibm.pc.hardware", "sample_row": "{\"text\": \"\\\"From: bobmon@cs.indiana.edu (Bob Montante)\\\\nSubje...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_comp.sys.mac.hardware": {"config_name": "bydate_comp.sys.mac.hardware", "sample_row": "{\"text\": \"\\\"Subject: ** Need Advice ** (about Tech Works etc....\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. 
The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_comp.windows.x": {"config_name": "bydate_comp.windows.x", "sample_row": "{\"text\": \"\\\"From: chongo@toad.com (Landon C. Noll)\\\\nSubject: ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_misc.forsale": {"config_name": "bydate_misc.forsale", "sample_row": "{\"text\": \"\\\"From: kedz@bigwpi.WPI.EDU (John Kedziora)\\\\nSubjec...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. 
The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_rec.autos": {"config_name": "bydate_rec.autos", "sample_row": "{\"text\": \"\\\"From: dennisk@cs.uoregon.edu (Dennis Kennedy)\\\\nSu...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_rec.motorcycles": {"config_name": "bydate_rec.motorcycles", "sample_row": "{\"text\": \"\\\"From: ivan@erich.triumf.ca (Ivan D. Reid)\\\\nSubjec...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. 
The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_rec.sport.baseball": {"config_name": "bydate_rec.sport.baseball", "sample_row": "{\"text\": \"\\\"From: admiral@jhunix.hcf.jhu.edu (Steve C Liu)\\\\nS...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_rec.sport.hockey": {"config_name": "bydate_rec.sport.hockey", "sample_row": "{\"text\": \"\\\"From: ayari@judikael.loria.fr (Ayari Iskander)\\\\nS...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. 
The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_sci.crypt": {"config_name": "bydate_sci.crypt", "sample_row": "{\"text\": \"\\\"From: Marc VanHeyningen ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_sci.electronics": {"config_name": "bydate_sci.electronics", "sample_row": "{\"text\": \"\\\"From: keith@radio.nl.nuwc.navy.mil\\\\nSubject: Tekt...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. 
The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_sci.med": {"config_name": "bydate_sci.med", "sample_row": "{\"text\": \"\\\"From: bed@intacc.uucp (Deb Waddington)\\\\nSubject: ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_sci.space": {"config_name": "bydate_sci.space", "sample_row": "{\"text\": \"\\\"From: et@teal.csn.org (Eric H. Taylor)\\\\nSubject: ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. 
The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_soc.religion.christian": {"config_name": "bydate_soc.religion.christian", "sample_row": "{\"text\": \"\\\"From: jenk@microsoft.com (Jen Kilmer)\\\\nSubject: R...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_talk.politics.guns": {"config_name": "bydate_talk.politics.guns", "sample_row": "{\"text\": \"\\\"From: manes@magpie.linknet.com (Steve Manes)\\\\nSub...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. 
The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_talk.politics.mideast": {"config_name": "bydate_talk.politics.mideast", "sample_row": "{\"text\": \"\\\"From: sera@zuma.UUCP (Serdar Argic)\\\\nSubject: The...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_talk.politics.misc": {"config_name": "bydate_talk.politics.misc", "sample_row": "{\"text\": \"\\\"From: mpye@vmsb.is.csupomona.edu\\\\nSubject: Re: Me...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. 
The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_talk.religion.misc": {"config_name": "bydate_talk.religion.misc", "sample_row": "{\"text\": \"\\\"X-Mailer: TMail version 1.17R\\\\nFrom: \\\\\\\"D. C. Sess...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "newsph": {"dataset_name": "newsph", "description": "Large-scale dataset of Filipino news articles. Sourced for the NewsPH-NLI Project (Cruz et al., 2020).", "downloads": 290, "configs": {"newsph": {"config_name": "newsph", "sample_row": "{\"text\": \"\\\"= Task force tutugisin ang suspek sa pagpatay ng ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "Large-scale dataset of Filipino news articles. 
Sourced for the NewsPH-NLI Project (Cruz et al., 2020).\n", "dataset_name": "newsph"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:fil", "language:tl"], "is_gated": false}, "newsph_nli": {"dataset_name": "newsph_nli", "description": "First benchmark dataset for sentence entailment in the low-resource Filipino language.\nConstructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs,\nin 70-15-15 split for training, validation, and testing.", "downloads": 301, "configs": {"default": {"config_name": "default", "sample_row": "{\"premise\": \"\\\"\\\\\\\"Hindi ko ugali ang mamulitika; mas gusto kong t...\", \"hypothesis\": \"\\\"Ito ang dineklara ni Atty. Romulo Macalintal, abo...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "First benchmark dataset for sentence entailment in the low-resource Filipino language.\nConstructed through exploting the structure of news articles. 
Contains 600,000 premise-hypothesis pairs,\nin 70-15-15 split for training, validation, and testing.\n", "dataset_name": "newsph_nli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:tl"], "is_gated": false}, "newspop": {"dataset_name": "newspop", "description": "This is a large data set of news items and their respective social feedback on multiple platforms: Facebook, Google+ and LinkedIn.\nThe collected data relates to a period of 8 months, between November 2015 and July 2016, accounting for about 100,000 news items on four different topics: economy, microsoft, obama and palestine.\nThis data set is tailored for evaluative comparisons in predictive analytics tasks, although allowing for tasks in other research areas such as topic detection and tracking, sentiment analysis in short text, first story detection or news recommendation.", "downloads": 373, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"99248\", \"title\": \"\\\"Obama Lays Wreath at Arlington National Cemetery\\\"...\", \"headline\": \"\\\"Obama Lays Wreath at Arlington National Cemetery....\", \"source\": \"\\\"USA TODAY\\\"\", \"topic\": \"\\\"obama\\\"\", \"publish_date\": \"\\\"2002-04-02 00:00:00\\\"\", \"facebook\": \"-1\", \"google_plus\": \"-1\", \"linked_in\": \"-1\"}", "columns": ["id", "title", "headline", "source", "topic", "publish_date", "facebook", "google_plus", "linked_in"], "columns_mapping": {"id": "id", "title": "title", "headline": "headline", "source": "source", "topic": "topic", "publish_date": "publish_date", "facebook": "facebook", "google_plus": "google_plus", "linked_in": "linked_in"}, "dataset_description": "\nThis is a large data set of news items and their respective social feedback on multiple platforms: Facebook, Google+ and LinkedIn.\nThe collected data relates to a period of 8 months, 
between November 2015 and July 2016, accounting for about 100,000 news items on four different topics: economy, microsoft, obama and palestine.\nThis data set is tailored for evaluative comparisons in predictive analytics tasks, although allowing for tasks in other research areas such as topic detection and tracking, sentiment analysis in short text, first story detection or news recommendation.\n", "dataset_name": "newspop"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "social-media-shares-prediction"], "is_gated": false}, "nkjp-ner": {"dataset_name": "nkjp-ner", "description": "The NKJP-NER is based on a human-annotated part of National Corpus of Polish (NKJP). We extracted sentences with named entities of exactly one type. The task is to predict the type of the named entity.", "downloads": 288, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence\": \"\\\"- Widzi pani , a Blokowa wzi\\\\u0119\\\\u0142a i si\\\\u0...\", \"target\": \"1\"}", "columns": ["sentence", "target"], "columns_mapping": {"sentence": "sentence", "target": "target"}, "dataset_description": "The NKJP-NER is based on a human-annotated part of National Corpus of Polish (NKJP). We extracted sentences with named entities of exactly one type. 
The task is to predict the type of the named entity.\n", "dataset_name": "nkjp-ner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl"], "is_gated": false}, "nli_tr": {"dataset_name": "nli_tr", "description": "\\\r\nThe Natural Language Inference in Turkish (NLI-TR) is a set of two large scale datasets that were obtained by translating the foundational NLI corpora (SNLI and MNLI) using Amazon Translate.", "downloads": 854, "configs": {"snli_tr": {"config_name": "snli_tr", "sample_row": "{\"idx\": \"0\", \"premise\": \"\\\"Attaki bir ki\\\\u015fi, bozuk bir u\\\\u00e7a\\\\u011f\\\\u0...\", \"hypothesis\": \"\\\"Bir ki\\\\u015fi at\\\\u0131n\\\\u0131 yar\\\\u0131\\\\u015fma i...\", \"label\": \"1\"}", "columns": ["idx", "premise", "hypothesis", "label"], "columns_mapping": {"idx": "idx", "premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The Natural Language Inference in Turkish (NLI-TR) is a set of two large scale datasets that were obtained by translating the foundational NLI corpora (SNLI and MNLI) using Amazon Translate.\n", "dataset_name": "nli_tr"}, "multinli_tr": {"config_name": "multinli_tr", "sample_row": "{\"idx\": \"0\", \"premise\": \"\\\"Kavramsal olarak krem kayma\\\\u011f\\\\u0131n\\\\u0131n i...\", \"hypothesis\": \"\\\"\\\\u00dcr\\\\u00fcn ve co\\\\u011frafya krem kayma\\\\u011f\\\\...\", \"label\": \"1\"}", "columns": ["idx", "premise", "hypothesis", "label"], "columns_mapping": {"idx": "idx", "premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The Natural Language Inference in Turkish (NLI-TR) is a set of two large scale datasets that were obtained by translating the foundational NLI corpora (SNLI and MNLI) using Amazon Translate.\n", "dataset_name": "nli_tr"}}, "tags": ["task_categories:text-classification", 
"task_ids:natural-language-inference", "task_ids:semantic-similarity-scoring", "task_ids:text-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|snli", "source_datasets:extended|multi_nli", "language:tr"], "is_gated": false}, "nlu_evaluation_data": {"dataset_name": "nlu_evaluation_data", "description": "Raw part of NLU Evaluation Data. It contains 25 715 non-empty examples (original dataset has 25716 examples) from 68 unique intents belonging to 18 scenarios.", "downloads": 418, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"wake me up at five am this week\\\"\", \"scenario\": \"\\\"alarm\\\"\", \"label\": \"2\"}", "columns": ["text", "scenario", "label"], "columns_mapping": {"text": "text", "scenario": "scenario", "label": "label"}, "dataset_description": "Raw part of NLU Evaluation Data. It contains 25 715 non-empty examples (original dataset has 25716 examples) from 68 unique intents belonging to 18 scenarios.\n", "dataset_name": "nlu_evaluation_data"}}, "tags": ["task_categories:text-classification", "task_ids:intent-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "nq_open": {"dataset_name": "nq_open", "description": "The NQ-Open task, introduced by Lee et.al. 
2019,\nis an open domain question answering benchmark that is derived from Natural Questions.\nThe goal is to predict an English answer string for an input English question.\nAll questions can be answered using the contents of English Wikipedia.", "downloads": 16377, "configs": {"nq_open": {"config_name": "nq_open", "sample_row": "{\"question\": \"\\\"where did they film hot tub time machine\\\"\", \"answer\": \"[\\\"Fernie Alpine Resort\\\"]\"}", "columns": ["question", "answer"], "columns_mapping": {"question": "question", "answer": "answer"}, "dataset_description": "The NQ-Open task, introduced by Lee et.al. 2019,\nis an open domain question answering benchmark that is derived from Natural Questions.\nThe goal is to predict an English answer string for an input English question.\nAll questions can be answered using the contents of English Wikipedia.\n", "dataset_name": "nq_open"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|natural_questions", "language:en"], "is_gated": false}, "nsmc": {"dataset_name": "nsmc", "description": "This is a movie review dataset in the Korean language. Reviews were scraped from Naver movies. The dataset construction is based on the method noted in Large movie review dataset from Maas et al., 2011.", "downloads": 2815, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"9976970\\\"\", \"document\": \"\\\"\\\\uc544 \\\\ub354\\\\ube59.. \\\\uc9c4\\\\uc9dc \\\\uc9dc\\\\uc99d\\\\u...\", \"label\": \"0\"}", "columns": ["id", "document", "label"], "columns_mapping": {"id": "id", "document": "document", "label": "label"}, "dataset_description": "This is a movie review dataset in the Korean language. Reviews were scraped from Naver movies. 
The dataset construction is based on the method noted in Large movie review dataset from Maas et al., 2011.\n", "dataset_name": "nsmc"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ko"], "is_gated": false}, "numer_sense": {"dataset_name": "numer_sense", "description": "NumerSense is a new numerical commonsense reasoning probing task, with a diagnostic dataset consisting of 3,145 masked-word-prediction probes.\n\nWe propose to study whether numerical commonsense knowledge can be induced from pre-trained language models like BERT, and to what extent this access to knowledge robust against adversarial examples is. We hope this will be beneficial for tasks such as knowledge base completion and open-domain question answering.", "downloads": 449, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence\": \"\\\"Some plant varieties can grow up to feet t...\", \"target\": \"\\\"nine\\\"\"}", "columns": ["sentence", "target"], "columns_mapping": {"sentence": "sentence", "target": "target"}, "dataset_description": "NumerSense is a new numerical commonsense reasoning probing task, with a diagnostic dataset consisting of 3,145 masked-word-prediction probes.\n\nWe propose to study whether numerical commonsense knowledge can be induced from pre-trained language models like BERT, and to what extent this access to knowledge robust against adversarial examples is. 
We hope this will be beneficial for tasks such as knowledge base completion and open-domain question answering.\n", "dataset_name": "numer_sense"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:slot-filling", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|other", "language:en"], "is_gated": false}, "numeric_fused_head": {"dataset_name": "numeric_fused_head", "description": "Fused Head constructions are noun phrases in which the head noun is missing and is said to be \"fused\" with its dependent modifier. This missing information is implicit and is important for sentence understanding.The missing heads are easily filled in by humans, but pose a challenge for computational models.\n\nFor example, in the sentence: \"I bought 5 apples but got only 4.\", 4 is a Fused-Head, and the missing head is apples, which appear earlier in the sentence.\n\nThis is a crowd-sourced dataset of 10k numerical fused head examples (1M tokens).", "downloads": 431, "configs": {"identification": {"config_name": "identification", "sample_row": "{\"tokens\": \"[\\\"There\\\", \\\"is\\\", \\\"3500\\\", \\\"...\\\", \\\"and\\\", \\\"since\\\", \\\"yo...\", \"start_index\": \"2\", \"end_index\": \"3\", \"label\": \"1\"}", "columns": ["tokens", "start_index", "end_index", "label"], "columns_mapping": {"tokens": "tokens", "start_index": "start_index", "end_index": "end_index", "label": "label"}, "dataset_description": "Fused Head constructions are noun phrases in which the head noun is missing and is said to be \"fused\" with its dependent modifier. 
This missing information is implicit and is important for sentence understanding.The missing heads are easily filled in by humans, but pose a challenge for computational models.\n\nFor example, in the sentence: \"I bought 5 apples but got only 4.\", 4 is a Fused-Head, and the missing head is apples, which appear earlier in the sentence.\n\nThis is a crowd-sourced dataset of 10k numerical fused head examples (1M tokens).\n", "dataset_name": "numeric_fused_head"}, "resolution": {"config_name": "resolution", "sample_row": "{\"tokens\": \"[\\\"What\\\", \\\"the\\\", \\\"fuck\\\", \\\"are\\\", \\\"you\\\", \\\"doing\\\", \\\"?\\\"...\", \"line_indices\": \"[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3...\", \"head\": \"[\\\"AGE\\\"]\", \"speakers\": \"[\\\"Stuart Alan Jones\\\", \\\"Stuart Alan Jones\\\", \\\"Stuart...\", \"anchors_indices\": \"[12]\"}", "columns": ["tokens", "line_indices", "head", "speakers", "anchors_indices"], "columns_mapping": {"tokens": "tokens", "line_indices": "line_indices", "head": "head", "speakers": "speakers", "anchors_indices": "anchors_indices"}, "dataset_description": "Fused Head constructions are noun phrases in which the head noun is missing and is said to be \"fused\" with its dependent modifier. 
This missing information is implicit and is important for sentence understanding.The missing heads are easily filled in by humans, but pose a challenge for computational models.\n\nFor example, in the sentence: \"I bought 5 apples but got only 4.\", 4 is a Fused-Head, and the missing head is apples, which appear earlier in the sentence.\n\nThis is a crowd-sourced dataset of 10k numerical fused head examples (1M tokens).\n", "dataset_name": "numeric_fused_head"}}, "tags": ["task_categories:token-classification", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "fused-head-identification"], "is_gated": false}, "oclar": {"dataset_name": "oclar", "description": "The researchers of OCLAR Marwan et al. (2019), they gathered Arabic costumer reviews from Google reviewsa and Zomato\nwebsite (https://www.zomato.com/lebanon) on wide scope of domain, including restaurants, hotels, hospitals, local shops,\netc.The corpus finally contains 3916 reviews in 5-rating scale. For this research purpose, the positive class considers\nrating stars from 5 to 3 of 3465 reviews, and the negative class is represented from values of 1 and 2 of about\n451 texts.", "downloads": 291, "configs": {"default": {"config_name": "default", "sample_row": "{\"pagename\": \"\\\"Beirut Golden Plaza Suites\\\"\", \"review\": \"\\\"\\\\u0647\\\\u0630\\\\u0627 \\\\u0627\\\\u0644\\\\u0641\\\\u0646\\\\u062f...\", \"rating\": \"2\"}", "columns": ["pagename", "review", "rating"], "columns_mapping": {"pagename": "pagename", "review": "review", "rating": "rating"}, "dataset_description": "The researchers of OCLAR Marwan et al. 
(2019), they gathered Arabic costumer reviews from Google reviewsa and Zomato\nwebsite (https://www.zomato.com/lebanon) on wide scope of domain, including restaurants, hotels, hospitals, local shops,\netc.The corpus finally contains 3916 reviews in 5-rating scale. For this research purpose, the positive class considers\nrating stars from 5 to 3 of 3465 reviews, and the negative class is represented from values of 1 and 2 of about\n451 texts.\n", "dataset_name": "oclar"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:sentiment-classification", "task_ids:sentiment-scoring", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "offenseval2020_tr": {"dataset_name": "offenseval2020_tr", "description": "OffensEval-TR 2020 is a Turkish offensive language corpus. The corpus consist of randomly sampled tweets and annotated in a similar way to OffensEval and GermEval.", "downloads": 302, "configs": {"offenseval2020-turkish": {"config_name": "offenseval2020-turkish", "sample_row": "{\"id\": \"20948\", \"tweet\": \"\\\"@USER en g\\\\u00fczel uyuyan insan \\\\u00f6d\\\\u00fcl\\\\u...\", \"subtask_a\": \"0\"}", "columns": ["id", "tweet", "subtask_a"], "columns_mapping": {"id": "id", "tweet": "tweet", "subtask_a": "subtask_a"}, "dataset_description": "OffensEval-TR 2020 is a Turkish offensive language corpus. 
The corpus consist of randomly sampled tweets and annotated in a similar way to OffensEval and GermEval.\n", "dataset_name": "offenseval2020_tr"}}, "tags": ["task_categories:text-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:tr", "offensive-language-classification"], "is_gated": false}, "ofis_publik": {"dataset_name": "ofis_publik", "description": "Texts from the Ofis Publik ar Brezhoneg (Breton Language Board) provided by Francis Tyers\n2 languages, total number of files: 278\ntotal number of tokens: 2.12M\ntotal number of sentence fragments: 0.13M", "downloads": 288, "configs": {"br-fr": {"config_name": "br-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"Ema\\\\u00f1 Rannvro Breizh hag he c\\\\u2019hevelerien...\", \"translation.fr\": \"\\\"La R\\\\u00e9gion Bretagne et ses partenaires se pr\\\\...\"}", "columns": ["id", "translation_br", "translation_fr"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.fr": "translation_fr"}, "dataset_description": "Texts from the Ofis Publik ar Brezhoneg (Breton Language Board) provided by Francis Tyers\n2 languages, total number of files: 278\ntotal number of tokens: 2.12M\ntotal number of sentence fragments: 0.13M\n", "dataset_name": "ofis_publik"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:br", "language:fr"], "is_gated": false}, "onestop_qa": {"dataset_name": "onestop_qa", "description": "OneStopQA is a multiple choice reading comprehension dataset annotated according to the STARC (Structured Annotations for Reading Comprehension) scheme. The reading materials are Guardian articles taken from the [OneStopEnglish corpus](https://github.com/nishkalavallabhi/OneStopEnglishCorpus). Each article comes in three difficulty levels, Elementary, Intermediate and Advanced. 
Each paragraph is annotated with three multiple choice reading comprehension questions. The reading comprehension questions can be answered based on any of the three paragraph levels.", "downloads": 295, "configs": {"default": {"config_name": "default", "sample_row": "{\"title\": \"\\\"101-Year-Old Bottle Message\\\"\", \"paragraph\": \"\\\"Angela Erdmann never knew her grandfather. He die...\", \"level\": \"0\", \"question\": \"\\\"Who threw the bottle into the Baltic Sea?\\\"\", \"paragraph_index\": \"0\", \"answers\": \"[\\\"Angela Erdmann\\\\u2019s grandfather\\\", \\\"Angela Erdm...\", \"a_span\": \"[0, 45]\", \"d_span\": \"[63, 63]\"}", "columns": ["title", "paragraph", "level", "question", "paragraph_index", "answers", "a_span", "d_span"], "columns_mapping": {"title": "title", "paragraph": "paragraph", "level": "level", "question": "question", "paragraph_index": "paragraph_index", "answers": "answers", "a_span": "a_span", "d_span": "d_span"}, "dataset_description": "OneStopQA is a multiple choice reading comprehension dataset annotated according to the STARC (Structured Annotations for Reading Comprehension) scheme. The reading materials are Guardian articles taken from the [OneStopEnglish corpus](https://github.com/nishkalavallabhi/OneStopEnglishCorpus). Each article comes in three difficulty levels, Elementary, Intermediate and Advanced. Each paragraph is annotated with three multiple choice reading comprehension questions. 
The reading comprehension questions can be answered based on any of the three paragraph levels.\n", "dataset_name": "onestop_qa"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "source_datasets:extended|onestop_english", "language:en"], "is_gated": false}, "open_subtitles": {"dataset_name": "open_subtitles", "description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G", "downloads": 3487, "configs": {"bs-eo": {"config_name": "bs-eo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"meta.year\": \"1973\", \"meta.imdbId\": \"70215\", \"meta.subtitleId.bs\": \"6080330\", \"meta.subtitleId.eo\": \"4010963\", \"meta.sentenceIds.bs\": \"[1]\", \"meta.sentenceIds.eo\": \"[2]\", \"translation.bs\": \"\\\"Gospodine Borgard...\\\"\", \"translation.eo\": \"\\\"Alvenis la respondo por vi el Nov-Orleano.\\\"\"}", "columns": ["id", "meta_year", "meta_imdbId", "meta_subtitleId_bs", "meta_subtitleId_eo", "meta_sentenceIds_bs", "meta_sentenceIds_eo", "translation_bs", "translation_eo"], "columns_mapping": {"id": "id", "meta.year": "meta_year", "meta.imdbId": "meta_imdbId", "meta.subtitleId.bs": "meta_subtitleId_bs", "meta.subtitleId.eo": "meta_subtitleId_eo", "meta.sentenceIds.bs": "meta_sentenceIds_bs", "meta.sentenceIds.eo": "meta_sentenceIds_eo", "translation.bs": "translation_bs", "translation.eo": "translation_eo"}, "dataset_description": "This is a new collection of 
translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G\n", "dataset_name": "open_subtitles"}, "fr-hy": {"config_name": "fr-hy", "sample_row": "{\"id\": \"\\\"0\\\"\", \"meta.year\": \"1971\", \"meta.imdbId\": \"67372\", \"meta.subtitleId.fr\": \"3693493\", \"meta.subtitleId.hy\": \"6711716\", \"meta.sentenceIds.fr\": \"[1]\", \"meta.sentenceIds.hy\": \"[1]\", \"translation.fr\": \"\\\"A quand rendez-vous prochain ?\\\"\", \"translation.hy\": \"\\\"\\\\u054e\\\\u0565\\\\u0570\\\\u0568 \\\\u0566\\\\u0561\\\\u0566\\\\u0580...\"}", "columns": ["id", "meta_year", "meta_imdbId", "meta_subtitleId_fr", "meta_subtitleId_hy", "meta_sentenceIds_fr", "meta_sentenceIds_hy", "translation_fr", "translation_hy"], "columns_mapping": {"id": "id", "meta.year": "meta_year", "meta.imdbId": "meta_imdbId", "meta.subtitleId.fr": "meta_subtitleId_fr", "meta.subtitleId.hy": "meta_subtitleId_hy", "meta.sentenceIds.fr": "meta_sentenceIds_fr", "meta.sentenceIds.hy": "meta_sentenceIds_hy", "translation.fr": "translation_fr", "translation.hy": "translation_hy"}, "dataset_description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal 
number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G\n", "dataset_name": "open_subtitles"}, "da-ru": {"config_name": "da-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"meta.year\": \"1927\", \"meta.imdbId\": \"17136\", \"meta.subtitleId.da\": \"61728\", \"meta.subtitleId.ru\": \"42690\", \"meta.sentenceIds.da\": \"[1, 2]\", \"meta.sentenceIds.ru\": \"[1]\", \"translation.da\": \"\\\"Hver epoke skaber sin efterf\\\\u00f8lger - Jules Mi...\", \"translation.ru\": \"\\\"\\\\u041a\\\\u0430\\\\u0436\\\\u0434\\\\u0430\\\\u044f \\\\u044d\\\\u043f...\"}", "columns": ["id", "meta_year", "meta_imdbId", "meta_subtitleId_da", "meta_subtitleId_ru", "meta_sentenceIds_da", "meta_sentenceIds_ru", "translation_da", "translation_ru"], "columns_mapping": {"id": "id", "meta.year": "meta_year", "meta.imdbId": "meta_imdbId", "meta.subtitleId.da": "meta_subtitleId_da", "meta.subtitleId.ru": "meta_subtitleId_ru", "meta.sentenceIds.da": "meta_sentenceIds_da", "meta.sentenceIds.ru": "meta_sentenceIds_ru", "translation.da": "translation_da", "translation.ru": "translation_ru"}, "dataset_description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G\n", "dataset_name": "open_subtitles"}, "en-hi": {"config_name": "en-hi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"meta.year\": \"1948\", \"meta.imdbId\": \"40522\", \"meta.subtitleId.en\": \"4180294\", \"meta.subtitleId.hi\": \"4239106\", \"meta.sentenceIds.en\": \"[1]\", \"meta.sentenceIds.hi\": \"[1]\", \"translation.en\": \"\\\"THE BICYCLE 
THIEF\\\"\", \"translation.hi\": \"\\\"\\\\u0938\\\\u093e\\\\u0907\\\\u0915\\\\u093f\\\\u0932 \\\\u091a\\\\u094b...\"}", "columns": ["id", "meta_year", "meta_imdbId", "meta_subtitleId_en", "meta_subtitleId_hi", "meta_sentenceIds_en", "meta_sentenceIds_hi", "translation_en", "translation_hi"], "columns_mapping": {"id": "id", "meta.year": "meta_year", "meta.imdbId": "meta_imdbId", "meta.subtitleId.en": "meta_subtitleId_en", "meta.subtitleId.hi": "meta_subtitleId_hi", "meta.sentenceIds.en": "meta_sentenceIds_en", "meta.sentenceIds.hi": "meta_sentenceIds_hi", "translation.en": "translation_en", "translation.hi": "translation_hi"}, "dataset_description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G\n", "dataset_name": "open_subtitles"}, "bn-is": {"config_name": "bn-is", "sample_row": "{\"id\": \"\\\"0\\\"\", \"meta.year\": \"1981\", \"meta.imdbId\": \"82971\", \"meta.subtitleId.bn\": \"6443778\", \"meta.subtitleId.is\": \"4634729\", \"meta.sentenceIds.bn\": \"[2]\", \"meta.sentenceIds.is\": \"[2]\", \"translation.bn\": \"\\\"\\\\u09b9\\\\u09ac\\\\u09bf\\\\u099f\\\\u09cb\\\\u09b8 \\\\u0995\\\\u09be...\", \"translation.is\": \"\\\"Eitri\\\\u00f0 er enn \\\\u00f6flugt.\\\"\"}", "columns": ["id", "meta_year", "meta_imdbId", "meta_subtitleId_bn", "meta_subtitleId_is", "meta_sentenceIds_bn", "meta_sentenceIds_is", "translation_bn", "translation_is"], "columns_mapping": {"id": "id", "meta.year": "meta_year", "meta.imdbId": "meta_imdbId", "meta.subtitleId.bn": "meta_subtitleId_bn", 
"meta.subtitleId.is": "meta_subtitleId_is", "meta.sentenceIds.bn": "meta_sentenceIds_bn", "meta.sentenceIds.is": "meta_sentenceIds_is", "translation.bn": "translation_bn", "translation.is": "translation_is"}, "dataset_description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G\n", "dataset_name": "open_subtitles"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:af", "language:ar", "language:bg", "language:bn", "language:br", "language:bs", "language:ca", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:fi", "language:fr", "language:gl", "language:he", "language:hi", "language:hr", "language:hu", "language:hy", "language:id", "language:is", "language:it", "language:ja", "language:ka", "language:kk", "language:ko", "language:lt", "language:lv", "language:mk", "language:ml", "language:ms", "language:nl", "language:no", "language:pl", "language:pt", "language:ro", "language:ru", "language:si", "language:sk", "language:sl", "language:sq", "language:sr", "language:sv", "language:ta", "language:te", "language:th", "language:tl", "language:tr", "language:uk", "language:ur", "language:vi", "language:zh"], "is_gated": false}, "openbookqa": {"dataset_name": "openbookqa", "description": "OpenBookQA aims to promote research in advanced question-answering, probing a deeper understanding of 
both the topic\n(with salient facts summarized as an open book, also provided with the dataset) and the language it is expressed in. In\nparticular, it contains questions that require multi-step reasoning, use of additional common and commonsense knowledge,\nand rich text comprehension.\nOpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding\nof a subject.", "downloads": 106856, "configs": {"main": {"config_name": "main", "sample_row": "{\"id\": \"\\\"7-980\\\"\", \"question_stem\": \"\\\"The sun is responsible for\\\"\", \"choices.text\": \"[\\\"puppies learning new tricks\\\", \\\"children growing ...\", \"choices.label\": \"[\\\"A\\\", \\\"B\\\", \\\"C\\\", \\\"D\\\"]\", \"answerKey\": \"\\\"D\\\"\"}", "columns": ["id", "question_stem", "choices_text", "choices_label", "answerKey"], "columns_mapping": {"id": "id", "question_stem": "question_stem", "choices.text": "choices_text", "choices.label": "choices_label", "answerKey": "answerKey"}, "dataset_description": "OpenBookQA aims to promote research in advanced question-answering, probing a deeper understanding of both the topic\n(with salient facts summarized as an open book, also provided with the dataset) and the language it is expressed in. 
In\nparticular, it contains questions that require multi-step reasoning, use of additional common and commonsense knowledge,\nand rich text comprehension.\nOpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding\nof a subject.\n", "dataset_name": "openbookqa"}, "additional": {"config_name": "additional", "sample_row": "{\"id\": \"\\\"7-980\\\"\", \"question_stem\": \"\\\"The sun is responsible for\\\"\", \"choices.text\": \"[\\\"puppies learning new tricks\\\", \\\"children growing ...\", \"choices.label\": \"[\\\"A\\\", \\\"B\\\", \\\"C\\\", \\\"D\\\"]\", \"answerKey\": \"\\\"D\\\"\", \"fact1\": \"\\\"the sun is the source of energy for physical cycl...\", \"humanScore\": \"1.0\", \"clarity\": \"2.0\", \"turkIdAnonymized\": \"\\\"b356d338b7\\\"\"}", "columns": ["id", "question_stem", "choices_text", "choices_label", "answerKey", "fact1", "humanScore", "clarity", "turkIdAnonymized"], "columns_mapping": {"id": "id", "question_stem": "question_stem", "choices.text": "choices_text", "choices.label": "choices_label", "answerKey": "answerKey", "fact1": "fact1", "humanScore": "humanScore", "clarity": "clarity", "turkIdAnonymized": "turkIdAnonymized"}, "dataset_description": "OpenBookQA aims to promote research in advanced question-answering, probing a deeper understanding of both the topic\n(with salient facts summarized as an open book, also provided with the dataset) and the language it is expressed in. 
In\nparticular, it contains questions that require multi-step reasoning, use of additional common and commonsense knowledge,\nand rich text comprehension.\nOpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding\nof a subject.\n", "dataset_name": "openbookqa"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "Skylion007/openwebtext": {"dataset_name": "Skylion007/openwebtext", "description": "An open-source replication of the WebText dataset from OpenAI.", "downloads": 7369, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\"Port-au-Prince, Haiti (CNN) -- Earthquake victims...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "An open-source replication of the WebText dataset from OpenAI.\n", "dataset_name": "Skylion007/openwebtext"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "opinosis": {"dataset_name": "opinosis", "description": "The Opinosis Opinion Dataset consists of sentences extracted from reviews for 51 topics.\nTopics and opinions are obtained from Tripadvisor, Edmunds.com and Amazon.com.", "downloads": 765, "configs": {"default": {"config_name": "default", "sample_row": "{\"review_sents\": \"\\\", and is very, very accurate .\\\\r\\\\n but for the mo...\", \"summaries\": \"[\\\"This unit is generally quite accurate. 
\\\\r\\\\nSet-...\"}", "columns": ["review_sents", "summaries"], "columns_mapping": {"review_sents": "review_sents", "summaries": "summaries"}, "dataset_description": "\nThe Opinosis Opinion Dataset consists of sentences extracted from reviews for 51 topics.\nTopics and opinions are obtained from Tripadvisor, Edmunds.com and Amazon.com.\n", "dataset_name": "opinosis"}}, "tags": ["task_categories:summarization", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "abstractive-summarization"], "is_gated": false}, "opus_books": {"dataset_name": "opus_books", "description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M", "downloads": 22187, "configs": {"ca-de": {"config_name": "ca-de", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ca\": \"\\\"Source: Project GutenbergTranslation: Josep Carne...\", \"translation.de\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_ca", "translation_de"], "columns_mapping": {"id": "id", "translation.ca": "translation_ca", "translation.de": "translation_de"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "ca-en": {"config_name": "ca-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ca\": \"\\\"Source: Project GutenbergTranslation: Josep Carne...\", \"translation.en\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_ca", "translation_en"], "columns_mapping": {"id": "id", "translation.ca": "translation_ca", "translation.en": "translation_en"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-en": {"config_name": "de-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: http://www.zeno.org - Contumax GmbH & Co....\", \"translation.en\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_de", "translation_en"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.en": "translation_en"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "el-en": {"config_name": "el-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.en\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_el", "translation_en"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.en": "translation_en"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-eo": {"config_name": "de-eo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Translation: Antonie Zimmermann\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. 
KEARNE...\"}", "columns": ["id", "translation_de", "translation_eo"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.eo": "translation_eo"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-eo": {"config_name": "en-eo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"ALICE'S ADVENTURES IN WONDERLAND\\\"\", \"translation.eo\": \"\\\"La aventuroj de Alicio en Mirlando\\\"\"}", "columns": ["id", "translation_en", "translation_eo"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.eo": "translation_eo"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-es": {"config_name": "de-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: http://www.zeno.org - Contumax GmbH & Co....\", \"translation.es\": \"\\\"Source: http://librosgratis.liblit.com/\\\"\"}", "columns": ["id", "translation_de", "translation_es"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.es": "translation_es"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "el-es": {"config_name": "el-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.es\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_el", "translation_es"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.es": "translation_es"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-es": {"config_name": "en-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: Project GutenbergAudiobook available here...\", \"translation.es\": \"\\\"Source: Wikisource & librodot.com\\\"\"}", "columns": ["id", "translation_en", "translation_es"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "eo-es": {"config_name": "eo-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. 
KEARNE...\", \"translation.es\": \"\\\"Source: http://mimosa.pntic.mec.es/\\\"\"}", "columns": ["id", "translation_eo", "translation_es"], "columns_mapping": {"id": "id", "translation.eo": "translation_eo", "translation.es": "translation_es"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-fi": {"config_name": "en-fi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: manybooks.netAudiobook available here\\\"\", \"translation.fi\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_en", "translation_fi"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fi": "translation_fi"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-fi": {"config_name": "es-fi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: Wikisource & bibliotecasvirtuales.com\\\"\", \"translation.fi\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_es", "translation_fi"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.fi": "translation_fi"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: http://www.zeno.org - Contumax GmbH & Co....\", \"translation.fr\": \"\\\"Source: Project GutenbergTranslation: No\\\\u00ebmie...\"}", "columns": ["id", "translation_de", "translation_fr"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "el-fr": {"config_name": "el-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.fr\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_el", "translation_fr"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"The Wanderer\\\"\", \"translation.fr\": \"\\\"Le grand Meaulnes\\\"\"}", "columns": ["id", "translation_en", "translation_fr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "eo-fr": {"config_name": "eo-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. 
KEARNE...\", \"translation.fr\": \"\\\"Source: WikisourceTranslation: Henri Bu\\\\u00e9\\\"\"}", "columns": ["id", "translation_eo", "translation_fr"], "columns_mapping": {"id": "id", "translation.eo": "translation_eo", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-fr": {"config_name": "es-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: http://librosgratis.liblit.com/\\\"\", \"translation.fr\": \"\\\"Source: Project GutenbergTranslation: No\\\\u00ebmie...\"}", "columns": ["id", "translation_es", "translation_fr"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fi-fr": {"config_name": "fi-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.fr\": \"\\\"Source: http://www.ebooksgratuits.com/\\\"\"}", "columns": ["id", "translation_fi", "translation_fr"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "ca-hu": {"config_name": "ca-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ca\": \"\\\"Source: Project GutenbergTranslation: Josep Carne...\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Koroknay Istv\\\\u00...\"}", "columns": ["id", "translation_ca", "translation_hu"], "columns_mapping": {"id": "id", "translation.ca": "translation_ca", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-hu": {"config_name": "de-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: http://www.zeno.org - Contumax GmbH & Co....\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Ruzitska M\\\\u00e1r...\"}", "columns": ["id", "translation_de", "translation_hu"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "el-hu": {"config_name": "el-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Kem\\\\u00e9ny G\\\\u00...\"}", "columns": ["id", "translation_el", "translation_hu"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-hu": {"config_name": "en-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: Project GutenbergAudiobook available here...\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Szenczi Mikl\\\\u00f...\"}", "columns": ["id", "translation_en", "translation_hu"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "eo-hu": {"config_name": "eo-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. 
KEARNE...\", \"translation.hu\": \"\\\"Source: mek.oszk.huAudiobook available here\\\"\"}", "columns": ["id", "translation_eo", "translation_hu"], "columns_mapping": {"id": "id", "translation.eo": "translation_eo", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-hu": {"config_name": "fr-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: ebooksgratuits.comTranslation: V. 
Leconte...\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Szenczi Mikl\\\\u00f...\"}", "columns": ["id", "translation_fr", "translation_hu"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-it": {"config_name": "de-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: http://www.zeno.org - Contumax GmbH & Co....\", \"translation.it\": \"\\\"Source: www.liberliber.it/Audiobook available her...\"}", "columns": ["id", "translation_de", "translation_it"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-it": {"config_name": "en-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.it\": \"\\\"Source: www.liberliber.it/Audiobook available her...\"}", "columns": ["id", "translation_en", "translation_it"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "eo-it": {"config_name": "eo-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. 
KEARNE...\", \"translation.it\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_eo", "translation_it"], "columns_mapping": {"id": "id", "translation.eo": "translation_eo", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-it": {"config_name": "es-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: http://librosgratis.liblit.com/\\\"\", \"translation.it\": \"\\\"Source: www.liberliber.it/Audiobook available her...\"}", "columns": ["id", "translation_es", "translation_it"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-it": {"config_name": "fr-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: Project GutenbergTranslation: No\\\\u00ebmie...\", \"translation.it\": \"\\\"Source: www.liberliber.it/Audiobook available her...\"}", "columns": ["id", "translation_fr", "translation_it"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-it": {"config_name": "hu-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Ruzitska M\\\\u00e1r...\", \"translation.it\": \"\\\"Source: www.liberliber.it/Audiobook available her...\"}", "columns": ["id", "translation_hu", "translation_it"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "ca-nl": {"config_name": "ca-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ca\": \"\\\"Source: Project GutenbergTranslation: Josep Carne...\", \"translation.nl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_ca", "translation_nl"], "columns_mapping": {"id": "id", "translation.ca": "translation_ca", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-nl": {"config_name": "de-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.nl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_de", "translation_nl"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-nl": {"config_name": "en-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: Project GutenbergAudiobook available here...\", \"translation.nl\": \"\\\"Source: Project GutenbergTranslation: Gonne Van U...\"}", "columns": ["id", "translation_en", "translation_nl"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-nl": {"config_name": "es-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: Wikisource & librodot.com\\\"\", \"translation.nl\": \"\\\"Source: Project GutenbergTranslation: Gonne Van U...\"}", "columns": ["id", "translation_es", "translation_nl"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-nl": {"config_name": "fr-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.nl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_fr", "translation_nl"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-nl": {"config_name": "hu-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"Source: http://mek.oszk.hu/Translation: Tam\\\\u00e1...\", \"translation.nl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_hu", "translation_nl"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "it-nl": {"config_name": "it-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.it\": \"\\\"Source: WikisourceTranslation: Gaetano Barbieri\\\"...\", \"translation.nl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_it", "translation_nl"], "columns_mapping": {"id": "id", "translation.it": "translation_it", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-no": {"config_name": "en-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: manybooks.netAudiobook available here\\\"\", \"translation.no\": \"\\\"Source: ebook made by Lars I. 
N\\\\u00e6sheimTransla...\"}", "columns": ["id", "translation_en", "translation_no"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.no": "translation_no"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-no": {"config_name": "es-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: Wikisource & bibliotecasvirtuales.com\\\"\", \"translation.no\": \"\\\"Source: ebook made by Lars I. 
N\\\\u00e6sheimTransla...\"}", "columns": ["id", "translation_es", "translation_no"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.no": "translation_no"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fi-no": {"config_name": "fi-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.no\": \"\\\"Source: ebook made by Lars I. 
N\\\\u00e6sheimTransla...\"}", "columns": ["id", "translation_fi", "translation_no"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.no": "translation_no"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-no": {"config_name": "fr-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: http://www.ebooksgratuits.com/\\\"\", \"translation.no\": \"\\\"Source: ebook made by Lars I. 
N\\\\u00e6sheimTransla...\"}", "columns": ["id", "translation_fr", "translation_no"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.no": "translation_no"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-no": {"config_name": "hu-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Antal \\\\u00c1rkos\\\"...\", \"translation.no\": \"\\\"Source: ebook made by Lars I. 
N\\\\u00e6sheimTransla...\"}", "columns": ["id", "translation_hu", "translation_no"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.no": "translation_no"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-pl": {"config_name": "en-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: manybooks.netAudiobook available here\\\"\", \"translation.pl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_en", "translation_pl"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.pl": "translation_pl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fi-pl": {"config_name": "fi-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.pl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_fi", "translation_pl"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.pl": "translation_pl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-pl": {"config_name": "fr-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: http://www.ebooksgratuits.com/\\\"\", \"translation.pl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_fr", "translation_pl"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.pl": "translation_pl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-pl": {"config_name": "hu-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Antal \\\\u00c1rkos\\\"...\", \"translation.pl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_hu", "translation_pl"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.pl": "translation_pl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-pt": {"config_name": "de-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Translation: Antonie Zimmermann\\\"\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_de", "translation_pt"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-pt": {"config_name": "en-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"ALICE'S ADVENTURES IN WONDERLAND\\\"\", \"translation.pt\": \"\\\"Alice no Pa\\\\u00eds das Maravilhas\\\"\"}", "columns": ["id", "translation_en", "translation_pt"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "eo-pt": {"config_name": "eo-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. 
KEARNE...\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_eo", "translation_pt"], "columns_mapping": {"id": "id", "translation.eo": "translation_eo", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-pt": {"config_name": "es-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: http://mimosa.pntic.mec.es/\\\"\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_es", "translation_pt"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-pt": {"config_name": "fr-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: WikisourceTranslation: Henri Bu\\\\u00e9\\\"\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_fr", "translation_pt"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-pt": {"config_name": "hu-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"Source: mek.oszk.huAudiobook available here\\\"\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_hu", "translation_pt"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "it-pt": {"config_name": "it-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.it\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_it", "translation_pt"], "columns_mapping": {"id": "id", "translation.it": "translation_it", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-ru": {"config_name": "de-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Anna Karenina\\\"\", \"translation.ru\": \"\\\"\\\\u0410\\\\u043d\\\\u043d\\\\u0430 \\\\u041a\\\\u0430\\\\u0440\\\\u0435...\"}", "columns": ["id", "translation_de", "translation_ru"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-ru": {"config_name": "en-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Anna Karenina\\\"\", \"translation.ru\": \"\\\"\\\\u0410\\\\u043d\\\\u043d\\\\u0430 \\\\u041a\\\\u0430\\\\u0440\\\\u0435...\"}", "columns": ["id", "translation_en", "translation_ru"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-ru": {"config_name": "es-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Ana Karenina\\\"\", \"translation.ru\": \"\\\"\\\\u0410\\\\u043d\\\\u043d\\\\u0430 \\\\u041a\\\\u0430\\\\u0440\\\\u0435...\"}", "columns": ["id", "translation_es", "translation_ru"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-ru": {"config_name": "fr-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"LE MA\\\\u00ceTRE ET MARGUERITE\\\"\", \"translation.ru\": \"\\\"\\\\u041c\\\\u0430\\\\u0441\\\\u0442\\\\u0435\\\\u0440 \\\\u0438 \\\\u041...\"}", "columns": ["id", "translation_fr", "translation_ru"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-ru": {"config_name": "hu-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"A Mester \\\\u00e9s Margarita\\\"\", \"translation.ru\": \"\\\"\\\\u041c\\\\u0430\\\\u0441\\\\u0442\\\\u0435\\\\u0440 \\\\u0438 \\\\u041...\"}", "columns": ["id", "translation_hu", "translation_ru"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "it-ru": {"config_name": "it-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.it\": \"\\\"Anna Karenina\\\"\", \"translation.ru\": \"\\\"\\\\u0410\\\\u043d\\\\u043d\\\\u0430 \\\\u041a\\\\u0430\\\\u0440\\\\u0435...\"}", "columns": ["id", "translation_it", "translation_ru"], "columns_mapping": {"id": "id", "translation.it": "translation_it", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-sv": {"config_name": "en-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: Project GutenbergAudiobook available here...\", \"translation.sv\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_en", "translation_sv"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.sv": "translation_sv"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-sv": {"config_name": "fr-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: ebooksgratuits.comAudiobook available her...\", \"translation.sv\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_fr", "translation_sv"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.sv": "translation_sv"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "it-sv": {"config_name": "it-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.it\": \"\\\"Translation: Silvio Spaventa Filippi\\\"\", \"translation.sv\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_it", "translation_sv"], "columns_mapping": {"id": "id", "translation.it": "translation_it", "translation.sv": "translation_sv"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. 
Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:ca", "language:de", "language:el", "language:en", "language:eo", "language:es", "language:fi", "language:fr", "language:hu", "language:it", "language:nl", "language:no", "language:pl", "language:pt", "language:ru", "language:sv"], "is_gated": false}, "opus_dgt": {"dataset_name": "opus_dgt", "description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M", "downloads": 1536, "configs": {"bg-ga": {"config_name": "bg-ga", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"\\\\u041f\\\\u0440\\\\u043e\\\\u0442\\\\u043e\\\\u043a\\\\u043e\\\\u043b ...\", \"translation.ga\": \"\\\"Miontuairisc cheartaitheach maidir le Coinbhinsi\\\\...\"}", "columns": ["id", "translation_bg", "translation_ga"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.ga": "translation_ga"}, "dataset_description": "A collection of translation memories provided by the JRC. 
Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "bg-hr": {"config_name": "bg-hr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"\\\\u0420\\\\u0435\\\\u0448\\\\u0435\\\\u043d\\\\u0438\\\\u0435 \\\\u043d...\", \"translation.hr\": \"\\\"Odluka Zajedni\\\\u010dkog odbora EGP-a\\\"\"}", "columns": ["id", "translation_bg", "translation_hr"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.hr": "translation_hr"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "bg-sh": {"config_name": "bg-sh", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"\\\\u041f\\\\u0440\\\\u043e\\\\u0442\\\\u043e\\\\u043a\\\\u043e\\\\u043b ...\", \"translation.sh\": \"\\\"Ispravak Drugog dodatnog protokola uz Sporazum o ...\"}", "columns": ["id", "translation_bg", "translation_sh"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.sh": "translation_sh"}, "dataset_description": "A collection of translation memories provided by the JRC. 
Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "fi-ga": {"config_name": "fi-ga", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"Oikaisup\\\\u00f6yt\\\\u00e4kirja yleissopimukseen tuom...\", \"translation.ga\": \"\\\"Miontuairisc cheartaitheach maidir le Coinbhinsi\\\\...\"}", "columns": ["id", "translation_fi", "translation_ga"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.ga": "translation_ga"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "es-ga": {"config_name": "es-ga", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Acta de correcci\\\\u00f3n de errores del Convenio r...\", \"translation.ga\": \"\\\"Miontuairisc cheartaitheach maidir le Coinbhinsi\\\\...\"}", "columns": ["id", "translation_es", "translation_ga"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.ga": "translation_ga"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "ga-sh": {"config_name": "ga-sh", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ga\": \"\\\"Leasuithe ar na hIarscr\\\\u00edbhinn\\\\u00ed a ghabha...\", \"translation.sh\": \"\\\"Izmjene prilog\\\\u00e2 Konvenciji iz Lugana od 30. 
...\"}", "columns": ["id", "translation_ga", "translation_sh"], "columns_mapping": {"id": "id", "translation.ga": "translation_ga", "translation.sh": "translation_sh"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "hr-sk": {"config_name": "hr-sk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hr\": \"\\\"Odluka Zajedni\\\\u010dkog odbora EGP-a\\\"\", \"translation.sk\": \"\\\"Rozhodnutie Spolo\\\\u010dn\\\\u00e9ho v\\\\u00fdboru EHP\\\"...\"}", "columns": ["id", "translation_hr", "translation_sk"], "columns_mapping": {"id": "id", "translation.hr": "translation_hr", "translation.sk": "translation_sk"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "mt-sh": {"config_name": "mt-sh", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.mt\": \"\\\"Verbal ta' rettifika tat-tieni protokoll addizzjo...\", \"translation.sh\": \"\\\"Ispravak Drugog dodatnog protokola uz Sporazum o ...\"}", "columns": ["id", "translation_mt", "translation_sh"], "columns_mapping": {"id": "id", "translation.mt": "translation_mt", "translation.sh": "translation_sh"}, "dataset_description": "A collection of translation memories provided by the JRC. 
Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "hr-sv": {"config_name": "hr-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hr\": \"\\\"Odluka Zajedni\\\\u010dkog odbora EGP-a\\\"\", \"translation.sv\": \"\\\"Gemensamma EES-kommitt\\\\u00e9ns beslut\\\"\"}", "columns": ["id", "translation_hr", "translation_sv"], "columns_mapping": {"id": "id", "translation.hr": "translation_hr", "translation.sv": "translation_sv"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "ga-nl": {"config_name": "ga-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ga\": \"\\\"Miontuairisc cheartaitheach maidir le Coinbhinsi\\\\...\", \"translation.nl\": \"\\\"Proces-verbaal van verbetering van het Verdrag be...\"}", "columns": ["id", "translation_ga", "translation_nl"], "columns_mapping": {"id": "id", "translation.ga": "translation_ga", "translation.nl": "translation_nl"}, "dataset_description": "A collection of translation memories provided by the JRC. 
Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:ga", "language:hr", "language:hu", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:pl", "language:pt", "language:ro", "language:sh", "language:sk", "language:sl", "language:sv"], "is_gated": false}, "opus_dogc": {"dataset_name": "opus_dogc", "description": "This is a collection of documents from the Official Journal of the Government of Catalonia, in Catalan and Spanish languages, provided by Antoni Oliver Gonzalez from the Universitat Oberta de Catalunya.", "downloads": 283, "configs": {"tmx": {"config_name": "tmx", "sample_row": "{\"translation.ca\": \"\\\"En virtut de l ' annex 1 del Reial decret 2346 / ...\", \"translation.es\": \"\\\"En virtud del anexo 1 del Real decreto 2346/ 1996...\"}", "columns": ["translation_ca", "translation_es"], "columns_mapping": {"translation.ca": "translation_ca", "translation.es": "translation_es"}, "dataset_description": "This is a collection of documents from the Official Journal of the Government of Catalonia, in Catalan and Spanish languages, provided by Antoni Oliver Gonzalez from the Universitat Oberta de Catalunya.\n", "dataset_name": "opus_dogc"}}, "tags": ["task_categories:translation", "annotations_creators:no-annotation", "multilinguality:translation", "source_datasets:original", "language:ca", "language:es"], "is_gated": false}, "opus_gnome": {"dataset_name": "opus_gnome", "description": "A parallel corpus of GNOME localization files. 
Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M", "downloads": 1556, "configs": {"ar-bal": {"config_name": "ar-bal", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ar\": \"\\\"\\\\u0625\\\\u0639\\\\u062f\\\\u0627\\\\u062f \\\\u0633\\\\u064a\\\\u0627...\", \"translation.bal\": \"\\\"\\\\u062a\\\\u0646\\\\u0638\\\\u06cc\\\\u0645 \\\\u06a9\\\\u062a\\\\u0646...\"}", "columns": ["id", "translation_ar", "translation_bal"], "columns_mapping": {"id": "id", "translation.ar": "translation_ar", "translation.bal": "translation_bal"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "bg-csb": {"config_name": "bg-csb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"GNOME\\\"\", \"translation.csb\": \"\\\"GNOME\\\"\"}", "columns": ["id", "translation_bg", "translation_csb"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.csb": "translation_csb"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "ca-en_GB": {"config_name": "ca-en_GB", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ca\": \"\\\"Accerciser\\\"\", \"translation.en_GB\": \"\\\"Accerciser\\\"\"}", "columns": ["id", "translation_ca", "translation_en_GB"], "columns_mapping": {"id": "id", "translation.ca": "translation_ca", "translation.en_GB": "translation_en_GB"}, "dataset_description": "A parallel corpus of GNOME localization files. 
Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "cs-eo": {"config_name": "cs-eo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.cs\": \"\\\"Seznam ve v\\\\u00fdchoz\\\\u00edm nastaven\\\\u00ed zak\\\\u...\", \"translation.eo\": \"\\\"Listo de kromprogramoj kiuj defa\\\\u016dlte estas e...\"}", "columns": ["id", "translation_cs", "translation_eo"], "columns_mapping": {"id": "id", "translation.cs": "translation_cs", "translation.eo": "translation_eo"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "de-ha": {"config_name": "de-ha", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Info zu GNOME\\\"\", \"translation.ha\": \"\\\"Game da GNOME\\\"\"}", "columns": ["id", "translation_de", "translation_ha"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.ha": "translation_ha"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "cs-tk": {"config_name": "cs-tk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.cs\": \"\\\"GNOME\\\"\", \"translation.tk\": \"\\\"GNOME\\\"\"}", "columns": ["id", "translation_cs", "translation_tk"], "columns_mapping": {"id": "id", "translation.cs": "translation_cs", "translation.tk": "translation_tk"}, "dataset_description": "A parallel corpus of GNOME localization files. 
Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "da-vi": {"config_name": "da-vi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.da\": \"\\\"Giv dit program en tilg\\\\u00e6ngelighedsoverhaling...\", \"translation.vi\": \"\\\"Th\\\\u1eed ra kh\\\\u1ea3 n\\\\u0103ng truy c\\\\u1eadp c\\\\u1...\"}", "columns": ["id", "translation_da", "translation_vi"], "columns_mapping": {"id": "id", "translation.da": "translation_da", "translation.vi": "translation_vi"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "en_GB-my": {"config_name": "en_GB-my", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en_GB\": \"\\\"Results %i\\\\u2013%i (out of %i)\\\"\", \"translation.my\": \"\\\"\\\\u101b\\\\u101c\\\\u1012\\\\u103a %i\\\\u2013%i ( %i \\\\u1019\\\\u...\"}", "columns": ["id", "translation_en_GB", "translation_my"], "columns_mapping": {"id": "id", "translation.en_GB": "translation_en_GB", "translation.my": "translation_my"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "el-sk": {"config_name": "el-sk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"Accerciser\\\"\", \"translation.sk\": \"\\\"Accerciser\\\"\"}", "columns": ["id", "translation_el", "translation_sk"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.sk": "translation_sk"}, "dataset_description": "A parallel corpus of GNOME localization files. 
Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "de-tt": {"config_name": "de-tt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Ausgew\\\\u00e4hlter Link\\\"\", \"translation.tt\": \"\\\"Saylan\\\\u011fan B\\\\u00e4y\\\"\"}", "columns": ["id", "translation_de", "translation_tt"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.tt": "translation_tt"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:af", "language:am", "language:an", "language:ang", "language:ar", "language:as", "language:ast", "language:az", "language:bal", "language:be", "language:bem", "language:bg", "language:bn", "language:bo", "language:br", "language:brx", "language:bs", "language:ca", "language:crh", "language:cs", "language:csb", "language:cy", "language:da", "language:de", "language:dv", "language:dz", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:fi", "language:fo", "language:fr", "language:fur", "language:fy", "language:ga", "language:gd", "language:gl", "language:gn", "language:gu", "language:gv", "language:ha", "language:he", "language:hi", "language:hr", "language:hu", "language:hy", "language:ia", "language:id", "language:ig", "language:io", "language:is", "language:it", "language:ja", "language:jbo", "language:ka", "language:kg", "language:kk", "language:km", "language:kn", "language:ko", "language:kr", "language:ks", "language:ku", 
"language:ky", "language:la", "language:lg", "language:li", "language:lo", "language:lt", "language:lv", "language:mai", "language:mg", "language:mi", "language:mk", "language:ml", "language:mn", "language:mr", "language:ms", "language:mt", "language:mus", "language:my", "language:nb", "language:nds", "language:ne", "language:nhn", "language:nl", "language:nn", "language:no", "language:nqo", "language:nr", "language:nso", "language:oc", "language:or", "language:os", "language:pa", "language:pl", "language:ps", "language:pt", "language:quz", "language:ro", "language:ru", "language:rw", "language:si", "language:sk", "language:sl", "language:so", "language:sq", "language:sr", "language:st", "language:sv", "language:sw", "language:szl", "language:ta", "language:te", "language:tg", "language:th", "language:tk", "language:tl", "language:tr", "language:ts", "language:tt", "language:tyj", "language:ug", "language:uk", "language:ur", "language:uz", "language:vi", "language:wa", "language:xh", "language:yi", "language:yo", "language:zh", "language:zu"], "is_gated": false}, "opus_infopankki": {"dataset_name": "opus_infopankki", "description": "A parallel corpus of 12 languages, 66 bitexts.", "downloads": 9758, "configs": {"ar-en": {"config_name": "ar-en", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0645\\\\u0639\\\\u0644\\\\u0648\\\\u0645\\\\u0627\\\\u062a \\\\u0623...\", \"translation.en\": \"\\\"Basic information\\\"\"}", "columns": ["translation_ar", "translation_en"], "columns_mapping": {"translation.ar": "translation_ar", "translation.en": "translation_en"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-es": {"config_name": "ar-es", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\"}", "columns": ["translation_ar", "translation_es"], 
"columns_mapping": {"translation.ar": "translation_ar", "translation.es": "translation_es"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-et": {"config_name": "ar-et", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\"}", "columns": ["translation_ar", "translation_et"], "columns_mapping": {"translation.ar": "translation_ar", "translation.et": "translation_et"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-fa": {"config_name": "ar-fa", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\"}", "columns": ["translation_ar", "translation_fa"], "columns_mapping": {"translation.ar": "translation_ar", "translation.fa": "translation_fa"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-fi": {"config_name": "ar-fi", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0645\\\\u0639\\\\u0644\\\\u0648\\\\u0645\\\\u0627\\\\u062a \\\\u0623...\", \"translation.fi\": \"\\\"Perustietoa\\\"\"}", "columns": ["translation_ar", "translation_fi"], "columns_mapping": {"translation.ar": "translation_ar", "translation.fi": "translation_fi"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-fr": {"config_name": "ar-fr", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\"}", "columns": ["translation_ar", "translation_fr"], "columns_mapping": {"translation.ar": 
"translation_ar", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-ru": {"config_name": "ar-ru", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0645\\\\u0639\\\\u0644\\\\u0648\\\\u0645\\\\u0627\\\\u062a \\\\u0623...\", \"translation.ru\": \"\\\"\\\\u041e\\\\u0441\\\\u043d\\\\u043e\\\\u0432\\\\u043d\\\\u0430\\\\u044f ...\"}", "columns": ["translation_ar", "translation_ru"], "columns_mapping": {"translation.ar": "translation_ar", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-so": {"config_name": "ar-so", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_ar", "translation_so"], "columns_mapping": {"translation.ar": "translation_ar", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-sv": {"config_name": "ar-sv", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0645\\\\u0639\\\\u0644\\\\u0648\\\\u0645\\\\u0627\\\\u062a \\\\u0623...\", \"translation.sv\": \"\\\"Historia Trafik\\\"\"}", "columns": ["translation_ar", "translation_sv"], "columns_mapping": {"translation.ar": "translation_ar", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-tr": {"config_name": "ar-tr", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_ar", "translation_tr"], "columns_mapping": {"translation.ar": "translation_ar", 
"translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-zh": {"config_name": "ar-zh", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_ar", "translation_zh"], "columns_mapping": {"translation.ar": "translation_ar", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-es": {"config_name": "en-es", "sample_row": "{\"translation.en\": \"\\\"Content is not available in the selected language...\", \"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\"}", "columns": ["translation_en", "translation_es"], "columns_mapping": {"translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-et": {"config_name": "en-et", "sample_row": "{\"translation.en\": \"\\\"Content is not available in the selected language...\", \"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\"}", "columns": ["translation_en", "translation_et"], "columns_mapping": {"translation.en": "translation_en", "translation.et": "translation_et"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-fa": {"config_name": "en-fa", "sample_row": "{\"translation.en\": \"\\\"Content is not available in the selected language...\", \"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\"}", "columns": ["translation_en", "translation_fa"], "columns_mapping": {"translation.en": "translation_en", "translation.fa": "translation_fa"}, 
"dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-fi": {"config_name": "en-fi", "sample_row": "{\"translation.en\": \"\\\"Marriage\\\"\", \"translation.fi\": \"\\\"Avioliitto\\\"\"}", "columns": ["translation_en", "translation_fi"], "columns_mapping": {"translation.en": "translation_en", "translation.fi": "translation_fi"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"translation.en\": \"\\\"Marriage\\\"\", \"translation.fr\": \"\\\"Mariage\\\"\"}", "columns": ["translation_en", "translation_fr"], "columns_mapping": {"translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-ru": {"config_name": "en-ru", "sample_row": "{\"translation.en\": \"\\\"Marriage\\\"\", \"translation.ru\": \"\\\"\\\\u0411\\\\u0440\\\\u0430\\\\u043a\\\"\"}", "columns": ["translation_en", "translation_ru"], "columns_mapping": {"translation.en": "translation_en", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-so": {"config_name": "en-so", "sample_row": "{\"translation.en\": \"\\\"Content is not available in the selected language...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_en", "translation_so"], "columns_mapping": {"translation.en": "translation_en", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-sv": {"config_name": "en-sv", "sample_row": "{\"translation.en\": \"\\\"Basic information\\\"\", \"translation.sv\": \"\\\"Grundl\\\\u00e4ggande information\\\"\"}", "columns": ["translation_en", "translation_sv"], 
"columns_mapping": {"translation.en": "translation_en", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-tr": {"config_name": "en-tr", "sample_row": "{\"translation.en\": \"\\\"Content is not available in the selected language...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_en", "translation_tr"], "columns_mapping": {"translation.en": "translation_en", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-zh": {"config_name": "en-zh", "sample_row": "{\"translation.en\": \"\\\"Please select another language.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_en", "translation_zh"], "columns_mapping": {"translation.en": "translation_en", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-et": {"config_name": "es-et", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\"}", "columns": ["translation_es", "translation_et"], "columns_mapping": {"translation.es": "translation_es", "translation.et": "translation_et"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-fa": {"config_name": "es-fa", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\"}", "columns": ["translation_es", "translation_fa"], "columns_mapping": {"translation.es": "translation_es", 
"translation.fa": "translation_fa"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-fi": {"config_name": "es-fi", "sample_row": "{\"translation.es\": \"\\\"Todos los textos publicados en las p\\\\u00e1ginas w...\", \"translation.fi\": \"\\\"Kaikki InfoFinlandin verkkosivuilla julkaistut te...\"}", "columns": ["translation_es", "translation_fi"], "columns_mapping": {"translation.es": "translation_es", "translation.fi": "translation_fi"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-fr": {"config_name": "es-fr", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\"}", "columns": ["translation_es", "translation_fr"], "columns_mapping": {"translation.es": "translation_es", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-ru": {"config_name": "es-ru", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.ru\": \"\\\"\\\\u0418\\\\u043d\\\\u0444\\\\u043e\\\\u0440\\\\u043c\\\\u0430\\\\u0446\\\\...\"}", "columns": ["translation_es", "translation_ru"], "columns_mapping": {"translation.es": "translation_es", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-so": {"config_name": "es-so", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_es", "translation_so"], "columns_mapping": {"translation.es": "translation_es", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 
12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-sv": {"config_name": "es-sv", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\"}", "columns": ["translation_es", "translation_sv"], "columns_mapping": {"translation.es": "translation_es", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-tr": {"config_name": "es-tr", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_es", "translation_tr"], "columns_mapping": {"translation.es": "translation_es", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-zh": {"config_name": "es-zh", "sample_row": "{\"translation.es\": \"\\\"Por favor seleccione otro idioma.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_es", "translation_zh"], "columns_mapping": {"translation.es": "translation_es", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-fa": {"config_name": "et-fa", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav. 
Palu...\", \"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\"}", "columns": ["translation_et", "translation_fa"], "columns_mapping": {"translation.et": "translation_et", "translation.fa": "translation_fa"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-fi": {"config_name": "et-fi", "sample_row": "{\"translation.et\": \"\\\"P\\\\u00f5hiteave\\\"\", \"translation.fi\": \"\\\"Perustietoa\\\"\"}", "columns": ["translation_et", "translation_fi"], "columns_mapping": {"translation.et": "translation_et", "translation.fi": "translation_fi"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-fr": {"config_name": "et-fr", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\", \"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\"}", "columns": ["translation_et", "translation_fr"], "columns_mapping": {"translation.et": "translation_et", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-ru": {"config_name": "et-ru", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\", \"translation.ru\": \"\\\"\\\\u0418\\\\u043d\\\\u0444\\\\u043e\\\\u0440\\\\u043c\\\\u0430\\\\u0446\\\\...\"}", "columns": ["translation_et", "translation_ru"], "columns_mapping": {"translation.et": "translation_et", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-so": {"config_name": "et-so", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_et", 
"translation_so"], "columns_mapping": {"translation.et": "translation_et", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-sv": {"config_name": "et-sv", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\", \"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\"}", "columns": ["translation_et", "translation_sv"], "columns_mapping": {"translation.et": "translation_et", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-tr": {"config_name": "et-tr", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_et", "translation_tr"], "columns_mapping": {"translation.et": "translation_et", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-zh": {"config_name": "et-zh", "sample_row": "{\"translation.et\": \"\\\"Palun vali m\\\\u00f5ni teine keel.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_et", "translation_zh"], "columns_mapping": {"translation.et": "translation_et", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-fi": {"config_name": "fa-fi", "sample_row": "{\"translation.fa\": \"\\\"\\\\u062a\\\\u0645\\\\u0627\\\\u0645 \\\\u0645\\\\u0637\\\\u0627\\\\u0644...\", \"translation.fi\": \"\\\"Kaikki InfoFinlandin verkkosivuilla julkaistut te...\"}", "columns": ["translation_fa", "translation_fi"], "columns_mapping": {"translation.fa": 
"translation_fa", "translation.fi": "translation_fi"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-fr": {"config_name": "fa-fr", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\"}", "columns": ["translation_fa", "translation_fr"], "columns_mapping": {"translation.fa": "translation_fa", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-ru": {"config_name": "fa-ru", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.ru\": \"\\\"\\\\u0418\\\\u043d\\\\u0444\\\\u043e\\\\u0440\\\\u043c\\\\u0430\\\\u0446\\\\...\"}", "columns": ["translation_fa", "translation_ru"], "columns_mapping": {"translation.fa": "translation_fa", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-so": {"config_name": "fa-so", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_fa", "translation_so"], "columns_mapping": {"translation.fa": "translation_fa", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-sv": {"config_name": "fa-sv", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\"}", "columns": ["translation_fa", "translation_sv"], "columns_mapping": 
{"translation.fa": "translation_fa", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-tr": {"config_name": "fa-tr", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_fa", "translation_tr"], "columns_mapping": {"translation.fa": "translation_fa", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-zh": {"config_name": "fa-zh", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_fa", "translation_zh"], "columns_mapping": {"translation.fa": "translation_fa", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-fr": {"config_name": "fi-fr", "sample_row": "{\"translation.fi\": \"\\\"Avioliitto\\\"\", \"translation.fr\": \"\\\"Mariage\\\"\"}", "columns": ["translation_fi", "translation_fr"], "columns_mapping": {"translation.fi": "translation_fi", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-ru": {"config_name": "fi-ru", "sample_row": "{\"translation.fi\": \"\\\"Avioliitto\\\"\", \"translation.ru\": \"\\\"\\\\u0411\\\\u0440\\\\u0430\\\\u043a\\\"\"}", "columns": ["translation_fi", "translation_ru"], "columns_mapping": {"translation.fi": "translation_fi", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": 
"opus_infopankki"}, "fi-so": {"config_name": "fi-so", "sample_row": "{\"translation.fi\": \"\\\"Kaikki InfoFinlandin verkkosivuilla julkaistut te...\", \"translation.so\": \"\\\"Dhammaan qoraallada ka kooban luqadaha kala duwan...\"}", "columns": ["translation_fi", "translation_so"], "columns_mapping": {"translation.fi": "translation_fi", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-sv": {"config_name": "fi-sv", "sample_row": "{\"translation.fi\": \"\\\"Perustietoa\\\"\", \"translation.sv\": \"\\\"Grundl\\\\u00e4ggande information\\\"\"}", "columns": ["translation_fi", "translation_sv"], "columns_mapping": {"translation.fi": "translation_fi", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-tr": {"config_name": "fi-tr", "sample_row": "{\"translation.fi\": \"\\\"Kaikki InfoFinlandin verkkosivuilla julkaistut te...\", \"translation.tr\": \"\\\"InfoFinland'\\\\u0131n internet sayfalar\\\\u0131nda hi...\"}", "columns": ["translation_fi", "translation_tr"], "columns_mapping": {"translation.fi": "translation_fi", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-zh": {"config_name": "fi-zh", "sample_row": "{\"translation.fi\": \"\\\"Kaikki InfoFinlandin verkkosivuilla julkaistut te...\", \"translation.zh\": \"\\\"InfoFinland\\\\u7f51\\\\u7ad9\\\\u4e0a\\\\u6240\\\\u6709\\\\u8bed\\\\u...\"}", "columns": ["translation_fi", "translation_zh"], "columns_mapping": {"translation.fi": "translation_fi", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fr-ru": {"config_name": "fr-ru", "sample_row": "{\"translation.fr\": \"\\\"Mariage\\\"\", \"translation.ru\": 
\"\\\"\\\\u0411\\\\u0440\\\\u0430\\\\u043a\\\"\"}", "columns": ["translation_fr", "translation_ru"], "columns_mapping": {"translation.fr": "translation_fr", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fr-so": {"config_name": "fr-so", "sample_row": "{\"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_fr", "translation_so"], "columns_mapping": {"translation.fr": "translation_fr", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fr-sv": {"config_name": "fr-sv", "sample_row": "{\"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\", \"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\"}", "columns": ["translation_fr", "translation_sv"], "columns_mapping": {"translation.fr": "translation_fr", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fr-tr": {"config_name": "fr-tr", "sample_row": "{\"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_fr", "translation_tr"], "columns_mapping": {"translation.fr": "translation_fr", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fr-zh": {"config_name": "fr-zh", "sample_row": "{\"translation.fr\": \"\\\"Veuillez s\\\\u00e9lectionner une autre langue.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": 
["translation_fr", "translation_zh"], "columns_mapping": {"translation.fr": "translation_fr", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ru-so": {"config_name": "ru-so", "sample_row": "{\"translation.ru\": \"\\\"\\\\u0418\\\\u043d\\\\u0444\\\\u043e\\\\u0440\\\\u043c\\\\u0430\\\\u0446\\\\...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_ru", "translation_so"], "columns_mapping": {"translation.ru": "translation_ru", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ru-sv": {"config_name": "ru-sv", "sample_row": "{\"translation.ru\": \"\\\"\\\\u041e\\\\u0441\\\\u043d\\\\u043e\\\\u0432\\\\u043d\\\\u0430\\\\u044f ...\", \"translation.sv\": \"\\\"Grundl\\\\u00e4ggande information\\\"\"}", "columns": ["translation_ru", "translation_sv"], "columns_mapping": {"translation.ru": "translation_ru", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ru-tr": {"config_name": "ru-tr", "sample_row": "{\"translation.ru\": \"\\\"\\\\u0418\\\\u043d\\\\u0444\\\\u043e\\\\u0440\\\\u043c\\\\u0430\\\\u0446\\\\...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_ru", "translation_tr"], "columns_mapping": {"translation.ru": "translation_ru", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ru-zh": {"config_name": "ru-zh", "sample_row": "{\"translation.ru\": \"\\\"\\\\u041f\\\\u043e\\\\u043f\\\\u0440\\\\u043e\\\\u0431\\\\u0443\\\\u0439\\\\...\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", 
"columns": ["translation_ru", "translation_zh"], "columns_mapping": {"translation.ru": "translation_ru", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "so-sv": {"config_name": "so-sv", "sample_row": "{\"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\", \"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\"}", "columns": ["translation_so", "translation_sv"], "columns_mapping": {"translation.so": "translation_so", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "so-tr": {"config_name": "so-tr", "sample_row": "{\"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_so", "translation_tr"], "columns_mapping": {"translation.so": "translation_so", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "so-zh": {"config_name": "so-zh", "sample_row": "{\"translation.so\": \"\\\"Xulo luqad kale.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_so", "translation_zh"], "columns_mapping": {"translation.so": "translation_so", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "sv-tr": {"config_name": "sv-tr", "sample_row": "{\"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_sv", "translation_tr"], "columns_mapping": {"translation.sv": 
"translation_sv", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "sv-zh": {"config_name": "sv-zh", "sample_row": "{\"translation.sv\": \"\\\"V\\\\u00e4lj n\\\\u00e5got annat spr\\\\u00e5k.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_sv", "translation_zh"], "columns_mapping": {"translation.sv": "translation_sv", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "tr-zh": {"config_name": "tr-zh", "sample_row": "{\"translation.tr\": \"\\\"L\\\\u00fctfen, ba\\\\u015fka bir dil se\\\\u00e7iniz.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_tr", "translation_zh"], "columns_mapping": {"translation.tr": "translation_tr", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:ar", "language:en", "language:es", "language:et", "language:fa", "language:fi", "language:fr", "language:ru", "language:so", "language:sv", "language:tr", "language:zh"], "is_gated": false}, "opus_memat": {"dataset_name": "opus_memat", "description": "Xhosa-English parallel corpora, funded by EPSRC, the Medical Machine Translation project worked on machine translation between ixiXhosa and English, with a focus on the medical domain.", "downloads": 285, "configs": {"xh-en": {"config_name": "xh-en", "sample_row": "{\"translation.xh\": \"\\\"Kwathi emva kokufa kukaSawule, uDavide ebuyile ek...\", \"translation.en\": \"\\\"It happened after the death of Saul, when David w...\"}", "columns": 
["translation_xh", "translation_en"], "columns_mapping": {"translation.xh": "translation_xh", "translation.en": "translation_en"}, "dataset_description": "Xhosa-English parallel corpora, funded by EPSRC, the Medical Machine Translation project worked on machine translation between ixiXhosa and English, with a focus on the medical domain.", "dataset_name": "opus_memat"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:en", "language:xh"], "is_gated": false}, "opus_montenegrinsubs": {"dataset_name": "opus_montenegrinsubs", "description": "Opus MontenegrinSubs dataset for machine translation task, for language pair en-me: english and montenegrin", "downloads": 285, "configs": {"en-me": {"config_name": "en-me", "sample_row": "{\"translation.en\": \"\\\"Season 1 Episode 1 Pilot (Dimension)\\\"\", \"translation.me\": \"\\\"OPASNE IGRE Pilot epizoda\\\"\"}", "columns": ["translation_en", "translation_me"], "columns_mapping": {"translation.en": "translation_en", "translation.me": "translation_me"}, "dataset_description": "Opus MontenegrinSubs dataset for machine translation task, for language pair en-me: english and montenegrin\n", "dataset_name": "opus_montenegrinsubs"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:cnr", "language:en"], "is_gated": false}, "opus_openoffice": {"dataset_name": "opus_openoffice", "description": "A collection of documents from http://www.openoffice.org/.", "downloads": 4103, "configs": {"de-en_GB": {"config_name": "de-en_GB", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\"}", "columns": ["translation_de", "translation_en_GB"], "columns_mapping": {"translation.de": "translation_de", "translation.en_GB": "translation_en_GB"}, "dataset_description": "A collection of documents 
from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-es": {"config_name": "de-es", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\"}", "columns": ["translation_de", "translation_es"], "columns_mapping": {"translation.de": "translation_de", "translation.es": "translation_es"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\"}", "columns": ["translation_de", "translation_fr"], "columns_mapping": {"translation.de": "translation_de", "translation.fr": "translation_fr"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-ja": {"config_name": "de-ja", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\"}", "columns": ["translation_de", "translation_ja"], "columns_mapping": {"translation.de": "translation_de", "translation.ja": "translation_ja"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-ru": {"config_name": "de-ru", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\"}", "columns": ["translation_de", "translation_ru"], "columns_mapping": {"translation.de": "translation_de", "translation.ru": "translation_ru"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-sv": {"config_name": "de-sv", "sample_row": "{\"translation.de\": \"\\\"Diagramme in 
$[officename]\\\"\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_de", "translation_sv"], "columns_mapping": {"translation.de": "translation_de", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-zh_CN": {"config_name": "de-zh_CN", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_de", "translation_zh_CN"], "columns_mapping": {"translation.de": "translation_de", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-es": {"config_name": "en_GB-es", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\"}", "columns": ["translation_en_GB", "translation_es"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.es": "translation_es"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-fr": {"config_name": "en_GB-fr", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\"}", "columns": ["translation_en_GB", "translation_fr"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.fr": "translation_fr"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-ja": {"config_name": "en_GB-ja", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\"}", "columns": ["translation_en_GB", "translation_ja"], 
"columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.ja": "translation_ja"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-ru": {"config_name": "en_GB-ru", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\"}", "columns": ["translation_en_GB", "translation_ru"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.ru": "translation_ru"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-sv": {"config_name": "en_GB-sv", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_en_GB", "translation_sv"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-zh_CN": {"config_name": "en_GB-zh_CN", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_en_GB", "translation_zh_CN"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "es-fr": {"config_name": "es-fr", "sample_row": "{\"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\", \"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\"}", "columns": ["translation_es", "translation_fr"], "columns_mapping": {"translation.es": "translation_es", "translation.fr": "translation_fr"}, 
"dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "es-ja": {"config_name": "es-ja", "sample_row": "{\"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\", \"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\"}", "columns": ["translation_es", "translation_ja"], "columns_mapping": {"translation.es": "translation_es", "translation.ja": "translation_ja"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "es-ru": {"config_name": "es-ru", "sample_row": "{\"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\", \"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\"}", "columns": ["translation_es", "translation_ru"], "columns_mapping": {"translation.es": "translation_es", "translation.ru": "translation_ru"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "es-sv": {"config_name": "es-sv", "sample_row": "{\"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_es", "translation_sv"], "columns_mapping": {"translation.es": "translation_es", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "es-zh_CN": {"config_name": "es-zh_CN", "sample_row": "{\"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_es", "translation_zh_CN"], "columns_mapping": {"translation.es": "translation_es", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": 
"opus_openoffice"}, "fr-ja": {"config_name": "fr-ja", "sample_row": "{\"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\", \"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\"}", "columns": ["translation_fr", "translation_ja"], "columns_mapping": {"translation.fr": "translation_fr", "translation.ja": "translation_ja"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "fr-ru": {"config_name": "fr-ru", "sample_row": "{\"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\", \"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\"}", "columns": ["translation_fr", "translation_ru"], "columns_mapping": {"translation.fr": "translation_fr", "translation.ru": "translation_ru"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "fr-sv": {"config_name": "fr-sv", "sample_row": "{\"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_fr", "translation_sv"], "columns_mapping": {"translation.fr": "translation_fr", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "fr-zh_CN": {"config_name": "fr-zh_CN", "sample_row": "{\"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_fr", "translation_zh_CN"], "columns_mapping": {"translation.fr": "translation_fr", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "ja-ru": {"config_name": "ja-ru", "sample_row": "{\"translation.ja\": \"\\\"$[officename] 
\\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\", \"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\"}", "columns": ["translation_ja", "translation_ru"], "columns_mapping": {"translation.ja": "translation_ja", "translation.ru": "translation_ru"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "ja-sv": {"config_name": "ja-sv", "sample_row": "{\"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_ja", "translation_sv"], "columns_mapping": {"translation.ja": "translation_ja", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "ja-zh_CN": {"config_name": "ja-zh_CN", "sample_row": "{\"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_ja", "translation_zh_CN"], "columns_mapping": {"translation.ja": "translation_ja", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "ru-sv": {"config_name": "ru-sv", "sample_row": "{\"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_ru", "translation_sv"], "columns_mapping": {"translation.ru": "translation_ru", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "ru-zh_CN": {"config_name": "ru-zh_CN", "sample_row": "{\"translation.ru\": 
\"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_ru", "translation_zh_CN"], "columns_mapping": {"translation.ru": "translation_ru", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "sv-zh_CN": {"config_name": "sv-zh_CN", "sample_row": "{\"translation.sv\": \"\\\"Diagram i $[officename]\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_sv", "translation_zh_CN"], "columns_mapping": {"translation.sv": "translation_sv", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:de", "language:en", "language:es", "language:fr", "language:ja", "language:ru", "language:sv", "language:zh"], "is_gated": false}, "opus_paracrawl": {"dataset_name": "opus_paracrawl", "description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G", "downloads": 1592, "configs": {"el-en": {"config_name": "el-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"\\\\u03a3\\\\u03a5\\\\u039c\\\\u039c\\\\u0395\\\\u03a4\\\\u039f\\\\u03a7\\\\...\", \"translation.en\": \"\\\"PARTICIPATION IN THE NATIONAL CONFERENCE OF NORTH...\"}", "columns": ["id", "translation_el", "translation_en"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.en": "translation_en"}, "dataset_description": "Parallel corpora from Web Crawls collected 
in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "en-km": {"config_name": "en-km", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"INCLUDED IN THIS BOOK ARE SECTIONS 103, 104, 105,...\", \"translation.km\": \"\\\"\\\\u179a\\\\u17bd\\\\u1798 \\\\u1794\\\\u1789\\\\u17d2\\\\u1785\\\\u17bc...\"}", "columns": ["id", "translation_en", "translation_km"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.km": "translation_km"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "en-so": {"config_name": "en-so", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"11 And the smoke of their torments shall ascend u...\", \"translation.so\": \"\\\"11 Oo qiiqa caddibaaddooda kor buu u baxayaa weli...\"}", "columns": ["id", "translation_en", "translation_so"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.so": "translation_so"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "de-pl": {"config_name": "de-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"126:2.5 (1388.5) Im Laufe der Jahre ma\\\\u00df dies...\", \"translation.pl\": \"\\\"(1388.5) 126:2.5 Z up\\\\u0142ywem lat m\\\\u0142ody ci...\"}", "columns": ["id", "translation_de", "translation_pl"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.pl": "translation_pl"}, "dataset_description": "Parallel 
corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "fr-nl": {"config_name": "fr-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"3 Le sucre peut mettre fin \\\\u00e0 la grossesse?\\\"...\", \"translation.nl\": \"\\\"3 Kan suiker be\\\\u00ebindigen van de zwangerschap?...\"}", "columns": ["id", "translation_fr", "translation_nl"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.nl": "translation_nl"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "en-sw": {"config_name": "en-sw", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"1. 
We have all sinned and are therefore separated...\", \"translation.sw\": \"\\\"\\\\u2022 Wote tumetenda dhambi na kwa hivyo tumeten...\"}", "columns": ["id", "translation_en", "translation_sw"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.sw": "translation_sw"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "en-tl": {"config_name": "en-tl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Because police would not allow amplification syst...\", \"translation.tl\": \"\\\"Dahil ang pulis hindi papayagan ang paglaki mga s...\"}", "columns": ["id", "translation_en", "translation_tl"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.tl": "translation_tl"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "es-gl": {"config_name": "es-gl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Esta vez el cuadro local con el segundo de la tar...\", \"translation.gl\": \"\\\"Ga\\\\u00f1ou confianza o cadro local co segundo da ...\"}", "columns": ["id", "translation_es", "translation_gl"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.gl": "translation_gl"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}}, "tags": ["task_categories:translation", "annotations_creators:found", 
"multilinguality:multilingual", "source_datasets:original", "language:bg", "language:ca", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:eu", "language:fi", "language:fr", "language:ga", "language:gl", "language:hr", "language:hu", "language:is", "language:it", "language:km", "language:ko", "language:lt", "language:lv", "language:mt", "language:my", "language:nb", "language:ne", "language:nl", "language:nn", "language:pl", "language:pt", "language:ro", "language:ru", "language:si", "language:sk", "language:sl", "language:so", "language:sv", "language:sw", "language:tl", "language:uk", "language:zh"], "is_gated": false}, "opus_rf": {"dataset_name": "opus_rf", "description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.", "downloads": 1554, "configs": {"de-en": {"config_name": "de-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"REGIERUNGSERKL\\\\u00c4RUNG abgegeben vom Ministerpr...\", \"translation.en\": \"\\\"Statement of Government Policy by the Prime Minis...\"}", "columns": ["id", "translation_de", "translation_en"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.en": "translation_en"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "de-es": {"config_name": "de-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"MINISTERIUM DES AUSW\\\\u00c4RTIGEN Presseabteilung\\\"...\", \"translation.es\": \"\\\"MINISTERIO DE ASUNTOS EXTERIORES Servicio de Pren...\"}", "columns": ["id", "translation_de", "translation_es"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.es": "translation_es"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, 
"de-fr": {"config_name": "de-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"REGIERUNGSERKL\\\\u00c4RUNG abgegeben vom Ministerpr...\", \"translation.fr\": \"\\\"Declaration de Politique G\\\\u00e9n\\\\u00e9rale du Go...\"}", "columns": ["id", "translation_de", "translation_fr"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.fr": "translation_fr"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "de-sv": {"config_name": "de-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"REGIERUNGSERKL\\\\u00c4RUNG abgegeben vom Ministerpr...\", \"translation.sv\": \"\\\"REGERINGSF\\\\u00d6RKLARING.\\\"\"}", "columns": ["id", "translation_de", "translation_sv"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.sv": "translation_sv"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "en-es": {"config_name": "en-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"MINISTRY FOR FOREIGN AFFAIRS Press Section Check ...\", \"translation.es\": \"\\\"MINISTERIO DE ASUNTOS EXTERIORES Servicio de Pren...\"}", "columns": ["id", "translation_en", "translation_es"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Statement of Government Policy by the Prime Minis...\", \"translation.fr\": \"\\\"Declaration de Politique G\\\\u00e9n\\\\u00e9rale du Go...\"}", "columns": ["id", "translation_en", "translation_fr"], "columns_mapping": {"id": "id", "translation.en": 
"translation_en", "translation.fr": "translation_fr"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "en-sv": {"config_name": "en-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Statement of Government Policy by the Prime Minis...\", \"translation.sv\": \"\\\"REGERINGSF\\\\u00d6RKLARING.\\\"\"}", "columns": ["id", "translation_en", "translation_sv"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.sv": "translation_sv"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "es-fr": {"config_name": "es-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"MINISTERIO DE ASUNTOS EXTERIORES Servicio de Pren...\", \"translation.fr\": \"\\\"MINISTERE DES AFFAIRES \\\\u00c9TRANGERES Service de...\"}", "columns": ["id", "translation_es", "translation_fr"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.fr": "translation_fr"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "es-sv": {"config_name": "es-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"MINISTERIO DE ASUNTOS EXTERIORES Servicio de Pren...\", \"translation.sv\": \"\\\"Fru talman, \\\\u00e4rade ledam\\\\u00f6ter av Sveriges...\"}", "columns": ["id", "translation_es", "translation_sv"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.sv": "translation_sv"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "fr-sv": {"config_name": "fr-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Declaration de Politique 
G\\\\u00e9n\\\\u00e9rale du Go...\", \"translation.sv\": \"\\\"REGERINGSF\\\\u00d6RKLARING.\\\"\"}", "columns": ["id", "translation_fr", "translation_sv"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.sv": "translation_sv"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:de", "language:en", "language:es", "language:fr", "language:sv"], "is_gated": false}, "opus_tedtalks": {"dataset_name": "opus_tedtalks", "description": "This is a Croatian-English parallel corpus of transcribed and translated TED talks, originally extracted from https://wit3.fbk.eu. The corpus is compiled by \u017deljko Agi\u0107 and is taken from http://lt.ffzg.hr/zagic provided under the CC-BY-NC-SA license.\n2 languages, total number of files: 2\ntotal number of tokens: 2.81M\ntotal number of sentence fragments: 0.17M", "downloads": 303, "configs": {"en-hr": {"config_name": "en-hr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"I want you now to imagine a wearable robot that g...\", \"translation.hr\": \"\\\"\\\\u017delim da sada zamislite nosiv robot koji vam...\"}", "columns": ["id", "translation_en", "translation_hr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.hr": "translation_hr"}, "dataset_description": "This is a Croatian-English parallel corpus of transcribed and translated TED talks, originally extracted from https://wit3.fbk.eu. 
The corpus is compiled by \u017deljko Agi\u0107 and is taken from http://lt.ffzg.hr/zagic provided under the CC-BY-NC-SA license.\n2 languages, total number of files: 2\ntotal number of tokens: 2.81M\ntotal number of sentence fragments: 0.17M\n", "dataset_name": "opus_tedtalks"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:hr"], "is_gated": false}, "opus_ubuntu": {"dataset_name": "opus_ubuntu", "description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M", "downloads": 1555, "configs": {"as-bs": {"config_name": "as-bs", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.as\": \"\\\"Aisleriot \\\\u099a\\\\u09b2\\\\u09bf\\\\u099f\\\\u09c7\\\\u09df\\\\u0...\", \"translation.bs\": \"\\\"AisleRiot pasijans\\\"\"}", "columns": ["id", "translation_as", "translation_bs"], "columns_mapping": {"id": "id", "translation.as": "translation_as", "translation.bs": "translation_bs"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "az-cs": {"config_name": "az-cs", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.az\": \"\\\"Gmail, Google Docs, Google+ , YouTube v\\\\u0259 Pic...\", \"translation.cs\": \"\\\"Obashuje Gmail, Google Docs, Google+, YouTube a P...\"}", "columns": ["id", "translation_az", "translation_cs"], "columns_mapping": {"id": "id", "translation.az": "translation_az", "translation.cs": "translation_cs"}, "dataset_description": "A parallel corpus of Ubuntu localization files. 
Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "bg-de": {"config_name": "bg-de", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"\\\\u0412\\\\u043a\\\\u043b\\\\u044e\\\\u0447\\\\u0432\\\\u0430 Gmail,...\", \"translation.de\": \"\\\"Umfasst Gmail, Google Docs, Google+, YouTube und ...\"}", "columns": ["id", "translation_bg", "translation_de"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.de": "translation_de"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "br-es_PR": {"config_name": "br-es_PR", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"N'eo c'hall ket beza\\\\u00f1 an arguzenn %s evit %s...\", \"translation.es_PR\": \"\\\"argumento %s inv\\\\u00e1lido para %s\\\"\"}", "columns": ["id", "translation_br", "translation_es_PR"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.es_PR": "translation_es_PR"}, "dataset_description": "A parallel corpus of Ubuntu localization files. 
Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "bn-ga": {"config_name": "bn-ga", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bn\": \"\\\"AisleRiot \\\\u09b8\\\\u09b2\\\\u09bf\\\\u099f\\\\u09c7\\\\u09df\\\\u0...\", \"translation.ga\": \"\\\"Cluiche Aonair AisleRiot\\\"\"}", "columns": ["id", "translation_bn", "translation_ga"], "columns_mapping": {"id": "id", "translation.bn": "translation_bn", "translation.ga": "translation_ga"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "br-hi": {"config_name": "br-hi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"Enderc'hel a ra Gmail, Google Docs, Google+, YouT...\", \"translation.hi\": \"\\\"\\\\u0936\\\\u093e\\\\u092e\\\\u093f\\\\u0932 \\\\u0915\\\\u0930\\\\u0924...\"}", "columns": ["id", "translation_br", "translation_hi"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.hi": "translation_hi"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "br-la": {"config_name": "br-la", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"Mat eo\\\"\", \"translation.la\": \"\\\"Bene\\\"\"}", "columns": ["id", "translation_br", "translation_la"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.la": "translation_la"}, "dataset_description": "A parallel corpus of Ubuntu localization files. 
Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "bs-szl": {"config_name": "bs-szl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bs\": \"\\\"Uredu\\\"\", \"translation.szl\": \"\\\"OK\\\"\"}", "columns": ["id", "translation_bs", "translation_szl"], "columns_mapping": {"id": "id", "translation.bs": "translation_bs", "translation.szl": "translation_szl"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "br-uz": {"config_name": "br-uz", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"Kas an danevell a-fet kudenno\\\\u00f9 da baotred an...\", \"translation.uz\": \"\\\"\\\\u0422\\\\u0443\\\\u0437\\\\u0443\\\\u0432\\\\u0447\\\\u0438\\\\u043b\\\\...\"}", "columns": ["id", "translation_br", "translation_uz"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.uz": "translation_uz"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "br-yi": {"config_name": "br-yi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"Mat eo\\\"\", \"translation.yi\": \"\\\"\\\\u05d2\\\\u05d5\\\\u05d8\\\"\"}", "columns": ["id", "translation_br", "translation_yi"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.yi": "translation_yi"}, "dataset_description": "A parallel corpus of Ubuntu localization files. 
Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}}, "tags": ["task_categories:translation", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:ace", "language:af", "language:ak", "language:am", "language:an", "language:ang", "language:ar", "language:ary", "language:as", "language:ast", "language:az", "language:ba", "language:bal", "language:be", "language:bem", "language:ber", "language:bg", "language:bho", "language:bn", "language:bo", "language:br", "language:brx", "language:bs", "language:bua", "language:byn", "language:ca", "language:ce", "language:ceb", "language:chr", "language:ckb", "language:co", "language:crh", "language:cs", "language:csb", "language:cv", "language:cy", "language:da", "language:de", "language:dsb", "language:dv", "language:dz", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:ff", "language:fi", "language:fil", "language:fo", "language:fr", "language:frm", "language:frp", "language:fur", "language:fy", "language:ga", "language:gd", "language:gl", "language:gn", "language:grc", "language:gu", "language:guc", "language:gv", "language:ha", "language:haw", "language:he", "language:hi", "language:hil", "language:hne", "language:hr", "language:hsb", "language:ht", "language:hu", "language:hy", "language:ia", "language:id", "language:ig", "language:io", "language:is", "language:it", "language:iu", "language:ja", "language:jbo", "language:jv", "language:ka", "language:kab", "language:kg", "language:kk", "language:kl", "language:km", "language:kn", "language:ko", "language:kok", "language:ks", "language:ksh", "language:ku", "language:kw", "language:ky", "language:la", "language:lb", "language:lg", "language:li", 
"language:lij", "language:lld", "language:ln", "language:lo", "language:lt", "language:ltg", "language:lv", "language:mai", "language:mg", "language:mh", "language:mhr", "language:mi", "language:miq", "language:mk", "language:ml", "language:mn", "language:mr", "language:ms", "language:mt", "language:mus", "language:my", "language:nan", "language:nap", "language:nb", "language:nds", "language:ne", "language:nhn", "language:nl", "language:nn", "language:no", "language:nso", "language:ny", "language:oc", "language:om", "language:or", "language:os", "language:pa", "language:pam", "language:pap", "language:pl", "language:pms", "language:pmy", "language:ps", "language:pt", "language:qu", "language:rm", "language:ro", "language:rom", "language:ru", "language:rw", "language:sa", "language:sc", "language:sco", "language:sd", "language:se", "language:shn", "language:shs", "language:si", "language:sk", "language:sl", "language:sm", "language:sml", "language:sn", "language:so", "language:son", "language:sq", "language:sr", "language:st", "language:sv", "language:sw", "language:syr", "language:szl", "language:ta", "language:te", "language:tet", "language:tg", "language:th", "language:ti", "language:tk", "language:tl", "language:tlh", "language:tr", "language:trv", "language:ts", "language:tt", "language:ug", "language:uk", "language:ur", "language:uz", "language:ve", "language:vec", "language:vi", "language:wa", "language:wae", "language:wo", "language:xal", "language:xh", "language:yi", "language:yo", "language:zh", "language:zu", "language:zza"], "is_gated": false}, "opus_wikipedia": {"dataset_name": "opus_wikipedia", "description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. 
Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M", "downloads": 1114, "configs": {"ar-en": {"config_name": "ar-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ar\": \"\\\"* Encyclopaedia of Mathematics online encyclopaed...\", \"translation.en\": \"\\\"*Encyclopaedia of Mathematics online encyclopaedi...\"}", "columns": ["id", "translation_ar", "translation_en"], "columns_mapping": {"id": "id", "translation.ar": "translation_ar", "translation.en": "translation_en"}, "dataset_description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M\n", "dataset_name": "opus_wikipedia"}, "ar-pl": {"config_name": "ar-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ar\": \"\\\"\\\\u0646\\\\u0638\\\\u0627\\\\u0645 \\\\u062a\\\\u0631\\\\u0645\\\\u064a...\", \"translation.pl\": \"\\\"ASCII (czyt.\\\"\"}", "columns": ["id", "translation_ar", "translation_pl"], "columns_mapping": {"id": "id", "translation.ar": "translation_ar", "translation.pl": "translation_pl"}, "dataset_description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. 
Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M\n", "dataset_name": "opus_wikipedia"}, "en-sl": {"config_name": "en-sl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"__NOTOC__Year 886 (DCCCLXXXVI) was a common year ...\", \"translation.sl\": \"\\\"\\\\u017divel je predvsem v Bagdadu.\\\"\"}", "columns": ["id", "translation_en", "translation_sl"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.sl": "translation_sl"}, "dataset_description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M\n", "dataset_name": "opus_wikipedia"}, "en-ru": {"config_name": "en-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Average temperatures on the coast are in January ...\", \"translation.ru\": \"\\\"\\\\u0427\\\\u0435\\\\u0440\\\\u0435\\\\u0437 \\\\u043d\\\\u0435\\\\u0434...\"}", "columns": ["id", "translation_en", "translation_ru"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.ru": "translation_ru"}, "dataset_description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. 
Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M\n", "dataset_name": "opus_wikipedia"}, "en-vi": {"config_name": "en-vi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"A. Blackburn (2009).\\\"\", \"translation.vi\": \"\\\"A. Blackburn (2009).\\\"\"}", "columns": ["id", "translation_en", "translation_vi"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.vi": "translation_vi"}, "dataset_description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M\n", "dataset_name": "opus_wikipedia"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:ar", "language:bg", "language:cs", "language:de", "language:el", "language:en", "language:es", "language:fa", "language:fr", "language:he", "language:hu", "language:it", "language:nl", "language:pl", "language:pt", "language:ro", "language:ru", "language:sl", "language:tr", "language:vi"], "is_gated": false}, "opus_xhosanavy": {"dataset_name": "opus_xhosanavy", "description": "This dataset is designed for machine translation from English to Xhosa.", "downloads": 349, "configs": {"en-xh": {"config_name": "en-xh", "sample_row": 
"{\"translation.en\": \"\\\"Rope and its Usage\\\"\", \"translation.xh\": \"\\\"Intambo nomsebenzi ewenzayo.\\\"\"}", "columns": ["translation_en", "translation_xh"], "columns_mapping": {"translation.en": "translation_en", "translation.xh": "translation_xh"}, "dataset_description": "This dataset is designed for machine translation from English to Xhosa.", "dataset_name": "opus_xhosanavy"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:en", "language:xh"], "is_gated": false}, "oscar": {"dataset_name": "oscar", "description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "downloads": 57575, "configs": {"unshuffled_deduplicated_af": {"config_name": "unshuffled_deduplicated_af", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"aanlyn markte as gevolg van ons voortgesette 'n b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_als": {"config_name": "unshuffled_deduplicated_als", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dr 6. Augschte isch dr 218. 
Dag vum Gregorianisch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_am": {"config_name": "unshuffled_deduplicated_am", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u12a0\\\\u12e8\\\\u122d \\\\u1218\\\\u1295\\\\u1308\\\\u12f1 \\\\u12a...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_an": {"config_name": "unshuffled_deduplicated_an", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0648\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ar": {"config_name": "unshuffled_deduplicated_ar", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0645\\\\u0631\\\\u062d\\\\u0628\\\\u0627 \\\\u0628\\\\u0643 \\\\u063...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_arz": {"config_name": "unshuffled_deduplicated_arz", "sample_row": 
"{\"id\": \"0\", \"text\": \"\\\"..\\\\u064c::\\\\u064c:: \\\\u0627\\\\u0644\\\\u0646\\\\u0633\\\\u0627...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_as": {"config_name": "unshuffled_deduplicated_as", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0986\\\\u09ae\\\\u09bf, \\\\u098f\\\\u0987 \\\\u09b8\\\\u0982\\\\u09...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ast": {"config_name": "unshuffled_deduplicated_ast", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"The Killers llanzaron el so \\\\u00e1lbum deb\\\\u00fa,...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_av": {"config_name": "unshuffled_deduplicated_av", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0416\\\\u0438\\\\u043d\\\\u0434\\\\u0430 \\\\u043c\\\\u0430\\\\u043b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_az": 
{"config_name": "unshuffled_deduplicated_az", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"AZTV-Art\\\\u0131q 7 ildir ki, Ab\\\\u015feron rayonu d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_azb": {"config_name": "unshuffled_deduplicated_azb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0644\\\\u0639\\\\u0644\\\\u06cc \\\\u0661\\\\u0663-\\\\u062c\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ba": {"config_name": "unshuffled_deduplicated_ba", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041a\\\\u04d9\\\\u0441\\\\u0435\\\\u0440 \\\\u043c\\\\u0430\\\\u0442...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bar": {"config_name": "unshuffled_deduplicated_bar", "sample_row": "{\"id\": \"0\", \"text\": \"\\\" ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bcl": 
{"config_name": "unshuffled_deduplicated_bcl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"& \\\\u00ff \\\\u00f3 / \\\\u00ed 0 - \\\\u00f8 \\\\u00fb \\\\u00f9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_be": {"config_name": "unshuffled_deduplicated_be", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0411\\\\u0440\\\\u044d\\\\u0441\\\\u0446\\\\u043a\\\\u0456\\\\u044f ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bg": {"config_name": "unshuffled_deduplicated_bg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0416\\\\u0410\\\\u041b\\\\u0411\\\\u041e\\\\u041f\\\\u041e\\\\u0414\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bh": {"config_name": "unshuffled_deduplicated_bh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0908 \\\\u0938\\\\u0947\\\\u0939\\\\u0924 \\\\u0906 \\\\u0938\\\\u09...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy 
architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bn": {"config_name": "unshuffled_deduplicated_bn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u09ad\\\\u09dc\\\\u0982 \\\\u09b8\\\\u09b0\\\\u09cd\\\\u09ac\\\\u09b8...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bo": {"config_name": "unshuffled_deduplicated_bo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0f56\\\\u0f7c\\\\u0f51\\\\u0f0b\\\\u0f58\\\\u0f72\\\\u0f0b\\\\u0f60\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bpy": {"config_name": "unshuffled_deduplicated_bpy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u09aa\\\\u09cc\\\\u09b0\\\\u09b8\\\\u09ad\\\\u09be \\\\u098f\\\\u09b9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_br": {"config_name": "unshuffled_deduplicated_br", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Ar mank Magalh\\\\u00e3es(Daveo\\\\u00f9 a vank) a zo u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification 
and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bs": {"config_name": "unshuffled_deduplicated_bs", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u017e \\\\u0161\\\\u0159 \\\\u00e9 \\\\u00fa \\\\u0161\\\\u0159 \\\\u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bxr": {"config_name": "unshuffled_deduplicated_bxr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0421\\\\u0430\\\\u0433\\\\u0430\\\\u0430\\\\u043d h\\\\u0430\\\\u044...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ca": {"config_name": "unshuffled_deduplicated_ca", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Daniel Vendrell, conegut com Vandrell, ha sigut u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_cbk": {"config_name": "unshuffled_deduplicated_cbk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"yo gano yo gano yo gano yo gano yo gano yo gano y...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus 
obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ce": {"config_name": "unshuffled_deduplicated_ce", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0413\\\\u0440\\\\u0435\\\\u043d\\\\u043b\\\\u0430\\\\u043d\\\\u0434\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ceb": {"config_name": "unshuffled_deduplicated_ceb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Si Isko walay pupamilok nga nagtan-aw sa unahan, ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ckb": {"config_name": "unshuffled_deduplicated_ckb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0631\\\\u0633\\\\u06cc \\\\u0631\\\\u06c6\\\\u0698 - \\\\u0633\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_cs": {"config_name": "unshuffled_deduplicated_cs", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Oran\\\\u017eovou stuhu 2018 z\\\\u00edskala od Ministe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH 
coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_cv": {"config_name": "unshuffled_deduplicated_cv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0428\\\\u044b\\\\u0440\\\\u0430\\\\u043d\\\\u04d1 \\\\u0447\\\\u0443...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_cy": {"config_name": "unshuffled_deduplicated_cy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mae capeli Cymreig yr Andes ym Mhatagonia wedi cy...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_da": {"config_name": "unshuffled_deduplicated_da", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Den 2.-5. 
februar 2016 l\\\\u00f8b det tredje kursus...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_de": {"config_name": "unshuffled_deduplicated_de", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dosierf\\\\u00f6rderb\\\\u00e4nder Getriebe Entw\\\\u00e4s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_diq": {"config_name": "unshuffled_deduplicated_diq", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Z\\\\u0131wan\\\\u00ea Slawki, z\\\\u0131wano merduman\\\\u00...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_dsb": {"config_name": "unshuffled_deduplicated_dsb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zg\\\\u00f3\\\\u0144\\\\u015bo w\\\\u011bcej w\\\\u00f3 l\\\\u011bp...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_dv": {"config_name": "unshuffled_deduplicated_dv", "sample_row": "{\"id\": \"0\", 
\"text\": \"\\\"\\\\u0784. \\\\u0787\\\\u07a6\\\\u078c\\\\u07ae\\\\u0785\\\\u07aa\\\\u078...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_el": {"config_name": "unshuffled_deduplicated_el", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u039d\\\\u03b5\\\\u03ba\\\\u03c1\\\\u03cc\\\\u03c2 \\\\u03b5\\\\u03bd...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_eml": {"config_name": "unshuffled_deduplicated_eml", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"A s\\\\u00e9guit dal pruc\\\\u00e8s ad rubuti\\\\u015basi\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_en": {"config_name": "unshuffled_deduplicated_en", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mtendere Village was inspired by the vision of Ch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_eo": {"config_name": 
"unshuffled_deduplicated_eo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0108u ... pre\\\\u011di | mediti | ricevi instigoj...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_es": {"config_name": "unshuffled_deduplicated_es", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Como se librar\\\\u00e1 de la celulitis en el gimnas...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_et": {"config_name": "unshuffled_deduplicated_et", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"MT\\\\u00dc AB Video j\\\\u00e4rgib oma tegevuses kodan...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_eu": {"config_name": "unshuffled_deduplicated_eu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Gure jarduerek eraikuntzarekin, elkarbizitzarekin...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_fa": {"config_name": 
"unshuffled_deduplicated_fa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0627\\\\u0645\\\\u0634\\\\u0628 \\\\u0628\\\\u0627\\\\u0631\\\\u0648...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_fi": {"config_name": "unshuffled_deduplicated_fi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Luokka Kauniita tytt\\\\u00f6j\\\\u00e4, Teini, Porno p...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_fr": {"config_name": "unshuffled_deduplicated_fr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"M\\\\u00e9dia de d\\\\u00e9bat d'id\\\\u00e9es, de culture...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_frr": {"config_name": "unshuffled_deduplicated_frr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Hiragana\\\\u2019 Practice\\\\u2019Sheet\\\\u20191\\\\u2019(A...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, 
"unshuffled_deduplicated_fy": {"config_name": "unshuffled_deduplicated_fy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Veen, Klaas F. van der et al1984-2011Wurdboek fan...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ga": {"config_name": "unshuffled_deduplicated_ga", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Is f\\\\u00f3ram \\\\u00e9 seo chun pl\\\\u00e9 a dh\\\\u00e9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_gd": {"config_name": "unshuffled_deduplicated_gd", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zhou Yujun, a 'ph\\\\u00e0rtaidh R\\\\u00f9naire Comata...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_gl": {"config_name": "unshuffled_deduplicated_gl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"O persoal de Inditex da provincia de Pontevedra s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, 
"unshuffled_deduplicated_gn": {"config_name": "unshuffled_deduplicated_gn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Oiko pete\\\\u0129 kirir\\\\u0129 \\\\u00f1emond\\\\u00fdi pe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_gom": {"config_name": "unshuffled_deduplicated_gom", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0905\\\\u091c\\\\u093e\\\\u0915\\\\u0902\\\\u0920\\\\u0940\\\\u0902\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_gu": {"config_name": "unshuffled_deduplicated_gu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0a85\\\\u0aa7\\\\u0abf\\\\u0a95 \\\\u0aae\\\\u0abe\\\\u0ab8 \\\\u0a9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_he": {"config_name": "unshuffled_deduplicated_he", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u05d6\\\\u05e7\\\\u05d5\\\\u05e7\\\\u05d9\\\\u05dd \\\\u05dc\\\\u05e8...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl 
corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_hi": {"config_name": "unshuffled_deduplicated_hi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"'\\\\u0906\\\\u0907\\\\u091f\\\\u092e \\\\u0917\\\\u0930\\\\u094d\\\\u093...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_hr": {"config_name": "unshuffled_deduplicated_hr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"U raspravi je sudjelovao i HSS-ov saborski zastup...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_hsb": {"config_name": "unshuffled_deduplicated_hsb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Budy\\\\u0161in (SN/B\\\\u0160e). 
Elektronikarjo m\\\\u011...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ht": {"config_name": "unshuffled_deduplicated_ht", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u043d\\\\u0430\\\\u0447\\\\u0430\\\\u0442\\\\u044c us $ nan us ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_hu": {"config_name": "unshuffled_deduplicated_hu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"monster - Amat\\\\u0151r, h\\\\u00e1zi szex vide\\\\u00f3k...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_hy": {"config_name": "unshuffled_deduplicated_hy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0531\\\\u0580\\\\u0581\\\\u0561\\\\u056d\\\\u056b \\\\u0540\\\\u0561...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ia": {"config_name": "unshuffled_deduplicated_ia", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha h...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_id": {"config_name": "unshuffled_deduplicated_id", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"~pic by cetusanminda. Marhalah yang sering disebu...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ie": {"config_name": "unshuffled_deduplicated_ie", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Plastic Yo Yo Metal Yo Yos Wooden Yo Yo Keychain ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ilo": {"config_name": "unshuffled_deduplicated_ilo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Segun ken ni Ping-ay, ti yellow corn ti maysa kad...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_io": {"config_name": "unshuffled_deduplicated_io", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Chekia esas 
parlamentala republiko. La chefo di s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_is": {"config_name": "unshuffled_deduplicated_is", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Eyjar.net - uppl\\\\u00fdsinga- og fr\\\\u00e9ttami\\\\u00...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_it": {"config_name": "unshuffled_deduplicated_it", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"La estrazione numero 48 del 10 e LOTTO ogni 5 min...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ja": {"config_name": "unshuffled_deduplicated_ja", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u795e\\\\u793e\\\\u306a\\\\u3069\\\\u3078\\\\u4e00\\\\u7dd2\\\\u306b\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_jbo": {"config_name": "unshuffled_deduplicated_jbo", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"lo ni lo vacri cu glare cu banzuni lo nu ro da po...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_jv": {"config_name": "unshuffled_deduplicated_jv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Jos\\\\u00e9 Mourinho (diwaca: [\\\\u0292u\\\\u02c8z\\\\u025b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ka": {"config_name": "unshuffled_deduplicated_ka", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u10ec\\\\u10d0\\\\u10db\\\\u10d8\\\\u10e7\\\\u10d5\\\\u10d0\\\\u10dc\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_kk": {"config_name": "unshuffled_deduplicated_kk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0422\\\\u04af\\\\u043b\\\\u043a\\\\u0456\\\\u0431\\\\u0430\\\\u0441 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_km": {"config_name": "unshuffled_deduplicated_km", 
"sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u1781\\\\u17d2\\\\u179f\\\\u17b9\\\\u1794\\\\u178a\\\\u17b6\\\\u1780\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_kn": {"config_name": "unshuffled_deduplicated_kn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0cb0\\\\u0cbe\\\\u0cb7\\\\u0ccd\\\\u0c9f\\\\u0ccd\\\\u0cb0\\\\u0caa\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ko": {"config_name": "unshuffled_deduplicated_ko", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"CIA \\\\ud504\\\\ub85c\\\\uc81d\\\\ud2b8\\\\uc5d0\\\\uc11c\\\\ub294 \\\\u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_krc": {"config_name": "unshuffled_deduplicated_krc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0428\\\\u0430\\\\u043c\\\\u0445\\\\u0430\\\\u043d\\\\u043b\\\\u0430\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": 
"oscar"}, "unshuffled_deduplicated_ku": {"config_name": "unshuffled_deduplicated_ku", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"R\\\\u00eaxistina maf\\\\u00ean mirovan Freedom House r...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_kv": {"config_name": "unshuffled_deduplicated_kv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041a\\\\u043e\\\\u043c\\\\u0438 \\\\u043a\\\\u044b\\\\u0442\\\\u0448...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_kw": {"config_name": "unshuffled_deduplicated_kw", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\ud83d\\\\ude4f\\\\ud83c\\\\udffc\\\\ud83d\\\\ude4f\\\\ud83c\\\\udffc\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ky": {"config_name": "unshuffled_deduplicated_ky", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Turmush: \\\\u0411\\\\u0438\\\\u0448\\\\u043a\\\\u0435\\\\u043a \\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl 
corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_la": {"config_name": "unshuffled_deduplicated_la", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"H\\\\u00e6 sunt generationes No\\\\u00eb: No\\\\u00eb vir ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lb": {"config_name": "unshuffled_deduplicated_lb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"W\\\\u00e9i all Joers ass d'Fuesend nees eng m\\\\u00e9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lez": {"config_name": "unshuffled_deduplicated_lez", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0411\\\\u0435\\\\u0441, \\\\u043b\\\\u0435\\\\u0437\\\\u0433\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_li": {"config_name": "unshuffled_deduplicated_li", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"'t Good Goedenraad aan de Ezerbaek besjteit oet '...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of 
the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lmo": {"config_name": "unshuffled_deduplicated_lmo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Calvens\\\\u00e0 l'\\\\u00e8 a 24 km de la sit\\\\u00e0 e ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lo": {"config_name": "unshuffled_deduplicated_lo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0ea7\\\\u0eb5\\\\u200b\\\\u0ec2\\\\u0ead\\\\u200b\\\\u0ec0\\\\u0ead\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lrc": {"config_name": "unshuffled_deduplicated_lrc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0622\\\\u0631\\\\u0644\\\\u06cc\\\\u0646\\\\u06af\\\\u062a\\\\u0648\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lt": {"config_name": "unshuffled_deduplicated_lt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0160i programa pad\\\\u0117s geriau i\\\\u0161mokti i...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus 
obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lv": {"config_name": "unshuffled_deduplicated_lv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Divdesmit pirmaj\\\\u0101 apr\\\\u012bl\\\\u012b m\\\\u016bsu...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mai": {"config_name": "unshuffled_deduplicated_mai", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u096e \\\\u0938\\\\u093f\\\\u0924\\\\u092e\\\\u094d\\\\u092c\\\\u0930...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mg": {"config_name": "unshuffled_deduplicated_mg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Hijery ny Tenim-Pirenena rehetra? 
Mandika ny tant...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mhr": {"config_name": "unshuffled_deduplicated_mhr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0410\\\\u043a\\\\u0440\\\\u0435\\\\u0442 \\\\u0436\\\\u0430\\\\u043f...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_min": {"config_name": "unshuffled_deduplicated_min", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"holaholaholaholaholaholaholaholaholaholaholaholah...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mk": {"config_name": "unshuffled_deduplicated_mk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u201e\\\\u0424\\\\u0438\\\\u043b\\\\u043c \\\\u043f\\\\u043b\\\\u0443...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ml": {"config_name": "unshuffled_deduplicated_ml", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"\\\\u0d05\\\\u0d38\\\\u0d2d\\\\u0d4d\\\\u0d2f\\\\u0d35\\\\u0d41\\\\u0d02 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mn": {"config_name": "unshuffled_deduplicated_mn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041c\\\\u0423\\\\u0411\\\\u0418\\\\u0421-\\\\u044b\\\\u043d \\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mr": {"config_name": "unshuffled_deduplicated_mr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Home / motivational marathi story / \\\\u0909\\\\u0926\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mrj": {"config_name": "unshuffled_deduplicated_mrj", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041b\\\\u04f9\\\\u043f\\\\u04f9\\\\u0432\\\\u043b\\\\u04d3 (\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ms": {"config_name": 
"unshuffled_deduplicated_ms", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Suhaib memang antara orang yang aktif berprogram....\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mt": {"config_name": "unshuffled_deduplicated_mt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"tibg\\\\u0127at il-kaw\\\\u017ca lura lill-Qorti \\\\u0120...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mwl": {"config_name": "unshuffled_deduplicated_mwl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Deciplina social i out\\\\u00f3noma que angloba ateb...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_my": {"config_name": "unshuffled_deduplicated_my", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u103b\\\\u1019\\\\u1040\\\\u1010\\\\u102e - \\\\u101b\\\\u1014\\\\u10...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_myv": 
{"config_name": "unshuffled_deduplicated_myv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0427\\\\u0430\\\\u0447\\\\u0441\\\\u044c 1914 \\\\u0443\\\\u043c\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mzn": {"config_name": "unshuffled_deduplicated_mzn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0642\\\\u0631\\\\u0622\\\\u0646 \\\\u06cc\\\\u0627 \\\\u0642\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_nah": {"config_name": "unshuffled_deduplicated_nah", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"In m\\\\u0101cu\\\\u012blp\\\\u014dhualxihuitl VI (inic ch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_nap": {"config_name": "unshuffled_deduplicated_nap", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00f2 AUDIT \\\\u00ed \\\\u00c7 \\\\u00e8 \\\\u00ee \\\\u00ff \\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy 
architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_nds": {"config_name": "unshuffled_deduplicated_nds", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dor kann sik vun nu af an de hele plattd\\\\u00fc\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ne": {"config_name": "unshuffled_deduplicated_ne", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u092c\\\\u0930\\\\u094d\\\\u0926\\\\u093f\\\\u092c\\\\u093e\\\\u0938 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_new": {"config_name": "unshuffled_deduplicated_new", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0930\\\\u093e\\\\u0917 \\\\u0938\\\\u0941\\\\u0939\\\\u093e \\\\u091...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_nl": {"config_name": "unshuffled_deduplicated_nl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Op vrijdag 31 augustus wordt het nieuwe studiejaa...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the 
Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_nn": {"config_name": "unshuffled_deduplicated_nn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Planomtale krav til innhald Bakgrunn: Sp\\\\u00f8rsm...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_no": {"config_name": "unshuffled_deduplicated_no", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Slett midlertidige internett filer og informasjon...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_oc": {"config_name": "unshuffled_deduplicated_oc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"jizzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_or": {"config_name": "unshuffled_deduplicated_or", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0b2d\\\\u0b41\\\\u0b2c\\\\u0b28\\\\u0b47\\\\u0b36\\\\u0b4d\\\\u0b71\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering 
of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_os": {"config_name": "unshuffled_deduplicated_os", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"1. \\\\u041b\\\\u00e6\\\\u043f\\\\u043f\\\\u0443 \\\\u00e6\\\\u043c\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pa": {"config_name": "unshuffled_deduplicated_pa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0a30\\\\u0a1c\\\\u0a3f: \\\\u0a28\\\\u0a70: PB/JL-138/2018-...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pam": {"config_name": "unshuffled_deduplicated_pam", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00c1ku pu i Anak ning Al\\\\u00e1ya at ngeni ip\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pl": {"config_name": "unshuffled_deduplicated_pl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Wszyscy producenci Alkazar Opole Biuro Wydawnicze...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language 
classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pms": {"config_name": "unshuffled_deduplicated_pms", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dolina (an sloven; San Dorligo della Valle an ita...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pnb": {"config_name": "unshuffled_deduplicated_pnb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0641\\\\u0631\\\\u06cc\\\\u0646\\\\u06a9 \\\\u0628\\\\u0644\\\\u0646...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ps": {"config_name": "unshuffled_deduplicated_ps", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Many people usually use the time period \\\\u2018bus...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pt": {"config_name": "unshuffled_deduplicated_pt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Voc\\\\u00ea pode estar lendo este texto no sof\\\\u00e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus 
obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_qu": {"config_name": "unshuffled_deduplicated_qu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Topeka nisqa llaqtaqa, Kansas suyup, Hukllachasqa...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_rm": {"config_name": "unshuffled_deduplicated_rm", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"2. secziun Elavuraziun da datas e protecziun da d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ro": {"config_name": "unshuffled_deduplicated_ro", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u201c\\\\u00cen via\\\\u021b\\\\u0103, oportunitatea nu e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ru": {"config_name": "unshuffled_deduplicated_ru", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0415\\\\u0441\\\\u043b\\\\u0438 \\\\u0432\\\\u0430\\\\u0448\\\\u0438...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge 
multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sa": {"config_name": "unshuffled_deduplicated_sa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0905\\\\u0928\\\\u093f\\\\u0930\\\\u0941\\\\u0926\\\\u094d\\\\u0927\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sah": {"config_name": "unshuffled_deduplicated_sah", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_scn": {"config_name": "unshuffled_deduplicated_scn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"La gilus\\\\u00eca \\\\u00e8 nu sintimentu dulurusu ca ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sd": {"config_name": "unshuffled_deduplicated_sd", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0647\\\\u0631 \\\\u06aa\\\\u0648 \\\\u0684\\\\u0627\\\\u06bb\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, 
"dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sh": {"config_name": "unshuffled_deduplicated_sh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Op\\\\u0161tina Gornja Radgona se nalazi u sjeverois...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_si": {"config_name": "unshuffled_deduplicated_si", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0da2\\\\u0db1\\\\u0dcf\\\\u0db0\\\\u0dd2\\\\u0db4\\\\u0dad\\\\u0dd2\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sk": {"config_name": "unshuffled_deduplicated_sk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Aktivity | Agent\\\\u00fara podporovan\\\\u00e9ho zames...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sl": {"config_name": "unshuffled_deduplicated_sl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u010ce Creatures, ki je \\\\u017eelel, da pridejo n...\"}", "columns": ["id", "text"], "columns_mapping": {"id": 
"id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_so": {"config_name": "unshuffled_deduplicated_so", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sq": {"config_name": "unshuffled_deduplicated_sq", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00c7far\\\\u00eb do t\\\\u00eb m\\\\u00eb p\\\\u00eblqente ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sr": {"config_name": "unshuffled_deduplicated_sr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"*\\\\u041e\\\\u0432\\\\u0430 \\\\u043f\\\\u043e\\\\u0440\\\\u0443\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_su": {"config_name": "unshuffled_deduplicated_su", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Lalaki n\\\\u00e9mbongkeun kakuatan jeung vigor jeun...\"}", 
"columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sv": {"config_name": "unshuffled_deduplicated_sv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"1783 \\\\u00e4r ett viktigt \\\\u00e5rtal i den nya tid...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sw": {"config_name": "unshuffled_deduplicated_sw", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zarif: Iran inajua mpango wa Saudia wa kufanya ma...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ta": {"config_name": "unshuffled_deduplicated_ta", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0baa\\\\u0bca\\\\u0bb4\\\\u0bc1\\\\u0ba4\\\\u0bc1 \\\\u0b9a\\\\u0bbe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_te": {"config_name": "unshuffled_deduplicated_te", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"\\\\u0c39\\\\u0c30\\\\u0c4d\\\\u0c2f\\\\u0c3e\\\\u0c28\\\\u0c3e\\\\u0c32\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tg": {"config_name": "unshuffled_deduplicated_tg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u04b2\\\\u0443\\\\u043c\\\\u0430\\\\u0439\\\\u0440\\\\u043e \\\\u0433...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_th": {"config_name": "unshuffled_deduplicated_th", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0e1f\\\\u0e31\\\\u0e19\\\\u0e17\\\\u0e35\\\\u0e48\\\\u0e41\\\\u0e25\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tk": {"config_name": "unshuffled_deduplicated_tk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Amerikany\\\\u0148 Kaliforni\\\\u00fda \\\\u015ftatyndaky ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tl": {"config_name": 
"unshuffled_deduplicated_tl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Warning Signs na Sira ang Kidneys o Bato - ni Doc...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tr": {"config_name": "unshuffled_deduplicated_tr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Son y\\\\u0131llarda g\\\\u00f6r\\\\u00fclen ay tutulmalar...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tt": {"config_name": "unshuffled_deduplicated_tt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\\\\"\\\\u0418\\\\u0440\\\\u0435\\\\u043c\\\\u043d\\\\u0435\\\\u04a3 \\\\u04...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tyv": {"config_name": "unshuffled_deduplicated_tyv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u042d\\\\u043a\\\\u0438\\\\u0438, \\\\u0445\\\\u04af\\\\u043d\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, 
"unshuffled_deduplicated_ug": {"config_name": "unshuffled_deduplicated_ug", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0632\\\\u0627\\\\u06ad-\\\\u0621\\\\u062a\\\\u06c7\\\\u0632\\\\u0649...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_uk": {"config_name": "unshuffled_deduplicated_uk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041f\\\\u0440\\\\u043e \\\\u043d\\\\u0430\\\\u0434\\\\u0430\\\\u043d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ur": {"config_name": "unshuffled_deduplicated_ur", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0622\\\\u0626\\\\u06cc\\\\u06d2 \\\\u0627\\\\u06c1\\\\u0645 \\\\u062...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_uz": {"config_name": "unshuffled_deduplicated_uz", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Markazi Sariosiyo shaharchasi. 
1926-yil 29-sentab...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_vec": {"config_name": "unshuffled_deduplicated_vec", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Par ogni p\\\\u00f3nto, \\\\u0142a derivada \\\\u0142a xe ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_vi": {"config_name": "unshuffled_deduplicated_vi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Canh chua c\\\\u00e1 b\\\\u00f4ng lau kh\\\\u00f4ng ch\\\\u1e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_vo": {"config_name": "unshuffled_deduplicated_vo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mettweiler binon zif in fedal\\\\u00e4n: Rheinland-P...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_wa": {"config_name": "unshuffled_deduplicated_wa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"ci n' est n\\\\u00e9n l' 
viyaedje lu-minme ki sait e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_war": {"config_name": "unshuffled_deduplicated_war", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"An Tajan amo in usa ka komyun ha departamento han...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_wuu": {"config_name": "unshuffled_deduplicated_wuu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u4f0a15 [I] | \\\\u4f0a17 | \\\\u4f0a19 | \\\\u4f0a21 | \\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_xal": {"config_name": "unshuffled_deduplicated_xal", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0410\\\\u0440\\\\u043d\\\\u0433\\\\u0443\\\\u0434\\\\u0438\\\\u043d ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_xmf": {"config_name": "unshuffled_deduplicated_xmf", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"\\\\u10db\\\\u10dd\\\\u10e9\\\\u10d0\\\\u10db\\\\u10d8\\\\u10da\\\\u10d8 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_yi": {"config_name": "unshuffled_deduplicated_yi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u05de\\\\u05de\\\\u05e9\\\\u05d5\\\\u05ea\\\\u05d3\\\\u05d9\\\\u05e7 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_yo": {"config_name": "unshuffled_deduplicated_yo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Copyright \\\\u00a9 2018 BBC. 
BBC k\\\\u00f2 m\\\\u1ecd\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_yue": {"config_name": "unshuffled_deduplicated_yue", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"*hughughughughughughughughughughughughughughughug...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_zh": {"config_name": "unshuffled_deduplicated_zh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u65f6\\\\u95f4\\\\u53ef\\\\u4ee5\\\\u88ab\\\\u7f29\\\\u77ed\\\\uff0c\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_af": {"config_name": "unshuffled_original_af", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"aanlyn markte as gevolg van ons voortgesette 'n b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_als": {"config_name": "unshuffled_original_als", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dr 6. 
Augschte isch dr 218. Dag vum Gregorianisch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_am": {"config_name": "unshuffled_original_am", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u12a0\\\\u12e8\\\\u122d \\\\u1218\\\\u1295\\\\u1308\\\\u12f1 \\\\u12a...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_an": {"config_name": "unshuffled_original_an", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0648\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ar": {"config_name": "unshuffled_original_ar", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0645\\\\u0631\\\\u062d\\\\u0628\\\\u0627 \\\\u0628\\\\u0643 \\\\u063...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_arz": {"config_name": "unshuffled_original_arz", "sample_row": "{\"id\": \"0\", 
\"text\": \"\\\"..\\\\u064c::\\\\u064c:: \\\\u0627\\\\u0644\\\\u0646\\\\u0633\\\\u0627...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_as": {"config_name": "unshuffled_original_as", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0986\\\\u09ae\\\\u09bf, \\\\u098f\\\\u0987 \\\\u09b8\\\\u0982\\\\u09...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ast": {"config_name": "unshuffled_original_ast", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"The Killers llanzaron el so \\\\u00e1lbum deb\\\\u00fa,...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_av": {"config_name": "unshuffled_original_av", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0416\\\\u0438\\\\u043d\\\\u0434\\\\u0430 \\\\u043c\\\\u0430\\\\u043b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_az": {"config_name": "unshuffled_original_az", 
"sample_row": "{\"id\": \"0\", \"text\": \"\\\"AZTV-Art\\\\u0131q 7 ildir ki, Ab\\\\u015feron rayonu d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_azb": {"config_name": "unshuffled_original_azb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0644\\\\u0639\\\\u0644\\\\u06cc \\\\u0661\\\\u0663-\\\\u062c\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ba": {"config_name": "unshuffled_original_ba", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041a\\\\u04d9\\\\u0441\\\\u0435\\\\u0440 \\\\u043c\\\\u0430\\\\u0442...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bar": {"config_name": "unshuffled_original_bar", "sample_row": "{\"id\": \"0\", \"text\": \"\\\" ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bcl": {"config_name": "unshuffled_original_bcl", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"& \\\\u00ff \\\\u00f3 / \\\\u00ed 0 - \\\\u00f8 \\\\u00fb \\\\u00f9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_be": {"config_name": "unshuffled_original_be", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0411\\\\u0440\\\\u044d\\\\u0441\\\\u0446\\\\u043a\\\\u0456\\\\u044f ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bg": {"config_name": "unshuffled_original_bg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0416\\\\u0410\\\\u041b\\\\u0411\\\\u041e\\\\u041f\\\\u041e\\\\u0414\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bh": {"config_name": "unshuffled_original_bh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0908 \\\\u0938\\\\u0947\\\\u0939\\\\u0924 \\\\u0906 \\\\u0938\\\\u09...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bn": {"config_name": "unshuffled_original_bn", 
"sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u09ad\\\\u09dc\\\\u0982 \\\\u09b8\\\\u09b0\\\\u09cd\\\\u09ac\\\\u09b8...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bo": {"config_name": "unshuffled_original_bo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0f56\\\\u0f7c\\\\u0f51\\\\u0f0b\\\\u0f58\\\\u0f72\\\\u0f0b\\\\u0f60\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bpy": {"config_name": "unshuffled_original_bpy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u09aa\\\\u09cc\\\\u09b0\\\\u09b8\\\\u09ad\\\\u09be \\\\u098f\\\\u09b9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_br": {"config_name": "unshuffled_original_br", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Ar mank Magalh\\\\u00e3es(Daveo\\\\u00f9 a vank) a zo u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bs": 
{"config_name": "unshuffled_original_bs", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u017e \\\\u0161\\\\u0159 \\\\u00e9 \\\\u00fa \\\\u0161\\\\u0159 \\\\u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bxr": {"config_name": "unshuffled_original_bxr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0421\\\\u0430\\\\u0433\\\\u0430\\\\u0430\\\\u043d h\\\\u0430\\\\u044...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ca": {"config_name": "unshuffled_original_ca", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Daniel Vendrell, conegut com Vandrell, ha sigut u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_cbk": {"config_name": "unshuffled_original_cbk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"yo gano yo gano yo gano yo gano yo gano yo gano y...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ce": 
{"config_name": "unshuffled_original_ce", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0413\\\\u0440\\\\u0435\\\\u043d\\\\u043b\\\\u0430\\\\u043d\\\\u0434\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ceb": {"config_name": "unshuffled_original_ceb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Si Isko walay pupamilok nga nagtan-aw sa unahan, ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ckb": {"config_name": "unshuffled_original_ckb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0631\\\\u0633\\\\u06cc \\\\u0631\\\\u06c6\\\\u0698 - \\\\u0633\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_cs": {"config_name": "unshuffled_original_cs", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Oran\\\\u017eovou stuhu 2018 z\\\\u00edskala od Ministe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, 
"unshuffled_original_cv": {"config_name": "unshuffled_original_cv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0428\\\\u044b\\\\u0440\\\\u0430\\\\u043d\\\\u04d1 \\\\u0447\\\\u0443...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_cy": {"config_name": "unshuffled_original_cy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mae capeli Cymreig yr Andes ym Mhatagonia wedi cy...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_da": {"config_name": "unshuffled_original_da", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Den 2.-5. 
februar 2016 l\\\\u00f8b det tredje kursus...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_de": {"config_name": "unshuffled_original_de", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dosierf\\\\u00f6rderb\\\\u00e4nder Getriebe Entw\\\\u00e4s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_diq": {"config_name": "unshuffled_original_diq", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Z\\\\u0131wan\\\\u00ea Slawki, z\\\\u0131wano merduman\\\\u00...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_dsb": {"config_name": "unshuffled_original_dsb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zg\\\\u00f3\\\\u0144\\\\u015bo w\\\\u011bcej w\\\\u00f3 l\\\\u011bp...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_dv": {"config_name": "unshuffled_original_dv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0784. 
\\\\u0787\\\\u07a6\\\\u078c\\\\u07ae\\\\u0785\\\\u07aa\\\\u078...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_el": {"config_name": "unshuffled_original_el", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u039d\\\\u03b5\\\\u03ba\\\\u03c1\\\\u03cc\\\\u03c2 \\\\u03b5\\\\u03bd...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_eml": {"config_name": "unshuffled_original_eml", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"A s\\\\u00e9guit dal pruc\\\\u00e8s ad rubuti\\\\u015basi\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_en": {"config_name": "unshuffled_original_en", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mtendere Village was inspired by the vision of Ch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_eo": {"config_name": "unshuffled_original_eo", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"\\\\u0108u ... pre\\\\u011di | mediti | ricevi instigoj...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_es": {"config_name": "unshuffled_original_es", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Como se librar\\\\u00e1 de la celulitis en el gimnas...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_et": {"config_name": "unshuffled_original_et", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"MT\\\\u00dc AB Video j\\\\u00e4rgib oma tegevuses kodan...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_eu": {"config_name": "unshuffled_original_eu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Gure jarduerek eraikuntzarekin, elkarbizitzarekin...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_fa": {"config_name": "unshuffled_original_fa", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"\\\\u0627\\\\u0645\\\\u0634\\\\u0628 \\\\u0628\\\\u0627\\\\u0631\\\\u0648...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_fi": {"config_name": "unshuffled_original_fi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Luokka Kauniita tytt\\\\u00f6j\\\\u00e4, Teini, Porno p...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_fr": {"config_name": "unshuffled_original_fr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"M\\\\u00e9dia de d\\\\u00e9bat d'id\\\\u00e9es, de culture...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_frr": {"config_name": "unshuffled_original_frr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Hiragana\\\\u2019 Practice\\\\u2019Sheet\\\\u20191\\\\u2019(A...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_fy": {"config_name": "unshuffled_original_fy", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"Veen, Klaas F. van der et al1984-2011Wurdboek fan...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ga": {"config_name": "unshuffled_original_ga", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Is f\\\\u00f3ram \\\\u00e9 seo chun pl\\\\u00e9 a dh\\\\u00e9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_gd": {"config_name": "unshuffled_original_gd", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zhou Yujun, a 'ph\\\\u00e0rtaidh R\\\\u00f9naire Comata...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_gl": {"config_name": "unshuffled_original_gl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"O persoal de Inditex da provincia de Pontevedra s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_gn": {"config_name": "unshuffled_original_gn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Oiko pete\\\\u0129 
kirir\\\\u0129 \\\\u00f1emond\\\\u00fdi pe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_gom": {"config_name": "unshuffled_original_gom", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0905\\\\u091c\\\\u093e\\\\u0915\\\\u0902\\\\u0920\\\\u0940\\\\u0902\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_gu": {"config_name": "unshuffled_original_gu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0a85\\\\u0aa7\\\\u0abf\\\\u0a95 \\\\u0aae\\\\u0abe\\\\u0ab8 \\\\u0a9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_he": {"config_name": "unshuffled_original_he", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u05d6\\\\u05e7\\\\u05d5\\\\u05e7\\\\u05d9\\\\u05dd \\\\u05dc\\\\u05e8...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_hi": {"config_name": "unshuffled_original_hi", "sample_row": "{\"id\": \"0\", 
\"text\": \"\\\"'\\\\u0906\\\\u0907\\\\u091f\\\\u092e \\\\u0917\\\\u0930\\\\u094d\\\\u093...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_hr": {"config_name": "unshuffled_original_hr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"U raspravi je sudjelovao i HSS-ov saborski zastup...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_hsb": {"config_name": "unshuffled_original_hsb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Budy\\\\u0161in (SN/B\\\\u0160e). 
Elektronikarjo m\\\\u011...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ht": {"config_name": "unshuffled_original_ht", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u043d\\\\u0430\\\\u0447\\\\u0430\\\\u0442\\\\u044c us $ nan us ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_hu": {"config_name": "unshuffled_original_hu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"monster - Amat\\\\u0151r, h\\\\u00e1zi szex vide\\\\u00f3k...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_hy": {"config_name": "unshuffled_original_hy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0531\\\\u0580\\\\u0581\\\\u0561\\\\u056d\\\\u056b \\\\u0540\\\\u0561...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ia": {"config_name": "unshuffled_original_ia", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"ha ha ha ha ha ha ha ha ha ha 
ha ha ha ha ha ha h...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_id": {"config_name": "unshuffled_original_id", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"~pic by cetusanminda. Marhalah yang sering disebu...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ie": {"config_name": "unshuffled_original_ie", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Plastic Yo Yo Metal Yo Yos Wooden Yo Yo Keychain ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ilo": {"config_name": "unshuffled_original_ilo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Segun ken ni Ping-ay, ti yellow corn ti maysa kad...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_io": {"config_name": "unshuffled_original_io", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Chekia esas parlamentala republiko. 
La chefo di s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_is": {"config_name": "unshuffled_original_is", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Eyjar.net - uppl\\\\u00fdsinga- og fr\\\\u00e9ttami\\\\u00...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_it": {"config_name": "unshuffled_original_it", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"La estrazione numero 48 del 10 e LOTTO ogni 5 min...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ja": {"config_name": "unshuffled_original_ja", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u795e\\\\u793e\\\\u306a\\\\u3069\\\\u3078\\\\u4e00\\\\u7dd2\\\\u306b\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_jbo": {"config_name": "unshuffled_original_jbo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"lo ni lo vacri cu glare cu banzuni lo nu ro da 
po...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_jv": {"config_name": "unshuffled_original_jv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Jos\\\\u00e9 Mourinho (diwaca: [\\\\u0292u\\\\u02c8z\\\\u025b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ka": {"config_name": "unshuffled_original_ka", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u10ec\\\\u10d0\\\\u10db\\\\u10d8\\\\u10e7\\\\u10d5\\\\u10d0\\\\u10dc\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_kk": {"config_name": "unshuffled_original_kk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0422\\\\u04af\\\\u043b\\\\u043a\\\\u0456\\\\u0431\\\\u0430\\\\u0441 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_km": {"config_name": "unshuffled_original_km", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"\\\\u1781\\\\u17d2\\\\u179f\\\\u17b9\\\\u1794\\\\u178a\\\\u17b6\\\\u1780\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_kn": {"config_name": "unshuffled_original_kn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0cb0\\\\u0cbe\\\\u0cb7\\\\u0ccd\\\\u0c9f\\\\u0ccd\\\\u0cb0\\\\u0caa\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ko": {"config_name": "unshuffled_original_ko", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"CIA \\\\ud504\\\\ub85c\\\\uc81d\\\\ud2b8\\\\uc5d0\\\\uc11c\\\\ub294 \\\\u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_krc": {"config_name": "unshuffled_original_krc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0428\\\\u0430\\\\u043c\\\\u0445\\\\u0430\\\\u043d\\\\u043b\\\\u0430\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ku": {"config_name": 
"unshuffled_original_ku", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"R\\\\u00eaxistina maf\\\\u00ean mirovan Freedom House r...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_kv": {"config_name": "unshuffled_original_kv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041a\\\\u043e\\\\u043c\\\\u0438 \\\\u043a\\\\u044b\\\\u0442\\\\u0448...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_kw": {"config_name": "unshuffled_original_kw", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\ud83d\\\\ude4f\\\\ud83c\\\\udffc\\\\ud83d\\\\ude4f\\\\ud83c\\\\udffc\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ky": {"config_name": "unshuffled_original_ky", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Turmush: \\\\u0411\\\\u0438\\\\u0448\\\\u043a\\\\u0435\\\\u043a \\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, 
"unshuffled_original_la": {"config_name": "unshuffled_original_la", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"H\\\\u00e6 sunt generationes No\\\\u00eb: No\\\\u00eb vir ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lb": {"config_name": "unshuffled_original_lb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"W\\\\u00e9i all Joers ass d'Fuesend nees eng m\\\\u00e9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lez": {"config_name": "unshuffled_original_lez", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0411\\\\u0435\\\\u0441, \\\\u043b\\\\u0435\\\\u0437\\\\u0433\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_li": {"config_name": "unshuffled_original_li", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"'t Good Goedenraad aan de Ezerbaek besjteit oet '...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, 
"unshuffled_original_lmo": {"config_name": "unshuffled_original_lmo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Calvens\\\\u00e0 l'\\\\u00e8 a 24 km de la sit\\\\u00e0 e ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lo": {"config_name": "unshuffled_original_lo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0ea7\\\\u0eb5\\\\u200b\\\\u0ec2\\\\u0ead\\\\u200b\\\\u0ec0\\\\u0ead\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lrc": {"config_name": "unshuffled_original_lrc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0622\\\\u0631\\\\u0644\\\\u06cc\\\\u0646\\\\u06af\\\\u062a\\\\u0648\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lt": {"config_name": "unshuffled_original_lt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0160i programa pad\\\\u0117s geriau i\\\\u0161mokti i...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", 
"dataset_name": "oscar"}, "unshuffled_original_lv": {"config_name": "unshuffled_original_lv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Divdesmit pirmaj\\\\u0101 apr\\\\u012bl\\\\u012b m\\\\u016bsu...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mai": {"config_name": "unshuffled_original_mai", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u096e \\\\u0938\\\\u093f\\\\u0924\\\\u092e\\\\u094d\\\\u092c\\\\u0930...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mg": {"config_name": "unshuffled_original_mg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Hijery ny Tenim-Pirenena rehetra? 
Mandika ny tant...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mhr": {"config_name": "unshuffled_original_mhr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0410\\\\u043a\\\\u0440\\\\u0435\\\\u0442 \\\\u0436\\\\u0430\\\\u043f...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_min": {"config_name": "unshuffled_original_min", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"holaholaholaholaholaholaholaholaholaholaholaholah...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mk": {"config_name": "unshuffled_original_mk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u201e\\\\u0424\\\\u0438\\\\u043b\\\\u043c \\\\u043f\\\\u043b\\\\u0443...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ml": {"config_name": "unshuffled_original_ml", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"\\\\u0d05\\\\u0d38\\\\u0d2d\\\\u0d4d\\\\u0d2f\\\\u0d35\\\\u0d41\\\\u0d02 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mn": {"config_name": "unshuffled_original_mn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041c\\\\u0423\\\\u0411\\\\u0418\\\\u0421-\\\\u044b\\\\u043d \\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mr": {"config_name": "unshuffled_original_mr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Home / motivational marathi story / \\\\u0909\\\\u0926\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mrj": {"config_name": "unshuffled_original_mrj", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041b\\\\u04f9\\\\u043f\\\\u04f9\\\\u0432\\\\u043b\\\\u04d3 (\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ms": {"config_name": "unshuffled_original_ms", "sample_row": 
"{\"id\": \"0\", \"text\": \"\\\"Suhaib memang antara orang yang aktif berprogram....\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mt": {"config_name": "unshuffled_original_mt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"tibg\\\\u0127at il-kaw\\\\u017ca lura lill-Qorti \\\\u0120...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mwl": {"config_name": "unshuffled_original_mwl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Deciplina social i out\\\\u00f3noma que angloba ateb...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_my": {"config_name": "unshuffled_original_my", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u103b\\\\u1019\\\\u1040\\\\u1010\\\\u102e - \\\\u101b\\\\u1014\\\\u10...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_myv": {"config_name": "unshuffled_original_myv", "sample_row": "{\"id\": 
\"0\", \"text\": \"\\\"\\\\u0427\\\\u0430\\\\u0447\\\\u0441\\\\u044c 1914 \\\\u0443\\\\u043c\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mzn": {"config_name": "unshuffled_original_mzn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0642\\\\u0631\\\\u0622\\\\u0646 \\\\u06cc\\\\u0627 \\\\u0642\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_nah": {"config_name": "unshuffled_original_nah", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"In m\\\\u0101cu\\\\u012blp\\\\u014dhualxihuitl VI (inic ch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_nap": {"config_name": "unshuffled_original_nap", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00f2 AUDIT \\\\u00ed \\\\u00c7 \\\\u00e8 \\\\u00ee \\\\u00ff \\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_nds": {"config_name": 
"unshuffled_original_nds", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dor kann sik vun nu af an de hele plattd\\\\u00fc\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ne": {"config_name": "unshuffled_original_ne", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u092c\\\\u0930\\\\u094d\\\\u0926\\\\u093f\\\\u092c\\\\u093e\\\\u0938 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_new": {"config_name": "unshuffled_original_new", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0930\\\\u093e\\\\u0917 \\\\u0938\\\\u0941\\\\u0939\\\\u093e \\\\u091...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_nl": {"config_name": "unshuffled_original_nl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Op vrijdag 31 augustus wordt het nieuwe studiejaa...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_nn": 
{"config_name": "unshuffled_original_nn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Planomtale krav til innhald Bakgrunn: Sp\\\\u00f8rsm...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_no": {"config_name": "unshuffled_original_no", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Slett midlertidige internett filer og informasjon...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_oc": {"config_name": "unshuffled_original_oc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"jizzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_or": {"config_name": "unshuffled_original_or", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0b2d\\\\u0b41\\\\u0b2c\\\\u0b28\\\\u0b47\\\\u0b36\\\\u0b4d\\\\u0b71\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_os": {"config_name": 
"unshuffled_original_os", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"1. \\\\u041b\\\\u00e6\\\\u043f\\\\u043f\\\\u0443 \\\\u00e6\\\\u043c\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pa": {"config_name": "unshuffled_original_pa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0a30\\\\u0a1c\\\\u0a3f: \\\\u0a28\\\\u0a70: PB/JL-138/2018-...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pam": {"config_name": "unshuffled_original_pam", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00c1ku pu i Anak ning Al\\\\u00e1ya at ngeni ip\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pl": {"config_name": "unshuffled_original_pl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Wszyscy producenci Alkazar Opole Biuro Wydawnicze...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pms": {"config_name": 
"unshuffled_original_pms", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dolina (an sloven; San Dorligo della Valle an ita...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pnb": {"config_name": "unshuffled_original_pnb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0641\\\\u0631\\\\u06cc\\\\u0646\\\\u06a9 \\\\u0628\\\\u0644\\\\u0646...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ps": {"config_name": "unshuffled_original_ps", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Many people usually use the time period \\\\u2018bus...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pt": {"config_name": "unshuffled_original_pt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Voc\\\\u00ea pode estar lendo este texto no sof\\\\u00e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_qu": {"config_name": 
"unshuffled_original_qu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Topeka nisqa llaqtaqa, Kansas suyup, Hukllachasqa...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_rm": {"config_name": "unshuffled_original_rm", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"2. secziun Elavuraziun da datas e protecziun da d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ro": {"config_name": "unshuffled_original_ro", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u201c\\\\u00cen via\\\\u021b\\\\u0103, oportunitatea nu e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ru": {"config_name": "unshuffled_original_ru", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0415\\\\u0441\\\\u043b\\\\u0438 \\\\u0432\\\\u0430\\\\u0448\\\\u0438...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sa": {"config_name": 
"unshuffled_original_sa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0905\\\\u0928\\\\u093f\\\\u0930\\\\u0941\\\\u0926\\\\u094d\\\\u0927\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sah": {"config_name": "unshuffled_original_sah", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_scn": {"config_name": "unshuffled_original_scn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"La gilus\\\\u00eca \\\\u00e8 nu sintimentu dulurusu ca ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sd": {"config_name": "unshuffled_original_sd", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0647\\\\u0631 \\\\u06aa\\\\u0648 \\\\u0684\\\\u0627\\\\u06bb\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, 
"unshuffled_original_sh": {"config_name": "unshuffled_original_sh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Op\\\\u0161tina Gornja Radgona se nalazi u sjeverois...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_si": {"config_name": "unshuffled_original_si", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0da2\\\\u0db1\\\\u0dcf\\\\u0db0\\\\u0dd2\\\\u0db4\\\\u0dad\\\\u0dd2\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sk": {"config_name": "unshuffled_original_sk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Aktivity | Agent\\\\u00fara podporovan\\\\u00e9ho zames...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sl": {"config_name": "unshuffled_original_sl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u010ce Creatures, ki je \\\\u017eelel, da pridejo n...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, 
"unshuffled_original_so": {"config_name": "unshuffled_original_so", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sq": {"config_name": "unshuffled_original_sq", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00c7far\\\\u00eb do t\\\\u00eb m\\\\u00eb p\\\\u00eblqente ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sr": {"config_name": "unshuffled_original_sr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"*\\\\u041e\\\\u0432\\\\u0430 \\\\u043f\\\\u043e\\\\u0440\\\\u0443\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_su": {"config_name": "unshuffled_original_su", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Lalaki n\\\\u00e9mbongkeun kakuatan jeung vigor jeun...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", 
"dataset_name": "oscar"}, "unshuffled_original_sv": {"config_name": "unshuffled_original_sv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"1783 \\\\u00e4r ett viktigt \\\\u00e5rtal i den nya tid...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sw": {"config_name": "unshuffled_original_sw", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zarif: Iran inajua mpango wa Saudia wa kufanya ma...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ta": {"config_name": "unshuffled_original_ta", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0baa\\\\u0bca\\\\u0bb4\\\\u0bc1\\\\u0ba4\\\\u0bc1 \\\\u0b9a\\\\u0bbe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_te": {"config_name": "unshuffled_original_te", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0c39\\\\u0c30\\\\u0c4d\\\\u0c2f\\\\u0c3e\\\\u0c28\\\\u0c3e\\\\u0c32\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy 
architecture.", "dataset_name": "oscar"}, "unshuffled_original_tg": {"config_name": "unshuffled_original_tg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u04b2\\\\u0443\\\\u043c\\\\u0430\\\\u0439\\\\u0440\\\\u043e \\\\u0433...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_th": {"config_name": "unshuffled_original_th", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0e1f\\\\u0e31\\\\u0e19\\\\u0e17\\\\u0e35\\\\u0e48\\\\u0e41\\\\u0e25\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tk": {"config_name": "unshuffled_original_tk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Amerikany\\\\u0148 Kaliforni\\\\u00fda \\\\u015ftatyndaky ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tl": {"config_name": "unshuffled_original_tl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Warning Signs na Sira ang Kidneys o Bato - ni Doc...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the 
goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tr": {"config_name": "unshuffled_original_tr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Son y\\\\u0131llarda g\\\\u00f6r\\\\u00fclen ay tutulmalar...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tt": {"config_name": "unshuffled_original_tt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\\\\"\\\\u0418\\\\u0440\\\\u0435\\\\u043c\\\\u043d\\\\u0435\\\\u04a3 \\\\u04...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tyv": {"config_name": "unshuffled_original_tyv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u042d\\\\u043a\\\\u0438\\\\u0438, \\\\u0445\\\\u04af\\\\u043d\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ug": {"config_name": "unshuffled_original_ug", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0632\\\\u0627\\\\u06ad-\\\\u0621\\\\u062a\\\\u06c7\\\\u0632\\\\u0649...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of 
the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_uk": {"config_name": "unshuffled_original_uk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041f\\\\u0440\\\\u043e \\\\u043d\\\\u0430\\\\u0434\\\\u0430\\\\u043d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ur": {"config_name": "unshuffled_original_ur", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0622\\\\u0626\\\\u06cc\\\\u06d2 \\\\u0627\\\\u06c1\\\\u0645 \\\\u062...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_uz": {"config_name": "unshuffled_original_uz", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Markazi Sariosiyo shaharchasi. 
1926-yil 29-sentab...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_vec": {"config_name": "unshuffled_original_vec", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Par ogni p\\\\u00f3nto, \\\\u0142a derivada \\\\u0142a xe ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_vi": {"config_name": "unshuffled_original_vi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Canh chua c\\\\u00e1 b\\\\u00f4ng lau kh\\\\u00f4ng ch\\\\u1e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_vo": {"config_name": "unshuffled_original_vo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mettweiler binon zif in fedal\\\\u00e4n: Rheinland-P...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_wa": {"config_name": "unshuffled_original_wa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"ci n' est n\\\\u00e9n l' viyaedje lu-minme ki sait e...\"}", 
"columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_war": {"config_name": "unshuffled_original_war", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"An Tajan amo in usa ka komyun ha departamento han...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_wuu": {"config_name": "unshuffled_original_wuu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u4f0a15 [I] | \\\\u4f0a17 | \\\\u4f0a19 | \\\\u4f0a21 | \\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_xal": {"config_name": "unshuffled_original_xal", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0410\\\\u0440\\\\u043d\\\\u0433\\\\u0443\\\\u0434\\\\u0438\\\\u043d ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_xmf": {"config_name": "unshuffled_original_xmf", "sample_row": "{\"id\": \"0\", \"text\": 
\"\\\"\\\\u10db\\\\u10dd\\\\u10e9\\\\u10d0\\\\u10db\\\\u10d8\\\\u10da\\\\u10d8 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_yi": {"config_name": "unshuffled_original_yi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u05de\\\\u05de\\\\u05e9\\\\u05d5\\\\u05ea\\\\u05d3\\\\u05d9\\\\u05e7 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_yo": {"config_name": "unshuffled_original_yo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Copyright \\\\u00a9 2018 BBC. 
BBC k\\\\u00f2 m\\\\u1ecd\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_yue": {"config_name": "unshuffled_original_yue", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"*hughughughughughughughughughughughughughughughug...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_zh": {"config_name": "unshuffled_original_zh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u65f6\\\\u95f4\\\\u53ef\\\\u4ee5\\\\u88ab\\\\u7f29\\\\u77ed\\\\uff0c\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original", "language:af", "language:als", "language:am", "language:an", "language:ar", "language:arz", "language:as", "language:ast", "language:av", "language:az", "language:azb", "language:ba", "language:bar", "language:bcl", "language:be", "language:bg", "language:bh", "language:bn", "language:bo", "language:bpy", "language:br", "language:bs", "language:bxr", "language:ca", "language:cbk", "language:ce", 
"language:ceb", "language:ckb", "language:cs", "language:cv", "language:cy", "language:da", "language:de", "language:diq", "language:dsb", "language:dv", "language:el", "language:eml", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:fi", "language:fr", "language:frr", "language:fy", "language:ga", "language:gd", "language:gl", "language:gn", "language:gom", "language:gu", "language:he", "language:hi", "language:hr", "language:hsb", "language:ht", "language:hu", "language:hy", "language:ia", "language:id", "language:ie", "language:ilo", "language:io", "language:is", "language:it", "language:ja", "language:jbo", "language:jv", "language:ka", "language:kk", "language:km", "language:kn", "language:ko", "language:krc", "language:ku", "language:kv", "language:kw", "language:ky", "language:la", "language:lb", "language:lez", "language:li", "language:lmo", "language:lo", "language:lrc", "language:lt", "language:lv", "language:mai", "language:mg", "language:mhr", "language:min", "language:mk", "language:ml", "language:mn", "language:mr", "language:mrj", "language:ms", "language:mt", "language:mwl", "language:my", "language:myv", "language:mzn", "language:nah", "language:nap", "language:nds", "language:ne", "language:new", "language:nl", "language:nn", "language:no", "language:oc", "language:or", "language:os", "language:pa", "language:pam", "language:pl", "language:pms", "language:pnb", "language:ps", "language:pt", "language:qu", "language:rm", "language:ro", "language:ru", "language:sa", "language:sah", "language:scn", "language:sd", "language:sh", "language:si", "language:sk", "language:sl", "language:so", "language:sq", "language:sr", "language:su", "language:sv", "language:sw", "language:ta", "language:te", "language:tg", "language:th", "language:tk", "language:tl", "language:tr", "language:tt", "language:tyv", "language:ug", "language:uk", "language:ur", "language:uz", "language:vec", "language:vi", "language:vo", 
"language:wa", "language:war", "language:wuu", "language:xal", "language:xmf", "language:yi", "language:yo", "language:yue", "language:zh"], "is_gated": false}, "para_pat": {"dataset_name": "para_pat", "description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.", "downloads": 2972, "configs": {"el-en": {"config_name": "el-en", "sample_row": "{\"index\": \"844\", \"family_id\": \"10944407\", \"translation.el\": \"\\\"\\\\u03b1\\\\u03c6\\\\u03ad\\\\u03c2 \\\\u03bf \\\\u03bf\\\\u03c0\\\\u03b...\", \"translation.en\": \"\\\"offee prepared using the mix for Greek coffee eit...\"}", "columns": ["index", "family_id", "translation_el", "translation_en"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.el": "translation_el", "translation.en": "translation_en"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. 
paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "cs-en": {"config_name": "cs-en", "sample_row": "{\"index\": \"1372\", \"family_id\": \"6516810\", \"translation.cs\": \"\\\"\\\\u0158e\\\\u0161en\\\\u00ed se t\\\\u00fdk\\\\u00e1 herbicid\\\\...\", \"translation.en\": \"\\\"The present invention relates to herbicides and p...\"}", "columns": ["index", "family_id", "translation_cs", "translation_en"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.cs": "translation_cs", "translation.en": "translation_en"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-hu": {"config_name": "en-hu", "sample_row": "{\"index\": \"16\", \"family_id\": \"4180910\", \"translation.en\": \"\\\"Module containing solar cells (7), having two mut...\", \"translation.hu\": \"\\\"Napelemeket (7) tartalmaz\\\\u00f3 modul, amelynek k...\"}", "columns": ["index", "family_id", "translation_en", "translation_hu"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.en": "translation_en", "translation.hu": "translation_hu"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. 
paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-ro": {"config_name": "en-ro", "sample_row": "{\"index\": \"16\", \"family_id\": \"6111771\", \"translation.en\": \"\\\"The invention relates to a process for the prepar...\", \"translation.ro\": \"\\\"Inventia se refera la un procedeu pentru obtinere...\"}", "columns": ["index", "family_id", "translation_en", "translation_ro"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.en": "translation_en", "translation.ro": "translation_ro"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-sk": {"config_name": "en-sk", "sample_row": "{\"index\": \"758\", \"family_id\": \"5346370\", \"translation.en\": \"\\\"Delay is converted from synchronous the culture ...\", \"translation.sk\": \"\\\"Slaehtenie sa p\\\\u0159ev\\\\u00e1d\\\\u00ed zo synchr\\\\u0...\"}", "columns": ["index", "family_id", "translation_en", "translation_sk"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.en": "translation_en", "translation.sk": "translation_sk"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. 
paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-uk": {"config_name": "en-uk", "sample_row": "{\"index\": \"3421\", \"family_id\": \"52275661\", \"translation.en\": \"\\\"A replaceable handle to kitchen appliances compri...\", \"translation.uk\": \"\\\"\\\\u0417\\\\u043d\\\\u0456\\\\u043c\\\\u043d\\\\u0430 \\\\u0440\\\\u0443...\"}", "columns": ["index", "family_id", "translation_en", "translation_uk"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.en": "translation_en", "translation.uk": "translation_uk"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "es-fr": {"config_name": "es-fr", "sample_row": "{\"index\": \"3077\", \"family_id\": \"8244348\", \"translation.es\": \"\\\"La presente invenci\\\\u00f3n se refiere a un proced...\", \"translation.fr\": \"\\\"L'invention concerne un proc\\\\u00e9d\\\\u00e9 de fabr...\"}", "columns": ["index", "family_id", "translation_es", "translation_fr"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.es": "translation_es", "translation.fr": "translation_fr"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. 
paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "fr-ru": {"config_name": "fr-ru", "sample_row": "{\"index\": \"18646\", \"family_id\": \"38723544\", \"translation.fr\": \"\\\"L'invention appartient au domaine de la microbiol...\", \"translation.ru\": \"\\\"\\\\u0418\\\\u0437\\\\u043e\\\\u0431\\\\u0440\\\\u0435\\\\u0442\\\\u0435\\\\...\"}", "columns": ["index", "family_id", "translation_fr", "translation_ru"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.fr": "translation_fr", "translation.ru": "translation_ru"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"translation.de\": \"\\\"Der Signalaustausch zwischen den Funktionseinheit...\", \"translation.fr\": \"\\\"L'\\\\u00e9change de signaux entre les unit\\\\u00e9s f...\"}", "columns": ["translation_de", "translation_fr"], "columns_mapping": {"translation.de": "translation_de", "translation.fr": "translation_fr"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. 
paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-ja": {"config_name": "en-ja", "sample_row": "{\"translation.en\": \"\\\"The computer system (200) is connected to a datab...\", \"translation.ja\": \"\\\"\\\\u30b3\\\\u30f3\\\\u30d4\\\\u30e5\\\\u30fc\\\\u30bf\\\\u30b7\\\\u30b9\\\\...\"}", "columns": ["translation_en", "translation_ja"], "columns_mapping": {"translation.en": "translation_en", "translation.ja": "translation_ja"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-es": {"config_name": "en-es", "sample_row": "{\"translation.en\": \"\\\"A method for converting a series of m-bit informa...\", \"translation.es\": \"\\\"Se describe un m\\\\u00e9todo para convertir una ser...\"}", "columns": ["translation_en", "translation_es"], "columns_mapping": {"translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. 
paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"translation.en\": \"\\\"According to the invention, the method (1) compri...\", \"translation.fr\": \"\\\"Selon l'invention, le proc\\\\u00e9d\\\\u00e9 (1) compr...\"}", "columns": ["translation_en", "translation_fr"], "columns_mapping": {"translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "de-en": {"config_name": "de-en", "sample_row": "{\"translation.de\": \"\\\"Das Textilband (1) ist insbesondere als (Kreuz-) ...\", \"translation.en\": \"\\\"The textile band (1) is used particularly as (cro...\"}", "columns": ["translation_de", "translation_en"], "columns_mapping": {"translation.de": "translation_de", "translation.en": "translation_en"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. 
paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-ko": {"config_name": "en-ko", "sample_row": "{\"translation.en\": \"\\\"In a preferred embodiment, the TAMS includes a pr...\", \"translation.ko\": \"\\\"\\\\ubc14\\\\ub78c\\\\uc9c1\\\\ud55c \\\\uc2e4\\\\uc2dc\\\\uc608\\\\uc5d0...\"}", "columns": ["translation_en", "translation_ko"], "columns_mapping": {"translation.en": "translation_en", "translation.ko": "translation_ko"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "fr-ja": {"config_name": "fr-ja", "sample_row": "{\"translation.fr\": \"\\\"Le dispositif de transmission transmet un signal ...\", \"translation.ja\": \"\\\"\\\\u3010\\\\u89e3\\\\u6c7a\\\\u624b\\\\u6bb5\\\\u3011\\\\u672c\\\\u6280\\\\...\"}", "columns": ["translation_fr", "translation_ja"], "columns_mapping": {"translation.fr": "translation_fr", "translation.ja": "translation_ja"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. 
paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-zh": {"config_name": "en-zh", "sample_row": "{\"translation.en\": \"\\\"The door lock is characterized in that fluorescen...\", \"translation.zh\": \"\\\"\\\\u672c\\\\u5b9e\\\\u7528\\\\u65b0\\\\u578b\\\\u5305\\\\u62ec\\\\u95e8\\\\...\"}", "columns": ["translation_en", "translation_zh"], "columns_mapping": {"translation.en": "translation_en", "translation.zh": "translation_zh"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-ru": {"config_name": "en-ru", "sample_row": "{\"translation.en\": \"\\\"S from 00 00\\\"\", \"translation.ru\": \"\\\"S \\\\u0441\\\\u043e 00 00\\\"\"}", "columns": ["translation_en", "translation_ru"], "columns_mapping": {"translation.en": "translation_en", "translation.ru": "translation_ru"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. 
paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "fr-ko": {"config_name": "fr-ko", "sample_row": "{\"index\": \"10794\", \"family_id\": \"34746474\", \"translation.fr\": \"\\\"La pr\\\\u00e9sente invention concerne un proc\\\\u00e9...\", \"translation.ko\": \"\\\"\\\\ubcf8 \\\\ubc1c\\\\uba85\\\\uc740 \\\\uc6a9\\\\ub9e4\\\\uc911\\\\uc75...\"}", "columns": ["index", "family_id", "translation_fr", "translation_ko"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.fr": "translation_fr", "translation.ko": "translation_ko"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "ru-uk": {"config_name": "ru-uk", "sample_row": "{\"index\": \"3431\", \"family_id\": \"52281850\", \"translation.ru\": \"\\\"\\\\u0421\\\\u043f\\\\u043e\\\\u0441\\\\u043e\\\\u0431 \\\\u0432\\\\u044b...\", \"translation.uk\": \"\\\"\\\\u0421\\\\u043f\\\\u043e\\\\u0441\\\\u0456\\\\u0431 \\\\u0432\\\\u0438...\"}", "columns": ["index", "family_id", "translation_ru", "translation_uk"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.ru": "translation_ru", "translation.uk": "translation_uk"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. 
paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-pt": {"config_name": "en-pt", "sample_row": "{\"index\": \"22818\", \"family_id\": \"40951751\", \"translation.en\": \"\\\"The present invention relates to a process for th...\", \"translation.pt\": \"\\\"A presente inven\\\\u00e7\\\\u00e3o refere-se a um proc...\"}", "columns": ["index", "family_id", "translation_en", "translation_pt"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.en": "translation_en", "translation.pt": "translation_pt"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_categories:translation", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:machine-generated", "multilinguality:translation", "source_datasets:original", "language:cs", "language:de", "language:el", "language:en", "language:es", "language:fr", "language:hu", "language:ja", "language:ko", "language:pt", "language:ro", "language:ru", "language:sk", "language:uk", "language:zh"], "is_gated": false}, "parsinlu_reading_comprehension": {"dataset_name": "parsinlu_reading_comprehension", "description": "A Persian reading comprehension task (generating an answer, given a question and a context paragraph).\nThe questions are mined using Google auto-complete, their answers and the corresponding evidence documents are manually annotated by native speakers.", "downloads": 304, "configs": {"parsinlu-repo": {"config_name": "parsinlu-repo", "sample_row": 
"{\"question\": \"\\\"\\\\u0686\\\\u0631\\\\u0627 \\\\u0622\\\\u0645\\\\u0631\\\\u06cc\\\\u06a9...\", \"url\": \"\\\"https://www.bbc.com/persian/iran-46851613\\\"\", \"context\": \"\\\"\\\\u0644\\\\u0647\\\\u0633\\\\u062a\\\\u0627\\\\u0646 \\\\u06cc\\\\u06a9...\", \"answers.answer_start\": \"[427]\", \"answers.answer_text\": \"[\\\"\\\\u0646\\\\u0632\\\\u062f\\\\u06cc\\\\u06a9\\\\u06cc \\\\u0631\\\\u064...\"}", "columns": ["question", "url", "context", "answers_answer_start", "answers_answer_text"], "columns_mapping": {"question": "question", "url": "url", "context": "context", "answers.answer_start": "answers_answer_start", "answers.answer_text": "answers_answer_text"}, "dataset_description": "A Persian reading comprehension task (generating an answer, given a question and a context paragraph).\nThe questions are mined using Google auto-complete, their answers and the corresponding evidence documents are manually annotated by native speakers.\n", "dataset_name": "parsinlu_reading_comprehension"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|wikipedia|google", "language:fa"], "is_gated": false}, "paws-x": {"dataset_name": "paws-x", "description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.", "downloads": 26191, "configs": {"en": {"config_name": "en", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"In Paris , in October 1560 , he secretly met the ...\", \"sentence2\": \"\\\"In October 1560 , he secretly met with the Englis...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "de": {"config_name": "de", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"Im Oktober 1560 traf er sich in Paris heimlich mi...\", \"sentence2\": \"\\\"Im Oktober 1560 traf er sich heimlich mit dem eng...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "es": {"config_name": "es", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"En Par\\\\u00eds, en octubre de 1560, se reuni\\\\u00f3...\", \"sentence2\": \"\\\"En octubre de 1560, se reuni\\\\u00f3 en secreto con...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "fr": {"config_name": "fr", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"\\\\u00c0 Paris, en octobre 1560, il rencontra secr\\\\...\", \"sentence2\": \"\\\"En octobre 1560, il rencontra secr\\\\u00e8tement l'...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "ja": {"config_name": "ja", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"1560\\\\u5e7410\\\\u6708\\\\u306b\\\\u30d1\\\\u30ea\\\\u3067\\\\u3001\\\\...\", \"sentence2\": \"\\\"1560\\\\u5e7410\\\\u6708\\\\u3001\\\\u5f7c\\\\u306f\\\\u30d1\\\\u30ea\\\\...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "ko": {"config_name": "ko", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"1560 \\\\ub144 10 \\\\uc6d4 \\\\ud30c\\\\ub9ac\\\\uc5d0\\\\uc11c \\\\u...\", \"sentence2\": \"\\\"1560 \\\\ub144 10 \\\\uc6d4 \\\\uadf8\\\\ub294 \\\\ud30c\\\\ub9ac\\\\u...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "zh": {"config_name": "zh", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"1560\\\\u5e7410\\\\u6708\\\\uff0c\\\\u4ed6\\\\u5728\\\\u5df4\\\\u9ece\\\\...\", \"sentence2\": \"\\\"1560\\\\u5e7410\\\\u6708\\\\uff0c\\\\u4ed6\\\\u5728\\\\u5df4\\\\u9ece\\\\...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. 
All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "task_ids:semantic-similarity-scoring", "task_ids:text-scoring", "task_ids:multi-input-text-classification", "annotations_creators:expert-generated", "annotations_creators:machine-generated", "multilinguality:multilingual", "source_datasets:extended|other-paws", "language:de", "language:en", "language:es", "language:fr", "language:ja", "language:ko", "language:zh", "paraphrase-identification"], "is_gated": false}, "paws": {"dataset_name": "paws", "description": "PAWS: Paraphrase Adversaries from Word Scrambling\n\nThis dataset contains 108,463 human-labeled and 656k noisily labeled pairs that feature\nthe importance of modeling structure, context, and word order information for the problem\nof paraphrase identification. The dataset has two subsets, one based on Wikipedia and the\nother one based on the Quora Question Pairs (QQP) dataset.\n\nFor further details, see the accompanying paper: PAWS: Paraphrase Adversaries from Word Scrambling\n(https://arxiv.org/abs/1904.01130)\n\nPAWS-QQP is not available due to license of QQP. 
It must be reconstructed by downloading the original\ndata and then running our scripts to produce the data and attach the labels.\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.", "downloads": 8002, "configs": {"labeled_final": {"config_name": "labeled_final", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"In Paris , in October 1560 , he secretly met the ...\", \"sentence2\": \"\\\"In October 1560 , he secretly met with the Englis...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS: Paraphrase Adversaries from Word Scrambling\n\nThis dataset contains 108,463 human-labeled and 656k noisily labeled pairs that feature\nthe importance of modeling structure, context, and word order information for the problem\nof paraphrase identification. The dataset has two subsets, one based on Wikipedia and the\nother one based on the Quora Question Pairs (QQP) dataset.\n\nFor further details, see the accompanying paper: PAWS: Paraphrase Adversaries from Word Scrambling\n(https://arxiv.org/abs/1904.01130)\n\nPAWS-QQP is not available due to license of QQP. It must be reconstructed by downloading the original\ndata and then running our scripts to produce the data and attach the labels.\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws"}, "labeled_swap": {"config_name": "labeled_swap", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"`` B. i. seychellarum '' is smaller and shorter-w...\", \"sentence2\": \"\\\"`` B. i. 
seychellarum '' is smaller and shorter-o...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS: Paraphrase Adversaries from Word Scrambling\n\nThis dataset contains 108,463 human-labeled and 656k noisily labeled pairs that feature\nthe importance of modeling structure, context, and word order information for the problem\nof paraphrase identification. The dataset has two subsets, one based on Wikipedia and the\nother one based on the Quora Question Pairs (QQP) dataset.\n\nFor further details, see the accompanying paper: PAWS: Paraphrase Adversaries from Word Scrambling\n(https://arxiv.org/abs/1904.01130)\n\nPAWS-QQP is not available due to license of QQP. It must be reconstructed by downloading the original\ndata and then running our scripts to produce the data and attach the labels.\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws"}, "unlabeled_final": {"config_name": "unlabeled_final", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"The film was remade in Telugu with the same name ...\", \"sentence2\": \"\\\"The film was written in Telugu with the same name...\", \"label\": \"1\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS: Paraphrase Adversaries from Word Scrambling\n\nThis dataset contains 108,463 human-labeled and 656k noisily labeled pairs that feature\nthe importance of modeling structure, context, and word order information for the problem\nof paraphrase identification. 
The dataset has two subsets, one based on Wikipedia and the\nother one based on the Quora Question Pairs (QQP) dataset.\n\nFor further details, see the accompanying paper: PAWS: Paraphrase Adversaries from Word Scrambling\n(https://arxiv.org/abs/1904.01130)\n\nPAWS-QQP is not available due to license of QQP. It must be reconstructed by downloading the original\ndata and then running our scripts to produce the data and attach the labels.\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "task_ids:semantic-similarity-scoring", "task_ids:text-scoring", "task_ids:multi-input-text-classification", "annotations_creators:expert-generated", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "paraphrase-identification"], "is_gated": false}, "pec": {"dataset_name": "pec", "description": "\\\r\nA dataset of around 350K persona-based empathetic conversations. Each speaker is associated with a persona, which comprises multiple persona sentences. The response of each conversation is empathetic.", "downloads": 562, "configs": {"happy": {"config_name": "happy", "sample_row": "{\"personas\": \"[\\\"i have a roku tv that came with a shitty basic r...\", \"context\": \"[\\\"found out this morning i got a job promotion ! !...\", \"context_speakers\": \"[\\\"HeWentToJared91\\\"]\", \"response\": \"\\\"whilst popping ?\\\"\", \"response_speaker\": \"\\\"Evref\\\"\"}", "columns": ["personas", "context", "context_speakers", "response", "response_speaker"], "columns_mapping": {"personas": "personas", "context": "context", "context_speakers": "context_speakers", "response": "response", "response_speaker": "response_speaker"}, "dataset_description": "A dataset of around 350K persona-based empathetic conversations. 
Each speaker is associated with a persona, which comprises multiple persona sentences. The response of each conversation is empathetic.\n", "dataset_name": "pec"}, "offmychest": {"config_name": "offmychest", "sample_row": "{\"personas\": \"[\\\"i do n't want to see it .\\\", \\\"i 'd love to know m...\", \"context\": \"[\\\"i want to die . in last few months i lost my job...\", \"context_speakers\": \"[\\\"1wannadie\\\"]\", \"response\": \"\\\"hold on to life , look around you and realise wha...\", \"response_speaker\": \"\\\"Lulwafahd\\\"\"}", "columns": ["personas", "context", "context_speakers", "response", "response_speaker"], "columns_mapping": {"personas": "personas", "context": "context", "context_speakers": "context_speakers", "response": "response", "response_speaker": "response_speaker"}, "dataset_description": "A dataset of around 350K persona-based empathetic conversations. Each speaker is associated with a persona, which comprises multiple persona sentences. The response of each conversation is empathetic.\n", "dataset_name": "pec"}, "all": {"config_name": "all", "sample_row": "{\"personas\": \"[\\\"i have a roku tv that came with a shitty basic r...\", \"context\": \"[\\\"found out this morning i got a job promotion ! !...\", \"context_speakers\": \"[\\\"HeWentToJared91\\\"]\", \"response\": \"\\\"whilst popping ?\\\"\", \"response_speaker\": \"\\\"Evref\\\"\"}", "columns": ["personas", "context", "context_speakers", "response", "response_speaker"], "columns_mapping": {"personas": "personas", "context": "context", "context_speakers": "context_speakers", "response": "response", "response_speaker": "response_speaker"}, "dataset_description": "A dataset of around 350K persona-based empathetic conversations. Each speaker is associated with a persona, which comprises multiple persona sentences. 
The response of each conversation is empathetic.\n", "dataset_name": "pec"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_categories:text-retrieval", "task_ids:dialogue-modeling", "task_ids:utterance-retrieval", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "peoples_daily_ner": {"dataset_name": "peoples_daily_ner", "description": "People's Daily NER Dataset is a commonly used dataset for Chinese NER, with\ntext from People's Daily (\u4eba\u6c11\u65e5\u62a5), the largest official newspaper.\n\nThe dataset is in BIO scheme. Entity types are: PER (person), ORG (organization)\nand LOC (location).", "downloads": 829, "configs": {"peoples_daily_ner": {"config_name": "peoples_daily_ner", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u6d77\\\", \\\"\\\\u9493\\\", \\\"\\\\u6bd4\\\", \\\"\\\\u8d5b\\\", \\\"\\\\u5730\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "People's Daily NER Dataset is a commonly used dataset for Chinese NER, with\ntext from People's Daily (\u4eba\u6c11\u65e5\u62a5), the largest official newspaper.\n\nThe dataset is in BIO scheme. Entity types are: PER (person), ORG (organization)\nand LOC (location).\n", "dataset_name": "peoples_daily_ner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:zh"], "is_gated": false}, "persian_ner": {"dataset_name": "persian_ner", "description": "The dataset includes 250,015 tokens and 7,682 Persian sentences in total. It is available in 3 folds to be used in turn as training and test sets. 
The NER tags are in IOB format.", "downloads": 560, "configs": {"fold1": {"config_name": "fold1", "sample_row": "{\"tokens\": \"[\\\"\\\\u0628\\\\u0647\\\", \\\"\\\\u0639\\\\u0646\\\\u0648\\\\u0627\\\\u0646\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["tokens", "ner_tags"], "columns_mapping": {"tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The dataset includes 250,015 tokens and 7,682 Persian sentences in total. It is available in 3 folds to be used in turn as training and test sets. The NER tags are in IOB format.\n", "dataset_name": "persian_ner"}, "fold2": {"config_name": "fold2", "sample_row": "{\"tokens\": \"[\\\"\\\\u0627\\\\u0641\\\\u0642\\\\u06cc\\\", \\\":\\\", \\\"0\\\", \\\"\\\\u0640\\\", \\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0...\"}", "columns": ["tokens", "ner_tags"], "columns_mapping": {"tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The dataset includes 250,015 tokens and 7,682 Persian sentences in total. It is available in 3 folds to be used in turn as training and test sets. The NER tags are in IOB format.\n", "dataset_name": "persian_ner"}, "fold3": {"config_name": "fold3", "sample_row": "{\"tokens\": \"[\\\"\\\\u0627\\\\u0641\\\\u0642\\\\u06cc\\\", \\\":\\\", \\\"0\\\", \\\"\\\\u0640\\\", \\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0...\"}", "columns": ["tokens", "ner_tags"], "columns_mapping": {"tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The dataset includes 250,015 tokens and 7,682 Persian sentences in total. It is available in 3 folds to be used in turn as training and test sets. 
The NER tags are in IOB format.\n", "dataset_name": "persian_ner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:fa"], "is_gated": false}, "php": {"dataset_name": "php", "description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M", "downloads": 838, "configs": {"fi-nl": {"config_name": "fi-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"PHP K\\\\u00e4sikirja\\\"\", \"translation.nl\": \"\\\"PHP Handleiding\\\"\"}", "columns": ["id", "translation_fi", "translation_nl"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.nl": "translation_nl"}, "dataset_description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. 
The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M\n", "dataset_name": "php"}, "it-ro": {"config_name": "it-ro", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.it\": \"\\\"Manuale PHP\\\"\", \"translation.ro\": \"\\\"Manual PHP\\\"\"}", "columns": ["id", "translation_it", "translation_ro"], "columns_mapping": {"id": "id", "translation.it": "translation_it", "translation.ro": "translation_ro"}, "dataset_description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M\n", "dataset_name": "php"}, "nl-sv": {"config_name": "nl-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.nl\": \"\\\"PHP Handleiding\\\"\", \"translation.sv\": \"\\\"PHP-manual\\\"\"}", "columns": ["id", "translation_nl", "translation_sv"], "columns_mapping": {"id": "id", "translation.nl": "translation_nl", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. 
The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M\n", "dataset_name": "php"}, "en-it": {"config_name": "en-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"PHP Manual\\\"\", \"translation.it\": \"\\\"Manuale PHP\\\"\"}", "columns": ["id", "translation_en", "translation_it"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.it": "translation_it"}, "dataset_description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M\n", "dataset_name": "php"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"PHP Manual\\\"\", \"translation.fr\": \"\\\"Manuel PHP\\\"\"}", "columns": ["id", "translation_en", "translation_fr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. 
The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M\n", "dataset_name": "php"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:cs", "language:de", "language:en", "language:es", "language:fi", "language:fr", "language:he", "language:hu", "language:it", "language:ja", "language:ko", "language:nl", "language:pl", "language:pt", "language:ro", "language:ru", "language:sk", "language:sl", "language:sv", "language:tr", "language:tw", "language:zh"], "is_gated": false}, "etalab-ia/piaf": {"dataset_name": "etalab-ia/piaf", "description": "Piaf is a reading comprehension dataset. This version, published in February 2020, contains 3835 questions on French Wikipedia.", "downloads": 307, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"p140295443291664\\\"\", \"title\": \"\\\"Sport\\\"\", \"context\": \"\\\"Les d\\\\u00e9penses des m\\\\u00e9nages repr\\\\u00e9sent...\", \"question\": \"\\\"Combien de personnes travaillent au minist\\\\u00e8r...\", \"answers.text\": \"[\\\"100 000\\\"]\", \"answers.answer_start\": \"[472]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "Piaf is a reading comprehension dataset. 
This version, published in February 2020, contains 3835 questions on French Wikipedia.\n", "dataset_name": "etalab-ia/piaf"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:fr"], "is_gated": false}, "gsarti/clean_mc4_it": {"dataset_name": "gsarti/clean_mc4_it", "description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.", "downloads": 1135, "configs": {"tiny": {"config_name": "tiny", "sample_row": "{\"text\": \"\\\"Per raggiungere il campo attraversiamo la strisci...\", \"timestamp\": \"\\\"2020-02-22T22:24:31Z\\\"\", \"url\": \"\\\"https://altreconomia.it/una-rotonda-sul-pane/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "gsarti/clean_mc4_it"}, "small": {"config_name": "small", "sample_row": "{\"text\": \"\\\"Per raggiungere il campo attraversiamo la strisci...\", \"timestamp\": \"\\\"2020-02-22T22:24:31Z\\\"\", \"url\": \"\\\"https://altreconomia.it/una-rotonda-sul-pane/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned 
version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "gsarti/clean_mc4_it"}, "medium": {"config_name": "medium", "sample_row": "{\"text\": \"\\\"Per raggiungere il campo attraversiamo la strisci...\", \"timestamp\": \"\\\"2020-02-22T22:24:31Z\\\"\", \"url\": \"\\\"https://altreconomia.it/una-rotonda-sul-pane/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "gsarti/clean_mc4_it"}, "large": {"config_name": "large", "sample_row": "{\"text\": \"\\\"Per raggiungere il campo attraversiamo la strisci...\", \"timestamp\": \"\\\"2020-02-22T22:24:31Z\\\"\", \"url\": \"\\\"https://altreconomia.it/una-rotonda-sul-pane/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "gsarti/clean_mc4_it"}, "full": {"config_name": "full", "sample_row": "{\"text\": \"\\\"Per 
raggiungere il campo attraversiamo la strisci...\", \"timestamp\": \"\\\"2020-02-22T22:24:31Z\\\"\", \"url\": \"\\\"https://altreconomia.it/una-rotonda-sul-pane/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "gsarti/clean_mc4_it"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:extended", "language:it"], "is_gated": false}, "gsarti/itacola": {"dataset_name": "gsarti/itacola", "description": "The Italian Corpus of Linguistic Acceptability includes almost 10k sentences taken from \nlinguistic literature with a binary annotation made by the original authors themselves. \nThe work is inspired by the English Corpus of Linguistic Acceptability (CoLA) by Warstadt et al.\nPart of the dataset has been manually annotated to highlight 9 linguistic phenomena.", "downloads": 468, "configs": {"scores": {"config_name": "scores", "sample_row": "{\"unique_id\": \"1\", \"source\": \"\\\"Graffi_1994\\\"\", \"acceptability\": \"1\", \"sentence\": \"\\\"Quest'uomo mi ha colpito.\\\"\"}", "columns": ["unique_id", "source", "acceptability", "sentence"], "columns_mapping": {"unique_id": "unique_id", "source": "source", "acceptability": "acceptability", "sentence": "sentence"}, "dataset_description": "The Italian Corpus of Linguistic Acceptability includes almost 10k sentences taken from \nlinguistic literature with a binary annotation made by the original authors themselves. 
\nThe work is inspired by the English Corpus of Linguistic Acceptability (CoLA) by Warstadt et al.\nPart of the dataset has been manually annotated to highlight 9 linguistic phenomena.\n", "dataset_name": "gsarti/itacola"}, "phenomena": {"config_name": "phenomena", "sample_row": "{\"unique_id\": \"1\", \"source\": \"\\\"Graffi_1994\\\"\", \"acceptability\": \"1\", \"sentence\": \"\\\"Quest'uomo mi ha colpito.\\\"\", \"cleft_construction\": \"0\", \"copular_construction\": \"0\", \"subject_verb_agreement\": \"1\", \"wh_islands_violations\": \"0\", \"simple\": \"0\", \"question\": \"0\", \"auxiliary\": \"1\", \"bind\": \"0\", \"indefinite_pronouns\": \"0\"}", "columns": ["unique_id", "source", "acceptability", "sentence", "cleft_construction", "copular_construction", "subject_verb_agreement", "wh_islands_violations", "simple", "question", "auxiliary", "bind", "indefinite_pronouns"], "columns_mapping": {"unique_id": "unique_id", "source": "source", "acceptability": "acceptability", "sentence": "sentence", "cleft_construction": "cleft_construction", "copular_construction": "copular_construction", "subject_verb_agreement": "subject_verb_agreement", "wh_islands_violations": "wh_islands_violations", "simple": "simple", "question": "question", "auxiliary": "auxiliary", "bind": "bind", "indefinite_pronouns": "indefinite_pronouns"}, "dataset_description": "The Italian Corpus of Linguistic Acceptability includes almost 10k sentences taken from \nlinguistic literature with a binary annotation made by the original authors themselves. 
\nThe work is inspired by the English Corpus of Linguistic Acceptability (CoLA) by Warstadt et al.\nPart of the dataset has been manually annotated to highlight 9 linguistic phenomena.\n", "dataset_name": "gsarti/itacola"}}, "tags": ["task_categories:text-classification", "task_ids:acceptability-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:it"], "is_gated": false}, "jegormeister/dutch-snli": {"dataset_name": "jegormeister/dutch-snli", "description": "This is the Dutch version of the original SNLI dataset. The translation was performed using Google Translate. Original SNLI available at https://nlp.stanford.edu/projects/snli/", "downloads": 277, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"premise\": \"\\\"Een persoon op een paard springt over een kapot v...\", \"hypothesis\": \"\\\"Een persoon traint zijn paard voor een wedstrijd....\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "This is the Dutch version of the original SNLI dataset. The translation was performed using Google Translate. Original SNLI available at https://nlp.stanford.edu/projects/snli/\n", "dataset_name": "jegormeister/dutch-snli"}}, "tags": ["language:nl"], "is_gated": false}, "ju-bezdek/conll2003-SK-NER": {"dataset_name": "ju-bezdek/conll2003-SK-NER", "description": "This is translated version of the original CONLL2003 dataset (translated from English to Slovak via Google translate) Annotation was done mostly automatically with word matching scripts. 
Records where some tags were not matched, were annotated manually (10%) Unlike the original Conll2003 dataset, this one contains only NER tags", "downloads": 11, "configs": {"conll2003-SK-NER": {"config_name": "conll2003-SK-NER", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"E\\\\u00da\\\", \\\"odmieta\\\", \\\"nemeck\\\\u00fa\\\", \\\"v\\\\u00fdzvu...\", \"ner_tags\": \"[3, 0, 7, 0, 0, 0, 7, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "This is translated version of the original CONLL2003 dataset (translated from English to Slovak via Google translate) Annotation was done mostly automatically with word matching scripts. Records where some tags were not matched, were annotated manually (10%) Unlike the original Conll2003 dataset, this one contains only NER tags\n", "dataset_name": "ju-bezdek/conll2003-SK-NER"}}, "tags": ["task_categories:other", "task_ids:named-entity-recognition", "task_ids:part-of-speech", "annotations_creators:machine-generated", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|conll2003", "language:sk", "structure-prediction"], "is_gated": false}, "k-halid/ar": {"dataset_name": "k-halid/ar", "description": "The corpus is a part of the MultiUN corpus.It is a collection of translated documents from the United Nations.The corpus is download from the following website : [open parallel corpus](http://opus.datasetsl.eu/) \\", "downloads": 12, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\"\\\\u0631\\\\u0633\\\\u0627\\\\u0644\\\\u0629 \\\\u0645\\\\u0624\\\\u0631...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "The corpus is a part of the MultiUN corpus.It is a collection of translated documents from the United Nations.The corpus is download from the following website : [open parallel 
corpus](http://opus.datasetsl.eu/) ", "dataset_name": "k-halid/ar"}}, "tags": [], "is_gated": false}, "lavis-nlp/german_legal_sentences": {"dataset_name": "lavis-nlp/german_legal_sentences", "description": "German Legal Sentences (GLS) is an automatically generated training dataset for semantic sentence \nmatching in the domain in german legal documents. It follows the concept of weak supervision, where \nimperfect labels are generated using multiple heuristics. For this purpose we use a combination of \nlegal citation matching and BM25 similarity. The contained sentences and their citations are parsed \nfrom real judicial decisions provided by [Open Legal Data](http://openlegaldata.io/)", "downloads": 29, "configs": {"sentences": {"config_name": "sentences", "sample_row": "{\"sent_id\": \"1710015\", \"doc_id\": \"201218\", \"text\": \"\\\"Die nach [REF] zul\\\\u00e4ssige Beschwerde ist auch...\", \"references.ref_id\": \"[6565]\", \"references.name\": \"[\\\"\\\\u00a7 127 Abs. 2 Satz 2 ZPO\\\"]\", \"references.type\": \"[1]\"}", "columns": ["sent_id", "doc_id", "text", "references_ref_id", "references_name", "references_type"], "columns_mapping": {"sent_id": "sent_id", "doc_id": "doc_id", "text": "text", "references.ref_id": "references_ref_id", "references.name": "references_name", "references.type": "references_type"}, "dataset_description": "German Legal Sentences (GLS) is an automatically generated training dataset for semantic sentence \nmatching in the domain in german legal documents. It follows the concept of weak supervision, where \nimperfect labels are generated using multiple heuristics. For this purpose we use a combination of \nlegal citation matching and BM25 similarity. 
The contained sentences and their citations are parsed \nfrom real judicial decisions provided by [Open Legal Data](http://openlegaldata.io/)\n", "dataset_name": "lavis-nlp/german_legal_sentences"}, "pairs": {"config_name": "pairs", "sample_row": "{\"query.sent_id\": \"0\", \"query.doc_id\": \"0\", \"query.text\": \"\\\"Gem\\\\u00e4\\\\u00df [REF] kann der Vertrieb eines nac...\", \"query.ref_ids\": \"[8]\", \"related.sent_id\": \"167082\", \"related.doc_id\": \"14964\", \"related.text\": \"\\\"Die Revision wendet sich mit Erfolg gegen die Ann...\", \"related.ref_ids\": \"[141578, 8]\"}", "columns": ["query_sent_id", "query_doc_id", "query_text", "query_ref_ids", "related_sent_id", "related_doc_id", "related_text", "related_ref_ids"], "columns_mapping": {"query.sent_id": "query_sent_id", "query.doc_id": "query_doc_id", "query.text": "query_text", "query.ref_ids": "query_ref_ids", "related.sent_id": "related_sent_id", "related.doc_id": "related_doc_id", "related.text": "related_text", "related.ref_ids": "related_ref_ids"}, "dataset_description": "German Legal Sentences (GLS) is an automatically generated training dataset for semantic sentence \nmatching in the domain in german legal documents. It follows the concept of weak supervision, where \nimperfect labels are generated using multiple heuristics. For this purpose we use a combination of \nlegal citation matching and BM25 similarity. 
The contained sentences and their citations are parsed \nfrom real judicial decisions provided by [Open Legal Data](http://openlegaldata.io/)\n", "dataset_name": "lavis-nlp/german_legal_sentences"}, "pairs+es": {"config_name": "pairs+es", "sample_row": "{\"query.sent_id\": \"0\", \"query.doc_id\": \"0\", \"query.text\": \"\\\"Gem\\\\u00e4\\\\u00df [REF] kann der Vertrieb eines nac...\", \"query.ref_ids\": \"[8]\", \"related.sent_id\": \"167082\", \"related.doc_id\": \"14964\", \"related.text\": \"\\\"Die Revision wendet sich mit Erfolg gegen die Ann...\", \"related.ref_ids\": \"[141578, 8]\", \"es_neighbors.text\": \"[\\\"Besondere Umst\\\\u00e4nde , die das Verhalten des ...\", \"es_neighbors.sent_id\": \"[1349763, 1407242, 996686, 751840, 304375]\", \"es_neighbors.doc_id\": \"[149748, 156752, 107019, 78568, 28812]\", \"es_neighbors.ref_ids\": \"[[399], [691], [15], [141578], [7115, 62763]]\"}", "columns": ["query_sent_id", "query_doc_id", "query_text", "query_ref_ids", "related_sent_id", "related_doc_id", "related_text", "related_ref_ids", "es_neighbors_text", "es_neighbors_sent_id", "es_neighbors_doc_id", "es_neighbors_ref_ids"], "columns_mapping": {"query.sent_id": "query_sent_id", "query.doc_id": "query_doc_id", "query.text": "query_text", "query.ref_ids": "query_ref_ids", "related.sent_id": "related_sent_id", "related.doc_id": "related_doc_id", "related.text": "related_text", "related.ref_ids": "related_ref_ids", "es_neighbors.text": "es_neighbors_text", "es_neighbors.sent_id": "es_neighbors_sent_id", "es_neighbors.doc_id": "es_neighbors_doc_id", "es_neighbors.ref_ids": "es_neighbors_ref_ids"}, "dataset_description": "German Legal Sentences (GLS) is an automatically generated training dataset for semantic sentence \nmatching in the domain in german legal documents. It follows the concept of weak supervision, where \nimperfect labels are generated using multiple heuristics. For this purpose we use a combination of \nlegal citation matching and BM25 similarity. 
The contained sentences and their citations are parsed \nfrom real judicial decisions provided by [Open Legal Data](http://openlegaldata.io/)\n", "dataset_name": "lavis-nlp/german_legal_sentences"}}, "tags": ["task_categories:text-retrieval", "task_ids:semantic-similarity-scoring", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:de"], "is_gated": false}, "lhoestq/test": {"dataset_name": "lhoestq/test", "description": "This is a test dataset.", "downloads": 355, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"- Hello there !\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is a test dataset.\n", "dataset_name": "lhoestq/test"}}, "tags": ["annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "liweili/c4_200m": {"dataset_name": "liweili/c4_200m", "description": "\\\r\nGEC Dataset Generated from C4", "downloads": 29, "configs": {"default": {"config_name": "default", "sample_row": "{\"input\": \"\\\"Bitcoin is for $7,094 this morning, which CoinDes...\", \"output\": \"\\\"Bitcoin goes for $7,094 this morning, according t...\"}", "columns": ["input", "output"], "columns_mapping": {"input": "input", "output": "output"}, "dataset_description": "GEC Dataset Generated from C4\n", "dataset_name": "liweili/c4_200m"}}, "tags": ["task_categories:text-generation", "source_datasets:allenai/c4", "language:en", "grammatical-error-correction"], "is_gated": false}, "metaeval/blimp_classification": {"dataset_name": "metaeval/blimp_classification", "description": "Acceptable/non acceptable sentences (recasted as a classification task)", "downloads": 45, "configs": {"semantics": {"config_name": "semantics", "sample_row": "{\"sentence\": \"\\\"There was each vase aggravating Carol.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], 
"columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "Acceptable/non acceptable sentences (recasted as a classification task)\n", "dataset_name": "metaeval/blimp_classification"}, "syntax": {"config_name": "syntax", "sample_row": "{\"sentence\": \"\\\"Many senators were collaborated by Lucille.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "Acceptable/non acceptable sentences (recasted as a classification task)\n", "dataset_name": "metaeval/blimp_classification"}, "morphology": {"config_name": "morphology", "sample_row": "{\"sentence\": \"\\\"Some girl hired that pedestrians.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "Acceptable/non acceptable sentences (recasted as a classification task)\n", "dataset_name": "metaeval/blimp_classification"}, "syntax+semantics": {"config_name": "syntax+semantics", "sample_row": "{\"sentence\": \"\\\"Debra is imagining herself looking like these pho...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "Acceptable/non acceptable sentences (recasted as a classification task)\n", "dataset_name": "metaeval/blimp_classification"}, "syntax_semantics": {"config_name": "syntax_semantics", "sample_row": "{\"sentence\": \"\\\"Ronald preferred it to be interesting that Rhonda...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "Acceptable/non acceptable sentences (recasted as a classification task)\n", "dataset_name": "metaeval/blimp_classification"}}, "tags": 
["task_categories:text-classification", "task_ids:acceptability-classification", "language:en", "cola"], "is_gated": false}, "tasksource/crowdflower": {"dataset_name": "tasksource/crowdflower", "description": "Collection of crowdflower classification datasets", "downloads": 78, "configs": {"sentiment_nuclear_power": {"config_name": "sentiment_nuclear_power", "sample_row": "{\"text\": \"\\\":Hello Japan is a nuclear power plant crisis. {li...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "tweet_global_warming": {"config_name": "tweet_global_warming", "sample_row": "{\"text\": \"\\\"Global warming report urges governments to act|BR...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "airline-sentiment": {"config_name": "airline-sentiment", "sample_row": "{\"text\": \"\\\"@VirginAmerica What @dhepburn said.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "corporate-messaging": {"config_name": "corporate-messaging", "sample_row": "{\"text\": \"\\\"Barclays CEO stresses the importance of regulator...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "economic-news": {"config_name": "economic-news", 
"sample_row": "{\"text\": \"\\\"\\\\\\\"NEW YORK -- Yields on most certificates of depo...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "political-media-audience": {"config_name": "political-media-audience", "sample_row": "{\"text\": \"\\\"RT @nowthisnews: Rep. Trey Radel (R- #FL) slams #...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "political-media-bias": {"config_name": "political-media-bias", "sample_row": "{\"text\": \"\\\"RT @nowthisnews: Rep. Trey Radel (R- #FL) slams #...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "political-media-message": {"config_name": "political-media-message", "sample_row": "{\"text\": \"\\\"RT @nowthisnews: Rep. 
Trey Radel (R- #FL) slams #...\", \"label\": \"2\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "text_emotion": {"config_name": "text_emotion", "sample_row": "{\"text\": \"\\\"@tiffanylue i know i was listenin to bad habit e...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "task_ids:fact-checking", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en"], "is_gated": false}, "metaeval/ethics": {"dataset_name": "metaeval/ethics", "description": "Probing for ethics understanding", "downloads": 1730, "configs": {"commonsense": {"config_name": "commonsense", "sample_row": "{\"text\": \"\\\"I went to the principal's office to change my rec...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "Probing for ethics understanding\n", "dataset_name": "metaeval/ethics"}, "deontology": {"config_name": "deontology", "sample_row": "{\"text\": \"\\\"Aren't you supposed to be preparing for the meeti...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "Probing for ethics understanding\n", "dataset_name": "metaeval/ethics"}, "justice": {"config_name": "justice", "sample_row": "{\"text\": \"\\\"I deserve to be paid by my boss because I keep he...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], 
"columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "Probing for ethics understanding\n", "dataset_name": "metaeval/ethics"}, "virtue": {"config_name": "virtue", "sample_row": "{\"sentence1\": \"\\\"Jill wore a new style of distinctive clothes each...\", \"sentence2\": \"\\\" discredit\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence1", "sentence2", "label", "idx"], "columns_mapping": {"sentence1": "sentence1", "sentence2": "sentence2", "label": "label", "idx": "idx"}, "dataset_description": "Probing for ethics understanding\n", "dataset_name": "metaeval/ethics"}}, "tags": ["task_categories:text-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en"], "is_gated": false}, "metaeval/linguisticprobing": {"dataset_name": "metaeval/linguisticprobing", "description": "10 probing tasks designed to capture simple linguistic features of sentences,", "downloads": 131, "configs": {"subj_number": {"config_name": "subj_number", "sample_row": "{\"sentence\": \"\\\"Coming from a xenophobic race that possesses the ...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "word_content": {"config_name": "word_content", "sample_row": "{\"sentence\": \"\\\"It just hadn 't seemed important, and he didn 't ...\", \"label\": \"552\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "obj_number": {"config_name": "obj_number", "sample_row": "{\"sentence\": \"\\\"Money would replace the drugs in the bags, 
once t...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "past_present": {"config_name": "past_present", "sample_row": "{\"sentence\": \"\\\"She shone her light around the space, following t...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "sentence_length": {"config_name": "sentence_length", "sample_row": "{\"sentence\": \"\\\"But it was not here.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "top_constituents": {"config_name": "top_constituents", "sample_row": "{\"sentence\": \"\\\"I wanted to start asking questions now, but force...\", \"label\": \"7\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "tree_depth": {"config_name": "tree_depth", "sample_row": "{\"sentence\": \"\\\"Who knew who would be there?\\\"\", \"label\": \"5\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic 
features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "coordination_inversion": {"config_name": "coordination_inversion", "sample_row": "{\"sentence\": \"\\\"She was a regular at the Friday charity sessions,...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "odd_man_out": {"config_name": "odd_man_out", "sample_row": "{\"sentence\": \"\\\"Gideon brought his phone to his ear and resonated...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "bigram_shift": {"config_name": "bigram_shift", "sample_row": "{\"sentence\": \"\\\"A week she'd been with the man, just a week, and ...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}}, "tags": ["task_categories:text-classification", "annotations_creators:machine-generated", "multilinguality:monolingual", "language:en"], "is_gated": false}, "metaeval/recast": {"dataset_name": "metaeval/recast", "description": "A diverse collection of tasks recasted as natural language inference tasks.", "downloads": 83, "configs": {"recast_kg_relations": {"config_name": "recast_kg_relations", "sample_row": "{\"context\": \"\\\"Diplomats say Assad 's absence from the meeting a...\", \"hypothesis\": \"\\\"Assad was buried in Syria .\\\"\", \"label\": \"1\", 
\"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_puns": {"config_name": "recast_puns", "sample_row": "{\"context\": \"\\\"Michaela heard that the agreeable tennis umpire w...\", \"hypothesis\": \"\\\"Michaela heard a pun\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_factuality": {"config_name": "recast_factuality", "sample_row": "{\"context\": \"\\\"We had a larger black population in the 70s than ...\", \"hypothesis\": \"\\\"The having happened\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_verbnet": {"config_name": "recast_verbnet", "sample_row": "{\"context\": \"\\\"David constructed a house .\\\"\", \"hypothesis\": \"\\\"David caused the constructing .\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_verbcorner": {"config_name": "recast_verbcorner", "sample_row": "{\"context\": \"\\\"Samantha enjoyed the 
blinch.\\\"\", \"hypothesis\": \"\\\"Something good happened .\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_ner": {"config_name": "recast_ner", "sample_row": "{\"context\": \"\\\"Mexican President Felipe Calderon has sought more...\", \"hypothesis\": \"\\\"Mexican is a day of the week\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_sentiment": {"config_name": "recast_sentiment", "sample_row": "{\"context\": \"\\\"When asked about the product, Eniyah said, 'I had...\", \"hypothesis\": \"\\\"Eniyah liked the product . 
\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_megaveridicality": {"config_name": "recast_megaveridicality", "sample_row": "{\"context\": \"\\\"someone confirmed that a particular thing happene...\", \"hypothesis\": \"\\\"that thing happened .\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "nli", "natural-language-inference"], "is_gated": false}, "midas/inspec": {"dataset_name": "midas/inspec", "description": "Benchmark dataset for automatic identification of keyphrases from text published with the work - Improved automatic keyword extraction given more linguistic knowledge. Anette Hulth. In Proceedings of EMNLP 2003. p. 
216-223.", "downloads": 459, "configs": {"extraction": {"config_name": "extraction", "sample_row": "{\"id\": \"1001\", \"document\": \"[\\\"A\\\", \\\"conflict\\\", \\\"between\\\", \\\"language\\\", \\\"and\\\", \\\"a...\", \"doc_bio_tags\": \"[\\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\",...\"}", "columns": ["id", "document", "doc_bio_tags"], "columns_mapping": {"id": "id", "document": "document", "doc_bio_tags": "doc_bio_tags"}, "dataset_description": "Benchmark dataset for automatic identification of keyphrases from text published with the work - Improved automatic keyword extraction given more linguistic knowledge. Anette Hulth. In Proceedings of EMNLP 2003. p. 216-223.\n", "dataset_name": "midas/inspec"}, "generation": {"config_name": "generation", "sample_row": "{\"id\": \"1001\", \"document\": \"[\\\"A\\\", \\\"conflict\\\", \\\"between\\\", \\\"language\\\", \\\"and\\\", \\\"a...\", \"extractive_keyphrases\": \"[\\\"philosophy of mind\\\", \\\"content atomism\\\", \\\"ibs\\\", \\\"...\", \"abstractive_keyphrases\": \"[\\\"information-based semantics\\\"]\"}", "columns": ["id", "document", "extractive_keyphrases", "abstractive_keyphrases"], "columns_mapping": {"id": "id", "document": "document", "extractive_keyphrases": "extractive_keyphrases", "abstractive_keyphrases": "abstractive_keyphrases"}, "dataset_description": "Benchmark dataset for automatic identification of keyphrases from text published with the work - Improved automatic keyword extraction given more linguistic knowledge. Anette Hulth. In Proceedings of EMNLP 2003. p. 
216-223.\n", "dataset_name": "midas/inspec"}, "raw": {"config_name": "raw", "sample_row": "{\"id\": \"1001\", \"document\": \"[\\\"A\\\", \\\"conflict\\\", \\\"between\\\", \\\"language\\\", \\\"and\\\", \\\"a...\", \"doc_bio_tags\": \"[\\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\",...\", \"extractive_keyphrases\": \"[\\\"philosophy of mind\\\", \\\"content atomism\\\", \\\"ibs\\\", \\\"...\", \"abstractive_keyphrases\": \"[\\\"information-based semantics\\\"]\", \"other_metadata.text\": \"[]\", \"other_metadata.bio_tags\": \"[]\"}", "columns": ["id", "document", "doc_bio_tags", "extractive_keyphrases", "abstractive_keyphrases", "other_metadata_text", "other_metadata_bio_tags"], "columns_mapping": {"id": "id", "document": "document", "doc_bio_tags": "doc_bio_tags", "extractive_keyphrases": "extractive_keyphrases", "abstractive_keyphrases": "abstractive_keyphrases", "other_metadata.text": "other_metadata_text", "other_metadata.bio_tags": "other_metadata_bio_tags"}, "dataset_description": "Benchmark dataset for automatic identification of keyphrases from text published with the work - Improved automatic keyword extraction given more linguistic knowledge. Anette Hulth. In Proceedings of EMNLP 2003. p. 
216-223.\n", "dataset_name": "midas/inspec"}}, "tags": [], "is_gated": false}, "midas/ldkp10k": {"dataset_name": "midas/ldkp10k", "description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.", "downloads": 10, "configs": {"small": {"config_name": "small", "sample_row": "{\"id\": \"\\\"18980016\\\"\", \"sections\": \"[\\\"introduction\\\", \\\"application to diffraction grati...\", \"sec_text\": \"[[\\\"New\\\", \\\"and\\\", \\\"interesting\\\", \\\"theoretical\\\", \\\"cha...\", \"extractive_keyphrases\": \"[]\", \"abstractive_keyphrases\": \"[\\\"quantum mechanics\\\", \\\"quantum physics\\\"]\", \"sec_bio_tags\": \"[[\\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\"...\"}", "columns": ["id", "sections", "sec_text", "extractive_keyphrases", "abstractive_keyphrases", "sec_bio_tags"], "columns_mapping": {"id": "id", "sections": "sections", "sec_text": "sec_text", "extractive_keyphrases": "extractive_keyphrases", "abstractive_keyphrases": "abstractive_keyphrases", "sec_bio_tags": "sec_bio_tags"}, "dataset_description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.\n", "dataset_name": "midas/ldkp10k"}, "medium": {"config_name": "medium", "sample_row": "{\"id\": \"\\\"18988100\\\"\", \"sections\": \"[\\\"introduction\\\", \\\"diffusion on ge(111)-c(2\\\\u00d78)...\", \"sec_text\": \"[[\\\"The\\\", \\\"successful\\\", \\\"production\\\", \\\"of\\\", \\\"electr...\", \"extractive_keyphrases\": \"[\\\"diffusion\\\", \\\"saddle point\\\", \\\"thin film\\\"]\", \"abstractive_keyphrases\": \"[\\\"materials science\\\", \\\"growth mechanism\\\", \\\"germani...\", \"sec_bio_tags\": \"[[\\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\"...\"}", "columns": ["id", "sections", "sec_text", "extractive_keyphrases", "abstractive_keyphrases", "sec_bio_tags"], "columns_mapping": {"id": "id", 
"sections": "sections", "sec_text": "sec_text", "extractive_keyphrases": "extractive_keyphrases", "abstractive_keyphrases": "abstractive_keyphrases", "sec_bio_tags": "sec_bio_tags"}, "dataset_description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.\n", "dataset_name": "midas/ldkp10k"}, "large": {"config_name": "large", "sample_row": "{\"id\": \"\\\"18980258\\\"\", \"sections\": \"[\\\"visual sleep staging is still the most widely us...\", \"sec_text\": \"[[\\\"TO\\\", \\\"SUBDIVIDE\\\", \\\"SLEEP\\\", \\\"RECORDINGS\\\", \\\"INTO\\\"...\", \"extractive_keyphrases\": \"[\\\"classification\\\"]\", \"abstractive_keyphrases\": \"[\\\"sleep wake cycle\\\", \\\"electrodiagnosis\\\", \\\"electrop...\", \"sec_bio_tags\": \"[[\\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\"...\"}", "columns": ["id", "sections", "sec_text", "extractive_keyphrases", "abstractive_keyphrases", "sec_bio_tags"], "columns_mapping": {"id": "id", "sections": "sections", "sec_text": "sec_text", "extractive_keyphrases": "extractive_keyphrases", "abstractive_keyphrases": "abstractive_keyphrases", "sec_bio_tags": "sec_bio_tags"}, "dataset_description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.\n", "dataset_name": "midas/ldkp10k"}}, "tags": [], "is_gated": false}, "mideind/icelandic-error-corpus-IceEC": {"dataset_name": "mideind/icelandic-error-corpus-IceEC", "description": "The Icelandic Error Corpus (IceEC) is a collection of texts in modern Icelandic annotated for mistakes related to spelling, grammar, and other issues. The texts are organized by genre. 
The current version includes sentences from student essays, online news texts and Wikipedia articles.\nSentences within texts in the student essays had to be shuffled due to the license which they were originally published under, but neither the online news texts nor the Wikipedia articles needed to be shuffled.", "downloads": 32, "configs": {"fine-grained": {"config_name": "fine-grained", "sample_row": "{\"idx\": \"\\\"0\\\"\", \"sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\", \"errors\": \"[[], [], [], [], [], [], [], [], [], [], [], [], [...\", \"has_error\": \"false\", \"corrected_sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\"}", "columns": ["idx", "sentence", "errors", "has_error", "corrected_sentence"], "columns_mapping": {"idx": "idx", "sentence": "sentence", "errors": "errors", "has_error": "has_error", "corrected_sentence": "corrected_sentence"}, "dataset_description": "The Icelandic Error Corpus (IceEC) is a collection of texts in modern Icelandic annotated for mistakes related to spelling, grammar, and other issues. The texts are organized by genre. 
The current version includes sentences from student essays, online news texts and Wikipedia articles.\nSentences within texts in the student essays had to be shuffled due to the license which they were originally published under, but neither the online news texts nor the Wikipedia articles needed to be shuffled.\n", "dataset_name": "mideind/icelandic-error-corpus-IceEC"}, "subcategory": {"config_name": "subcategory", "sample_row": "{\"idx\": \"\\\"0\\\"\", \"sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\", \"errors\": \"[[], [], [], [], [], [], [], [], [], [], [], [], [...\", \"has_error\": \"false\", \"corrected_sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\"}", "columns": ["idx", "sentence", "errors", "has_error", "corrected_sentence"], "columns_mapping": {"idx": "idx", "sentence": "sentence", "errors": "errors", "has_error": "has_error", "corrected_sentence": "corrected_sentence"}, "dataset_description": "The Icelandic Error Corpus (IceEC) is a collection of texts in modern Icelandic annotated for mistakes related to spelling, grammar, and other issues. The texts are organized by genre. 
The current version includes sentences from student essays, online news texts and Wikipedia articles.\nSentences within texts in the student essays had to be shuffled due to the license which they were originally published under, but neither the online news texts nor the Wikipedia articles needed to be shuffled.\n", "dataset_name": "mideind/icelandic-error-corpus-IceEC"}, "category": {"config_name": "category", "sample_row": "{\"idx\": \"\\\"0\\\"\", \"sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\", \"errors\": \"[[], [], [], [], [], [], [], [], [], [], [], [], [...\", \"has_error\": \"false\", \"corrected_sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\"}", "columns": ["idx", "sentence", "errors", "has_error", "corrected_sentence"], "columns_mapping": {"idx": "idx", "sentence": "sentence", "errors": "errors", "has_error": "has_error", "corrected_sentence": "corrected_sentence"}, "dataset_description": "The Icelandic Error Corpus (IceEC) is a collection of texts in modern Icelandic annotated for mistakes related to spelling, grammar, and other issues. The texts are organized by genre. 
The current version includes sentences from student essays, online news texts and Wikipedia articles.\nSentences within texts in the student essays had to be shuffled due to the license which they were originally published under, but neither the online news texts nor the Wikipedia articles needed to be shuffled.\n", "dataset_name": "mideind/icelandic-error-corpus-IceEC"}}, "tags": ["annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:is"], "is_gated": false}, "ml6team/cnn_dailymail_nl": {"dataset_name": "ml6team/cnn_dailymail_nl", "description": " This dataset is the CNN/Dailymail dataset translated to Dutch.\n This is the original dataset:\n ```\n load_dataset(\"cnn_dailymail\", '3.0.0')\n ```\n And this is the HuggingFace translation pipeline:\n ```\n pipeline(\n task='translation_en_to_nl',\n model='Helsinki-NLP/opus-mt-en-nl',\n tokenizer='Helsinki-NLP/opus-mt-en-nl')\n ```", "downloads": 50, "configs": {"default": {"config_name": "default", "sample_row": "{\"article\": \"\\\"(CNN) -- de bewering van de Amerikaanse minister ...\", \"highlights\": \"\\\"Anti-terrorisme beleid leeft op de rand van het i...\", \"id\": \"\\\"0d8f8bad4680a1ab57197f60923e8cf71c748d6f\\\"\"}", "columns": ["article", "highlights", "id"], "columns_mapping": {"article": "article", "highlights": "highlights", "id": "id"}, "dataset_description": " This dataset is the CNN/Dailymail dataset translated to Dutch.\n This is the original dataset:\n ```\n load_dataset(\"cnn_dailymail\", '3.0.0')\n ```\n And this is the HuggingFace translation pipeline:\n ```\n pipeline(\n task='translation_en_to_nl',\n model='Helsinki-NLP/opus-mt-en-nl',\n tokenizer='Helsinki-NLP/opus-mt-en-nl')\n ```\n", "dataset_name": "ml6team/cnn_dailymail_nl"}}, "tags": ["annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:https://github.com/huggingface/datasets/tree/master/datasets/cnn_dailymail", "language:nl"], "is_gated": false}, 
"indonesian-nlp/mc4-id": {"dataset_name": "indonesian-nlp/mc4-id", "description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.", "downloads": 72, "configs": {"tiny": {"config_name": "tiny", "sample_row": "{\"text\": \"\\\"Rentetan Program Kegiatan Prudential yang Harus A...\", \"timestamp\": \"\\\"2020-01-23T09:19:49Z\\\"\", \"url\": \"\\\"https://www.prudential.co.id/id/Informasi-untuk-A...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "indonesian-nlp/mc4-id"}, "small": {"config_name": "small", "sample_row": "{\"text\": \"\\\"Rentetan Program Kegiatan Prudential yang Harus A...\", \"timestamp\": \"\\\"2020-01-23T09:19:49Z\\\"\", \"url\": \"\\\"https://www.prudential.co.id/id/Informasi-untuk-A...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": 
"indonesian-nlp/mc4-id"}, "medium": {"config_name": "medium", "sample_row": "{\"text\": \"\\\"Rentetan Program Kegiatan Prudential yang Harus A...\", \"timestamp\": \"\\\"2020-01-23T09:19:49Z\\\"\", \"url\": \"\\\"https://www.prudential.co.id/id/Informasi-untuk-A...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "indonesian-nlp/mc4-id"}, "large": {"config_name": "large", "sample_row": "{\"text\": \"\\\"Rentetan Program Kegiatan Prudential yang Harus A...\", \"timestamp\": \"\\\"2020-01-23T09:19:49Z\\\"\", \"url\": \"\\\"https://www.prudential.co.id/id/Informasi-untuk-A...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "indonesian-nlp/mc4-id"}, "full": {"config_name": "full", "sample_row": "{\"text\": \"\\\"Rentetan Program Kegiatan Prudential yang Harus A...\", \"timestamp\": \"\\\"2020-01-23T09:19:49Z\\\"\", \"url\": \"\\\"https://www.prudential.co.id/id/Informasi-untuk-A...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the 
Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "indonesian-nlp/mc4-id"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:extended", "language:id"], "is_gated": false}, "mvarma/medwiki": {"dataset_name": "mvarma/medwiki", "description": "MedWiki is a large-scale sentence dataset collected from Wikipedia with medical entity (UMLS) annotations. This dataset is intended for pretraining.", "downloads": 37, "configs": {"medwiki_full": {"config_name": "medwiki_full", "sample_row": "{\"mentions\": \"[\\\"dahl\\\", \\\"dahl\\\", \\\"dahl\\\"]\", \"entities\": \"[\\\"C0600533\\\", \\\"C0600533\\\", \\\"C0600533\\\"]\", \"entity_titles\": \"[\\\"Rats, Inbred Dahl\\\", \\\"Rats, Inbred Dahl\\\", \\\"Rats, ...\", \"types\": \"[[\\\"Mammal\\\", \\\"writer\\\", \\\"poet\\\", \\\"screenwriter\\\", \\\"aut...\", \"spans\": \"[[10, 11], [12, 14], [16, 18]]\", \"sentence\": \"\\\"Receiving the 1983 World Fantasy Award for Life A...\", \"sent_idx_unq\": \"44000000\"}", "columns": ["mentions", "entities", "entity_titles", "types", "spans", "sentence", "sent_idx_unq"], "columns_mapping": {"mentions": "mentions", "entities": "entities", "entity_titles": "entity_titles", "types": "types", "spans": "spans", "sentence": "sentence", "sent_idx_unq": "sent_idx_unq"}, "dataset_description": "MedWiki is a large-scale sentence dataset collected from Wikipedia with medical entity (UMLS) annotations. 
This dataset is intended for pretraining.\n", "dataset_name": "mvarma/medwiki"}, "medwiki_hq": {"config_name": "medwiki_hq", "sample_row": "{\"mentions\": \"[\\\"czechoslovakia\\\"]\", \"entities\": \"[\\\"C0010872\\\"]\", \"entity_titles\": \"[\\\"Czechoslovakia\\\"]\", \"types\": \"[[\\\"Geographic Area\\\", \\\"historical country\\\", \\\"sovere...\", \"spans\": \"[[10, 11]]\", \"sentence\": \"\\\"The Czechoslovakia men 's national under-21 volle...\", \"sent_idx_unq\": \"44000003\"}", "columns": ["mentions", "entities", "entity_titles", "types", "spans", "sentence", "sent_idx_unq"], "columns_mapping": {"mentions": "mentions", "entities": "entities", "entity_titles": "entity_titles", "types": "types", "spans": "spans", "sentence": "sentence", "sent_idx_unq": "sent_idx_unq"}, "dataset_description": "MedWiki is a large-scale sentence dataset collected from Wikipedia with medical entity (UMLS) annotations. This dataset is intended for pretraining.\n", "dataset_name": "mvarma/medwiki"}}, "tags": ["task_categories:text-retrieval", "task_ids:entity-linking-retrieval", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:extended|wikipedia"], "is_gated": false}, "ought/raft": {"dataset_name": "ought/raft", "description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? 
\n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)", "downloads": 15611, "configs": {"ade_corpus_v2": {"config_name": "ade_corpus_v2", "sample_row": "{\"Sentence\": \"\\\"No regional side effects were noted.\\\"\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Sentence", "ID", "Label"], "columns_mapping": {"Sentence": "Sentence", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "banking_77": {"config_name": "banking_77", "sample_row": "{\"Query\": \"\\\"Is it possible for me to change my PIN number?\\\"...\", \"ID\": \"0\", \"Label\": \"23\"}", "columns": ["Query", "ID", "Label"], "columns_mapping": {"Query": "Query", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? 
\n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "terms_of_service": {"config_name": "terms_of_service", "sample_row": "{\"Sentence\": \"\\\"Crowdtangle may change these terms of service, as...\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Sentence", "ID", "Label"], "columns_mapping": {"Sentence": "Sentence", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "tai_safety_research": {"config_name": "tai_safety_research", "sample_row": "{\"Title\": \"\\\"Malign generalization without internal search\\\"\", \"Abstract Note\": \"\\\"In my last post, I challenged the idea that inner...\", \"Url\": \"\\\"https://www.alignmentforum.org/posts/ynt9TD6PrYw6...\", \"Publication Year\": \"\\\"2020\\\"\", \"Item Type\": \"\\\"blogPost\\\"\", \"Author\": \"\\\"Barnett, Matthew\\\"\", \"Publication Title\": \"\\\"AI Alignment Forum\\\"\", \"ID\": \"0\", \"Label\": \"1\"}", "columns": ["Title", "Abstract Note", "Url", "Publication Year", "Item Type", "Author", 
"Publication Title", "ID", "Label"], "columns_mapping": {"Title": "Title", "Abstract Note": "Abstract Note", "Url": "Url", "Publication Year": "Publication Year", "Item Type": "Item Type", "Author": "Author", "Publication Title": "Publication Title", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "neurips_impact_statement_risks": {"config_name": "neurips_impact_statement_risks", "sample_row": "{\"Paper title\": \"\\\"Auto-Panoptic: Cooperative Multi-Component Archit...\", \"Paper link\": \"\\\"https://proceedings.neurips.cc/paper/2020/file/ec...\", \"Impact statement\": \"\\\"This work makes the first attempt to search for a...\", \"ID\": \"0\", \"Label\": \"1\"}", "columns": ["Paper title", "Paper link", "Impact statement", "ID", "Label"], "columns_mapping": {"Paper title": "Paper title", "Paper link": "Paper link", "Impact statement": "Impact statement", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? 
\n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "overruling": {"config_name": "overruling", "sample_row": "{\"Sentence\": \"\\\"in light of both our holding today and previous r...\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Sentence", "ID", "Label"], "columns_mapping": {"Sentence": "Sentence", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "systematic_review_inclusion": {"config_name": "systematic_review_inclusion", "sample_row": "{\"Title\": \"\\\"Prototyping and transforming facial textures for ...\", \"Abstract\": \"\\\"Wavelet based methods for prototyping facial text...\", \"Authors\": \"\\\"Tiddeman, B.; Burt, M.; Perrett, D.\\\"\", \"Journal\": \"\\\"IEEE Comput Graphics Appl\\\"\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Title", "Abstract", "Authors", "Journal", "ID", "Label"], "columns_mapping": {"Title": "Title", "Abstract": "Abstract", "Authors": "Authors", "Journal": "Journal", "ID": "ID", "Label": "Label"}, 
"dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "one_stop_english": {"config_name": "one_stop_english", "sample_row": "{\"Article\": \"\\\"For 85 years, it was just a grey blob on classroo...\", \"ID\": \"0\", \"Label\": \"3\"}", "columns": ["Article", "ID", "Label"], "columns_mapping": {"Article": "Article", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? 
\n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "tweet_eval_hate": {"config_name": "tweet_eval_hate", "sample_row": "{\"Tweet\": \"\\\"New to Twitter-- any men on here know what the pr...\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Tweet", "ID", "Label"], "columns_mapping": {"Tweet": "Tweet", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "twitter_complaints": {"config_name": "twitter_complaints", "sample_row": "{\"Tweet text\": \"\\\"@HMRCcustomers No this is my first job\\\"\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Tweet text", "ID", "Label"], "columns_mapping": {"Tweet text": "Tweet text", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? 
\n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "semiconductor_org_types": {"config_name": "semiconductor_org_types", "sample_row": "{\"Paper title\": \"\\\"3Gb/s AC-coupled chip-to-chip communication using...\", \"Organization name\": \"\\\"North Carolina State Univ.,Raleigh,NC,USA\\\"\", \"ID\": \"0\", \"Label\": \"3\"}", "columns": ["Paper title", "Organization name", "ID", "Label"], "columns_mapping": {"Paper title": "Paper title", "Organization name": "Organization name", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? 
\n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "source_datasets:extended|ade_corpus_v2", "source_datasets:extended|banking77", "language:en"], "is_gated": false}, "pasinit/xlwic": {"dataset_name": "pasinit/xlwic", "description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. 
The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)", "downloads": 336, "configs": {"xlwic_en_bg": {"config_name": "xlwic_en_bg", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. 
The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_zh": {"config_name": "xlwic_en_zh", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. 
Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_hr": {"config_name": "xlwic_en_hr", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. 
Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_da": {"config_name": "xlwic_en_da", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. 
XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_nl": {"config_name": "xlwic_en_nl", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given 
language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_et": {"config_name": "xlwic_en_et", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a 
given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_fa": {"config_name": "xlwic_en_fa", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context 
of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_ja": {"config_name": "xlwic_en_ja", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a 
context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_ko": {"config_name": "xlwic_en_ko", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word 
in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_it": {"config_name": "xlwic_en_it", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a 
word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_fr": {"config_name": "xlwic_en_fr", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning 
of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_de": {"config_name": "xlwic_en_de", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended 
meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_it_it": {"config_name": "xlwic_it_it", "sample_row": "{\"id\": \"\\\"IT_0\\\"\", \"context_1\": \"\\\"Improvvisamente il padre di famiglia: \\\\\\\"Tesoro, a...\", \"context_2\": \"\\\"C'\\\\u00e8 un progetto per Milano, a Segrate, per u...\", \"target_word\": \"\\\"centro commerciale\\\"\", \"pos\": \"\\\"N\\\"\", \"target_word_location_1.char_start\": \"58\", \"target_word_location_1.char_end\": \"76\", \"target_word_location_2.char_start\": \"46\", \"target_word_location_2.char_end\": \"64\", \"language\": \"\\\"IT\\\"\", \"label\": \"1\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, 
"dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_fr_fr": {"config_name": "xlwic_fr_fr", "sample_row": "{\"id\": \"\\\"FR_0\\\"\", \"context_1\": \"\\\"Comme l'indique le Shuowen Jiezi (dans son commen...\", \"context_2\": \"\\\"D\\\\u2019apr\\\\u00e8s le dictionnaire \\\\u00e9tymologiq...\", \"target_word\": \"\\\"ShuoWen\\\"\", \"pos\": \"\\\"N\\\"\", \"target_word_location_1.char_start\": \"19\", \"target_word_location_1.char_end\": \"26\", \"target_word_location_2.char_start\": \"37\", \"target_word_location_2.char_end\": \"44\", \"language\": \"\\\"FR\\\"\", \"label\": \"1\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": 
"target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_de_de": {"config_name": "xlwic_de_de", "sample_row": "{\"id\": \"\\\"DE_0\\\"\", \"context_1\": \"\\\"Herr Starke wollte uns kein Interview geben.\\\"\", \"context_2\": \"\\\"Das kann ich dir aber sagen: Wenn die Frau Starke...\", \"target_word\": \"\\\"Starke\\\"\", \"pos\": \"\\\"N\\\"\", \"target_word_location_1.char_start\": \"5\", \"target_word_location_1.char_end\": \"11\", \"target_word_location_2.char_start\": \"43\", \"target_word_location_2.char_end\": \"49\", \"language\": \"\\\"DE\\\"\", \"label\": \"1\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": 
"target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:bg", "language:zh", "language:hr", "language:da", "language:nl", "language:et", "language:fa", "language:ja", "language:ko", "language:it", "language:fr", "language:de"], "is_gated": false}, "peixian/equity_evaluation_corpus": {"dataset_name": "peixian/equity_evaluation_corpus", "description": "Automatic machine learning systems can inadvertently accentuate and perpetuate inappropriate human biases. Past work on examining inappropriate biases has largely focused on just individual systems and resources. Further, there is a lack of benchmark datasets for examining inappropriate biases in system predictions. Here, we present the Equity Evaluation Corpus (EEC), which consists of 8,640 English sentences carefully chosen to tease out biases towards certain races and genders. 
We used the dataset to examine 219 automatic sentiment analysis systems that took part in a recent shared task, SemEval-2018 Task 1 \u2018Affect in Tweets\u2019. We found that several of the systems showed statistically significant bias; that is, they consistently provide slightly higher sentiment intensity predictions for one race or one gender. We make the EEC freely available, and encourage its use to evaluate biases in sentiment and other NLP tasks.", "downloads": 35, "configs": {"first_domain": {"config_name": "first_domain", "sample_row": "{\"sentence\": \"\\\"Alonzo feels angry.\\\"\", \"template\": \"\\\" feels .\\\"\", \"person\": \"\\\"Alonzo\\\"\", \"gender\": \"\\\"male\\\"\", \"race\": \"\\\"African-American\\\"\", \"emotion\": \"\\\"anger\\\"\", \"emotion word\": \"\\\"angry\\\"\"}", "columns": ["sentence", "template", "person", "gender", "race", "emotion", "emotion word"], "columns_mapping": {"sentence": "sentence", "template": "template", "person": "person", "gender": "gender", "race": "race", "emotion": "emotion", "emotion word": "emotion word"}, "dataset_description": "Automatic machine learning systems can inadvertently accentuate and perpetuate inappropriate human biases. Past work on examining inappropriate biases has largely focused on just individual systems and resources. Further, there is a lack of benchmark datasets for examining inappropriate biases in system predictions. Here, we present the Equity Evaluation Corpus (EEC), which consists of 8,640 English sentences carefully chosen to tease out biases towards certain races and genders. We used the dataset to examine 219 automatic sentiment analysis systems that took part in a recent shared task, SemEval-2018 Task 1 \u2018Affect in Tweets\u2019. We found that several of the systems showed statistically significant bias; that is, they consistently provide slightly higher sentiment intensity predictions for one race or one gender. 
We make the EEC freely available, and encourage its use to evaluate biases in sentiment and other NLP tasks.\n", "dataset_name": "peixian/equity_evaluation_corpus"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "gender-classification"], "is_gated": false}, "persiannlp/parsinlu_entailment": {"dataset_name": "persiannlp/parsinlu_entailment", "description": "A Persian textual entailment task (deciding `sent1` entails `sent2`).", "downloads": 56, "configs": {"parsinlu-repo": {"config_name": "parsinlu-repo", "sample_row": "{\"sent1\": \"\\\"\\\\u0632\\\\u0646\\\\u0627\\\\u0646 \\\\u0628\\\\u0647 \\\\u0642\\\\u062...\", \"sent2\": \"\\\"\\\\u0645\\\\u0631\\\\u062f\\\\u0627\\\\u0646 \\\\u0628\\\\u062e\\\\u0634...\", \"category\": \"\\\"translation-train\\\"\", \"label\": \"\\\"c\\\"\"}", "columns": ["sent1", "sent2", "category", "label"], "columns_mapping": {"sent1": "sent1", "sent2": "sent2", "category": "category", "label": "label"}, "dataset_description": "A Persian textual entailment task (deciding `sent1` entails `sent2`). \n", "dataset_name": "persiannlp/parsinlu_entailment"}}, "tags": ["task_ids:natural-language-inference", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|translated|mnli", "language:fa"], "is_gated": false}, "persiannlp/parsinlu_query_paraphrasing": {"dataset_name": "persiannlp/parsinlu_query_paraphrasing", "description": "A Persian query paraphrasing task (paraphrase or not, given two questions). 
\nThe questions are partly mined using Google auto-complete, and partly translated from Quora paraphrasing dataset.", "downloads": 20, "configs": {"parsinlu-repo": {"config_name": "parsinlu-repo", "sample_row": "{\"q1\": \"\\\"\\\\u0686\\\\u06af\\\\u0648\\\\u0646\\\\u0647 \\\\u0645\\\\u06cc \\\\u062...\", \"q2\": \"\\\"\\\\u0686\\\\u06af\\\\u0648\\\\u0646\\\\u0647 \\\\u0648\\\\u0632\\\\u0646...\", \"category\": \"\\\"qqp\\\"\", \"label\": \"\\\"1\\\"\"}", "columns": ["q1", "q2", "category", "label"], "columns_mapping": {"q1": "q1", "q2": "q2", "category": "category", "label": "label"}, "dataset_description": "A Persian query paraphrasing task (paraphrase or not, given two questions). \nThe questions are partly mined using Google auto-complete, and partly translated from Quora paraphrasing dataset. \n", "dataset_name": "persiannlp/parsinlu_query_paraphrasing"}}, "tags": ["annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|quora|google", "language:fa"], "is_gated": false}, "persiannlp/parsinlu_sentiment": {"dataset_name": "persiannlp/parsinlu_sentiment", "description": "A Persian sentiment analysis task (deciding whether a given sentence contains a particular sentiment).", "downloads": 61, "configs": {"parsinlu-repo": {"config_name": "parsinlu-repo", "sample_row": "{\"review\": \"\\\"\\\\u062f\\\\u0648\\\\u0633\\\\u062a\\\\u0627\\\\u0646 \\\\u062d\\\\u062a...\", \"review_id\": \"\\\"1\\\"\", \"example_id\": \"\\\"1\\\"\", \"excel_id\": \"\\\"food_1744\\\"\", \"question\": \"\\\"\\\\u0646\\\\u0638\\\\u0631 \\\\u0634\\\\u0645\\\\u0627 \\\\u062f\\\\u063...\", \"category\": \"\\\"\\\\u06af\\\\u0648\\\\u0634\\\\u062a \\\\u0645\\\\u0631\\\\u063a\\\"\", \"aspect\": \"\\\"\\\\u0637\\\\u0639\\\\u0645\\\"\", \"label\": \"\\\"-3\\\"\", \"guid\": \"\\\"food-train-r1-e1\\\"\"}", "columns": ["review", "review_id", "example_id", "excel_id", "question", "category", "aspect", "label", "guid"], "columns_mapping": {"review": "review", 
"review_id": "review_id", "example_id": "example_id", "excel_id": "excel_id", "question": "question", "category": "category", "aspect": "aspect", "label": "label", "guid": "guid"}, "dataset_description": "A Persian sentiment analysis task (deciding whether a given sentence contains a particular sentiment). \n", "dataset_name": "persiannlp/parsinlu_sentiment"}}, "tags": ["task_ids:sentiment-analysis", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|translated|mnli", "language:fa"], "is_gated": false}, "projecte-aina/ancora-ca-ner": {"dataset_name": "projecte-aina/ancora-ca-ner", "description": "AnCora Catalan NER.\n This is a dataset for Named Eentity Reacognition (NER) from Ancora corpus adapted for \n Machine Learning and Language Model evaluation purposes.\n Since multiwords (including Named Entites) in the original Ancora corpus are aggregated as \n a single lexical item using underscores (e.g. \"Ajuntament_de_Barcelona\") \n we splitted them to align with word-per-line format, and added conventional Begin-Inside-Outside (IOB)\n tags to mark and classify Named Entites. \n We did not filter out the different categories of NEs from Ancora (weak and strong). 
\n We did 6 minor edits by hand.\n AnCora corpus is used under [CC-by] (https://creativecommons.org/licenses/by/4.0/) licence.\n This dataset was developed by BSC TeMU as part of the AINA project, and to enrich the Catalan Language Understanding Benchmark (CLUB).", "downloads": 42, "configs": {"AncoraCaNer": {"config_name": "AncoraCaNer", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Una\\\", \\\"setantena\\\", \\\"de\\\", \\\"treballadors\\\", \\\"de\\\", \\\"...\", \"ner_tags\": \"[8, 8, 8, 8, 8, 2, 6, 6, 6, 6, 8, 0, 8, 8, 8, 8, 8...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "AnCora Catalan NER.\n This is a dataset for Named Eentity Reacognition (NER) from Ancora corpus adapted for \n Machine Learning and Language Model evaluation purposes.\n Since multiwords (including Named Entites) in the original Ancora corpus are aggregated as \n a single lexical item using underscores (e.g. \"Ajuntament_de_Barcelona\") \n we splitted them to align with word-per-line format, and added conventional Begin-Inside-Outside (IOB)\n tags to mark and classify Named Entites. \n We did not filter out the different categories of NEs from Ancora (weak and strong). \n We did 6 minor edits by hand.\n AnCora corpus is used under [CC-by] (https://creativecommons.org/licenses/by/4.0/) licence.\n This dataset was developed by BSC TeMU as part of the AINA project, and to enrich the Catalan Language Understanding Benchmark (CLUB).\n ", "dataset_name": "projecte-aina/ancora-ca-ner"}}, "tags": ["annotations_creators:expert-generated", "multilinguality:monolingual", "language:ca"], "is_gated": false}, "projecte-aina/casum": {"dataset_name": "projecte-aina/casum", "description": "CaSum is a summarization dataset. It is extracted from a newswire corpus crawled from the Catalan News Agency. 
The corpus consists of 217,735 instances that are composed by the headline and the body.", "downloads": 50, "configs": {"CaSum": {"config_name": "CaSum", "sample_row": "{\"summary\": \"\\\"El Govern convoca eleccions a la presid\\\\u00e8ncia...\", \"text\": \"\\\"El Govern ha aprovat aquest dimarts el decret pel...\"}", "columns": ["summary", "text"], "columns_mapping": {"summary": "summary", "text": "text"}, "dataset_description": "CaSum is a summarization dataset. It is extracted from a newswire corpus crawled from the Catalan News Agency. The corpus consists of 217,735 instances that are composed by the headline and the body.\n", "dataset_name": "projecte-aina/casum"}}, "tags": ["task_categories:summarization", "annotations_creators:machine-generated", "multilinguality:monolingual", "language:ca"], "is_gated": false}, "projecte-aina/viquiquad": {"dataset_name": "projecte-aina/viquiquad", "description": "ViquiQuAD: an extractive QA dataset from Catalan Wikipedia.\nThis dataset contains 3111 contexts extracted from a set of 597 high quality original (no translations) \narticles in the Catalan Wikipedia \"Viquip\u00e8dia\" (ca.wikipedia.org), and 1 to 5 questions with their\nanswer for each fragment. Viquipedia articles are used under CC-by-sa licence. \nThis dataset can be used to build extractive-QA and Language Models.\nFunded by the Generalitat de Catalunya, Departament de Pol\u00edtiques Digitals i Administraci\u00f3 P\u00fablica (AINA),\nMT4ALL and Plan de Impulso de las Tecnolog\u00edas del Lenguaje (Plan TL).", "downloads": 69, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"P_66_C_391_Q1\\\"\", \"title\": \"\\\"Xavier Miserachs i Ribalta\\\"\", \"context\": \"\\\"En aquesta \\\\u00e8poca es va consolidar el concept...\", \"question\": \"\\\"De qu\\\\u00e8 es diferenciava el reportatge fotogr\\\\...\", \"answers\": \"[{\\\"text\\\": \\\"del fotoperiodisme[n. 
2] i de la fotogr...\"}", "columns": ["id", "title", "context", "question", "answers"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers": "answers"}, "dataset_description": "ViquiQuAD: an extractive QA dataset from Catalan Wikipedia.\nThis dataset contains 3111 contexts extracted from a set of 597 high quality original (no translations) \narticles in the Catalan Wikipedia \"Viquip\u00e8dia\" (ca.wikipedia.org), and 1 to 5 questions with their\nanswer for each fragment. Viquipedia articles are used under CC-by-sa licence. \nThis dataset can be used to build extractive-QA and Language Models.\nFunded by the Generalitat de Catalunya, Departament de Pol\u00edtiques Digitals i Administraci\u00f3 P\u00fablica (AINA),\nMT4ALL and Plan de Impulso de las Tecnolog\u00edas del Lenguaje (Plan TL).\n", "dataset_name": "projecte-aina/viquiquad"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ca"], "is_gated": false}, "projecte-aina/wnli-ca": {"dataset_name": "projecte-aina/wnli-ca", "description": "professional translation into Catalan of Winograd NLI dataset as published in GLUE Benchmark.\n The Winograd NLI dataset presents 855 sentence pairs, \n in which the first sentence contains an ambiguity and the second one a possible interpretation of it. \n The label indicates if the interpretation is correct (1) or not (0).", "downloads": 22, "configs": {"winograd": {"config_name": "winograd", "sample_row": "{\"sentence1\": \"\\\"Vaig clavar una agulla en una pastanaga. 
Quan la ...\", \"sentence2\": \"\\\"La pastanaga tenia un forat.\\\"\", \"label\": \"1\"}", "columns": ["sentence1", "sentence2", "label"], "columns_mapping": {"sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "\n professional translation into Catalan of Winograd NLI dataset as published in GLUE Benchmark.\n The Winograd NLI dataset presents 855 sentence pairs, \n in which the first sentence contains an ambiguity and the second one a possible interpretation of it. \n The label indicates if the interpretation is correct (1) or not (0).\n ", "dataset_name": "projecte-aina/wnli-ca"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|glue", "language:ca"], "is_gated": false}, "qanastek/WMT-16-PubMed": {"dataset_name": "qanastek/WMT-16-PubMed", "description": "WMT'16 Biomedical Translation Task - PubMed parallel datasets\nhttp://www.statmt.org/wmt16/biomedical-translation-task.html", "downloads": 27, "configs": {"en-pt": {"config_name": "en-pt", "sample_row": "{\"translation.en\": \"\\\"Inequalities in self-rated health: an analysis of...\", \"translation.pt\": \"\\\"ERRATA\\\"\"}", "columns": ["translation_en", "translation_pt"], "columns_mapping": {"translation.en": "translation_en", "translation.pt": "translation_pt"}, "dataset_description": "\nWMT'16 Biomedical Translation Task - PubMed parallel datasets\nhttp://www.statmt.org/wmt16/biomedical-translation-task.html\n", "dataset_name": "qanastek/WMT-16-PubMed"}, "en-es": {"config_name": "en-es", "sample_row": "{\"translation.en\": \"\\\"Cruising and e-dates: a new context for sexual en...\", \"translation.es\": \"\\\"Cruising y e-citas: un nuevo contexto para los en...\"}", "columns": ["translation_en", "translation_es"], "columns_mapping": {"translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": 
"\nWMT'16 Biomedical Translation Task - PubMed parallel datasets\nhttp://www.statmt.org/wmt16/biomedical-translation-task.html\n", "dataset_name": "qanastek/WMT-16-PubMed"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"translation.en\": \"\\\"Global Health: Where Do Physiotherapy and Rehabil...\", \"translation.fr\": \"\\\"La place des cheveux et des poils dans les rituel...\"}", "columns": ["translation_en", "translation_fr"], "columns_mapping": {"translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "\nWMT'16 Biomedical Translation Task - PubMed parallel datasets\nhttp://www.statmt.org/wmt16/biomedical-translation-task.html\n", "dataset_name": "qanastek/WMT-16-PubMed"}}, "tags": ["task_categories:translation", "annotations_creators:machine-generated", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:extended", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:hu", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:pl", "language:pt", "language:ro", "language:sk", "language:sl", "language:sv"], "is_gated": false}, "qwant/squad_fr": {"dataset_name": "qwant/squad_fr", "description": "SQuAD-fr is a French translated version of the Stanford Question Answering Dataset (SQuAD), the reference corpus to evaluate question answering models' performances in English.\nIt consists of 100K question-answer pairs on 500+ articles derived from the original English dataset and represents a large-scale dataset for closed-domain question answering on factoid questions in French.\nSQuAD-fr serves as a means of data augmentation on FQuAD and PIAF benchmarks, with 90K+ translated training pairs.", "downloads": 125, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"572ed956c246551400ce471c\\\"\", \"title\": 
\"\\\"Transistor\\\"\", \"context\": \"\\\"Un transistor est un dispositif semi-conducteur u...\", \"question\": \"\\\"Quelle est l'utilisation d'un transistor ?\\\"\", \"answers.text\": \"[\\\"amplifier ou commuter les signaux \\\\u00e9lectroni...\", \"answers.answer_start\": \"[61]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "SQuAD-fr is a French translated version of the Stanford Question Answering Dataset (SQuAD), the reference corpus to evaluate question answering models' performances in English.\nIt consists of 100K question-answer pairs on 500+ articles derived from the original English dataset and represents a large-scale dataset for closed-domain question answering on factoid questions in French.\nSQuAD-fr serves as a means of data augmentation on FQuAD and PIAF benchmarks, with 90K+ translated training pairs.\n", "dataset_name": "qwant/squad_fr"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "task_ids:closed-domain-qa", "annotations_creators:machine-generated", "multilinguality:monolingual", "multilinguality:translation", "source_datasets:extended|squad", "language:fr"], "is_gated": false}, "ramybaly/conll2012": {"dataset_name": "ramybaly/conll2012", "description": "The CoNLL-2012 shared task involved predicting coreference in English, Chinese, and Arabic, using the final version, v5.0,\nof the OntoNotes corpus. It was a follow-on to the English-only task organized in 2011. Until the creation of the OntoNotes\ncorpus, resources in this sub-field of language processing were limited to noun phrase coreference, often on a restricted\nset of entities, such as the ACE entities. 
OntoNotes provides a large-scale corpus of general anaphoric coreference not\nrestricted to noun phrases or to a specified set of entity types, and covers multiple languages. OntoNotes also provides\nadditional layers of integrated annotation, capturing additional shallow semantic structure. This paper describes the\nOntoNotes annotation (coreference and other layers) and then describes the parameters of the shared task including the\nformat, pre-processing information, evaluation criteria, and presents and discusses the results achieved by the participating\nsystems. The task of coreference has had a complex evaluation history. Potentially many evaluation conditions, have, in the past,\nmade it difficult to judge the improvement in new algorithms over previously reported results. Having a standard test set\nand standard evaluation parameters, all based on a resource that provides multiple integrated annotation layers (syntactic\nparses, semantic roles, word senses, named entities and coreference) and in multiple languages could support joint modeling\nand help ground and energize ongoing research in the task of entity and event coreference.\nFor more details see https://aclanthology.org/W12-4501.pdf", "downloads": 10, "configs": {"conll2012": {"config_name": "conll2012", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Big\\\", \\\"Managers\\\", \\\"on\\\", \\\"Campus\\\"]\", \"pos_tags\": \"[17, 26, 16, 23]\", \"tags\": \"[0, 0, 0, 0]\"}", "columns": ["id", "tokens", "pos_tags", "tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "tags": "tags"}, "dataset_description": "The CoNLL-2012 shared task involved predicting coreference in English, Chinese, and Arabic, using the final version, v5.0,\nof the OntoNotes corpus. It was a follow-on to the English-only task organized in 2011. 
Until the creation of the OntoNotes\ncorpus, resources in this sub-field of language processing were limited to noun phrase coreference, often on a restricted\nset of entities, such as the ACE entities. OntoNotes provides a large-scale corpus of general anaphoric coreference not\nrestricted to noun phrases or to a specified set of entity types, and covers multiple languages. OntoNotes also provides\nadditional layers of integrated annotation, capturing additional shallow semantic structure. This paper describes the\nOntoNotes annotation (coreference and other layers) and then describes the parameters of the shared task including the\nformat, pre-processing information, evaluation criteria, and presents and discusses the results achieved by the participating\nsystems. The task of coreference has had a complex evaluation history. Potentially many evaluation conditions, have, in the past,\nmade it difficult to judge the improvement in new algorithms over previously reported results. Having a standard test set\nand standard evaluation parameters, all based on a resource that provides multiple integrated annotation layers (syntactic\nparses, semantic roles, word senses, named entities and coreference) and in multiple languages could support joint modeling\nand help ground and energize ongoing research in the task of entity and event coreference.\nFor more details see https://aclanthology.org/W12-4501.pdf\n", "dataset_name": "ramybaly/conll2012"}}, "tags": [], "is_gated": false}, "sagteam/author_profiling": {"dataset_name": "sagteam/author_profiling", "description": "he corpus for the author profiling analysis contains texts in Russian-language which labeled for 5 tasks:\n1) gender -- 13530 texts with the labels, who wrote this: text female or male;\n2) age -- 13530 texts with the labels, how old the person who wrote the text. This is a number from 12 to 80. 
In addition, for the classification task we added 5 age groups: 1-19; 20-29; 30-39; 40-49; 50+;\n3) age imitation -- 7574 texts, where crowdsource authors is asked to write three texts: \n a) in their natural manner, \n b) imitating the style of someone younger, \n c) imitating the style of someone older;\n4) gender imitation -- 5956 texts, where the crowdsource authors is asked to write texts: in their origin gender and pretending to be the opposite gender;\n5) style imitation -- 5956 texts, where crowdsource authors is asked to write a text on behalf of another person of your own gender, with a distortion of the authors usual style.", "downloads": 14, "configs": {"main": {"config_name": "main", "sample_row": "{\"id\": \"\\\"crowdsource_4\\\"\", \"text\": \"\\\"\\\\u0437\\\\u0434\\\\u0440\\\\u0430\\\\u0432\\\\u0441\\\\u0442\\\\u0432\\\\...\", \"account_id\": \"\\\"account_#1009\\\"\", \"author_id\": \"2\", \"age\": \"21\", \"age_group\": \"\\\"20-29\\\"\", \"gender\": \"\\\"male\\\"\", \"no_imitation\": \"\\\"no_any_imitation\\\"\", \"age_imitation\": \"\\\"None\\\"\", \"gender_imitation\": \"\\\"no_gender_imitation\\\"\", \"style_imitation\": \"\\\"no_style_imitation\\\"\"}", "columns": ["id", "text", "account_id", "author_id", "age", "age_group", "gender", "no_imitation", "age_imitation", "gender_imitation", "style_imitation"], "columns_mapping": {"id": "id", "text": "text", "account_id": "account_id", "author_id": "author_id", "age": "age", "age_group": "age_group", "gender": "gender", "no_imitation": "no_imitation", "age_imitation": "age_imitation", "gender_imitation": "gender_imitation", "style_imitation": "style_imitation"}, "dataset_description": "he corpus for the author profiling analysis contains texts in Russian-language which labeled for 5 tasks:\n1) gender -- 13530 texts with the labels, who wrote this: text female or male;\n2) age -- 13530 texts with the labels, how old the person who wrote the text. This is a number from 12 to 80. 
In addition, for the classification task we added 5 age groups: 1-19; 20-29; 30-39; 40-49; 50+;\n3) age imitation -- 7574 texts, where crowdsource authors is asked to write three texts: \n a) in their natural manner, \n b) imitating the style of someone younger, \n c) imitating the style of someone older;\n4) gender imitation -- 5956 texts, where the crowdsource authors is asked to write texts: in their origin gender and pretending to be the opposite gender;\n5) style imitation -- 5956 texts, where crowdsource authors is asked to write a text on behalf of another person of your own gender, with a distortion of the authors usual style.\n", "dataset_name": "sagteam/author_profiling"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ru"], "is_gated": false}, "toloka/VoxDIY-RusNews": {"dataset_name": "toloka/VoxDIY-RusNews", "description": "VoxDIY: Benchmark Dataset for Russian Crowdsourced Audio Transcription.", "downloads": 29, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"task\": \"\\\"https://tlk.s3.yandex.net/annotation_tasks/russia...\", \"transcriptions\": \"\\\"\\\\u044d\\\\u0442\\\\u043e \\\\u0432\\\\u0438\\\\u0434\\\\u0438\\\\u043c...\", \"performers\": \"\\\"8 | 3200 | 3058 | 2702 | 2763 | 953 | 1573\\\"\", \"gt\": \"\\\"\\\\u044d\\\\u0442\\\\u043e \\\\u0432\\\\u0438\\\\u0434\\\\u0438\\\\u043c...\"}", "columns": ["task", "transcriptions", "performers", "gt"], "columns_mapping": {"task": "task", "transcriptions": "transcriptions", "performers": "performers", "gt": "gt"}, "dataset_description": "VoxDIY: Benchmark Dataset for Russian Crowdsourced Audio Transcription.\n", "dataset_name": "toloka/VoxDIY-RusNews"}}, "tags": ["task_categories:summarization", "task_categories:automatic-speech-recognition", "task_categories:text2text-generation", 
"annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:ru", "conditional-text-generation", "stuctured-to-text", "speech-recognition"], "is_gated": false}, "usc-isi/WikiConvert": {"dataset_name": "usc-isi/WikiConvert", "description": "Language Modelling with Cardinal Number Annotations.", "downloads": 93, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"0\", \"UNIQUE_STORY_INDEX\": \"\\\"0\\\"\", \"offset\": \"16\", \"length\": \"4\", \"magnitude\": \"0\", \"comment\": \"\\\"With a total of 1500 miles of inland waterways, A...\", \"number\": \"1500\"}", "columns": ["id", "UNIQUE_STORY_INDEX", "offset", "length", "magnitude", "comment", "number"], "columns_mapping": {"id": "id", "UNIQUE_STORY_INDEX": "UNIQUE_STORY_INDEX", "offset": "offset", "length": "length", "magnitude": "magnitude", "comment": "comment", "number": "number"}, "dataset_description": "Language Modelling with Cardinal Number Annotations.\n", "dataset_name": "usc-isi/WikiConvert"}}, "tags": ["task_categories:fill-mask", "task_categories:other", "task_categories:text-generation", "task_ids:language-modeling", "task_ids:masked-language-modeling", "multilinguality:monolingual", "source_datasets:extended|wikipedia", "language:en", "numeracy", "natural-language-understanding", "tokenization"], "is_gated": false}, "w11wo/imdb-javanese": {"dataset_name": "w11wo/imdb-javanese", "description": "Large Movie Review Dataset translated to Javanese.\r\nThis is a dataset for binary sentiment classification containing substantially\r\nmore data than previous benchmark datasets. We provide a set of 25,000 highly\r\npolar movie reviews for training, and 25,000 for testing. There is additional\r\nunlabeled data for use as well. 
We translated the original IMDB Dataset to\r\nJavanese using the multi-lingual MarianMT Transformer model from\r\n`Helsinki-NLP/opus-mt-en-mul`.", "downloads": 11, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"Bromwell High's komedia kom\\\\u00e9dia. Kuwi mlaku ...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "\nLarge Movie Review Dataset translated to Javanese.\nThis is a dataset for binary sentiment classification containing substantially\nmore data than previous benchmark datasets. We provide a set of 25,000 highly\npolar movie reviews for training, and 25,000 for testing. There is additional\nunlabeled data for use as well. We translated the original IMDB Dataset to\nJavanese using the multi-lingual MarianMT Transformer model from\n`Helsinki-NLP/opus-mt-en-mul`. \n", "dataset_name": "w11wo/imdb-javanese"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:jv"], "is_gated": false}, "yhavinga/mc4_nl_cleaned": {"dataset_name": "yhavinga/mc4_nl_cleaned", "description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.", "downloads": 322, "configs": {"micro": {"config_name": "micro", "sample_row": "{\"text\": \"\\\"Japanse bedrijven zijn niet alleen hondstrouw aan...\", \"timestamp\": \"\\\"2019-02-22T15:37:25Z\\\"\", \"url\": \"\\\"https://ondernemingen.bnpparibasfortis.be/nl/arti...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, 
"dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "tiny": {"config_name": "tiny", "sample_row": "{\"text\": \"\\\"De Engelstalige databank van ChemExper kan rechts...\", \"timestamp\": \"\\\"2017-03-24T00:03:21Z\\\"\", \"url\": \"\\\"http://gevaarlijkestoffen.be/databank/externedb.h...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "small": {"config_name": "small", "sample_row": "{\"text\": \"\\\"De gemeente Maassluis heeft een verzekering afges...\", \"timestamp\": \"\\\"2018-05-22T11:47:36Z\\\"\", \"url\": \"\\\"https://www.prinshendrik-maassluis.nl/vrijwillige...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "medium": 
{"config_name": "medium", "sample_row": "{\"text\": \"\\\"Joseph Beuys - Museum Schloss Moyland Zur Suche.Z...\", \"timestamp\": \"\\\"2017-06-23T01:44:23Z\\\"\", \"url\": \"\\\"http://www.moyland.de/nl/tentoonstellingen/joseph...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "large": {"config_name": "large", "sample_row": "{\"text\": \"\\\"1. Wil ING het Duitse Commerzbank overnemen?\\\\nVol...\", \"timestamp\": \"\\\"2019-08-23T00:08:17Z\\\"\", \"url\": \"\\\"https://www.mt.nl/nieuws/7-van-mt/wil-ing-commerz...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "full": {"config_name": "full", "sample_row": "{\"text\": \"\\\"Het is vrijdag avond half 6 en ik ga onderweg van...\", \"timestamp\": \"\\\"2020-06-04T16:28:02Z\\\"\", \"url\": \"\\\"https://sandertuinhof.com/maurtenclinic/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the 
multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "micro_en_nl": {"config_name": "micro_en_nl", "sample_row": "{\"text\": \"\\\"Japanse bedrijven zijn niet alleen hondstrouw aan...\", \"timestamp\": \"\\\"2019-02-22T15:37:25Z\\\"\", \"url\": \"\\\"https://ondernemingen.bnpparibasfortis.be/nl/arti...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "tiny_en_nl": {"config_name": "tiny_en_nl", "sample_row": "{\"text\": \"\\\"IBB owner Shay Geyer greets guests at the IBB Day...\", \"timestamp\": \"\\\"2019-04-24T03:59:15Z\\\"\", \"url\": \"\\\"https://candysdirt.com/category/stage-me/page/2/\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "small_en_nl": {"config_name": "small_en_nl", "sample_row": "{\"text\": 
\"\\\"Deze milkshake is ontzettend vezelrijk door de ba...\", \"timestamp\": \"\\\"2017-03-30T12:41:30Z\\\"\", \"url\": \"\\\"http://perfecthousewife2b.nl/gezonde-chocolade-mi...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "medium_en_nl": {"config_name": "medium_en_nl", "sample_row": "{\"text\": \"\\\"Tja je weet wel hoe dat gaat.. op een dag ben je ...\", \"timestamp\": \"\\\"2018-07-21T19:25:07Z\\\"\", \"url\": \"\\\"http://alle-mooie-dingen.blogspot.com/2011/07/\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "large_en_nl": {"config_name": "large_en_nl", "sample_row": "{\"text\": \"\\\"N.B. 
Automatische incasso is een voorwaarde om li...\", \"timestamp\": \"\\\"2018-02-23T02:16:17Z\\\"\", \"url\": \"\\\"https://www.wsv-vada.nl/aanmelden\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "full_en_nl": {"config_name": "full_en_nl", "sample_row": "{\"text\": \"\\\"Charles is instrumental in the ESG assessment and...\", \"timestamp\": \"\\\"2019-04-26T04:32:46Z\\\"\", \"url\": \"\\\"https://riaconference.ca/speaker/charles-van-thie...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "multilinguality:en-nl", "source_datasets:extended", "language:nl", "language:en"], "is_gated": false}, "yuanchuan/annotated_reference_strings": {"dataset_name": "yuanchuan/annotated_reference_strings", "description": "A repository of reference strings annotated using CSL processor using citations obtained from various sources.", "downloads": 12, "configs": {"default": 
{"config_name": "default", "sample_row": "{\"source\": \"\\\"crossref\\\"\", \"lang\": \"\\\"en\\\"\", \"entry_type\": \"\\\"article\\\"\", \"doi_prefix\": \"\\\"10.1021\\\"\", \"csl_style\": \"\\\"nature\\\"\", \"content\": \"\\\"1. She...\"}", "columns": ["source", "lang", "entry_type", "doi_prefix", "csl_style", "content"], "columns_mapping": {"source": "source", "lang": "lang", "entry_type": "entry_type", "doi_prefix": "doi_prefix", "csl_style": "csl_style", "content": "content"}, "dataset_description": "A repository of reference strings annotated using CSL processor using citations obtained from various sources.\n", "dataset_name": "yuanchuan/annotated_reference_strings"}}, "tags": ["task_categories:token-classification", "task_ids:parsing", "annotations_creators:other", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "nlpaueb/finer-139": {"dataset_name": "nlpaueb/finer-139", "description": "FiNER-139 is a named entity recognition dataset consisting of 10K annual \nand quarterly English reports (filings) of publicly traded companies \ndownloaded from the U.S. Securities and Exchange Commission (SEC) \nannotated with 139 XBRL tags in the IOB2 format.", "downloads": 213, "configs": {"finer-139": {"config_name": "finer-139", "sample_row": "{\"id\": \"0\", \"tokens\": \"[\\\"ITEM\\\", \\\"1\\\", \\\"Financial\\\", \\\"Statements\\\", \\\"Lennar\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nFiNER-139 is a named entity recognition dataset consisting of 10K annual \nand quarterly English reports (filings) of publicly traded companies \ndownloaded from the U.S. 
Securities and Exchange Commission (SEC) \nannotated with 139 XBRL tags in the IOB2 format.\n", "dataset_name": "nlpaueb/finer-139"}}, "tags": ["task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "language:en"], "is_gated": false}, "ruanchaves/snap": {"dataset_name": "ruanchaves/snap", "description": "Automatically segmented 803K SNAP Twitter Data Set hashtags with the heuristic described in the paper \"Segmenting hashtags using automatically created training data\".", "downloads": 10, "configs": {"default": {"config_name": "default", "sample_row": "{\"index\": \"0\", \"hashtag\": \"\\\"BrandThunder\\\"\", \"segmentation\": \"\\\"Brand Thunder\\\"\"}", "columns": ["index", "hashtag", "segmentation"], "columns_mapping": {"index": "index", "hashtag": "hashtag", "segmentation": "segmentation"}, "dataset_description": "\nAutomatically segmented 803K SNAP Twitter Data Set hashtags with the heuristic described in the paper \"Segmenting hashtags using automatically created training data\".\n", "dataset_name": "ruanchaves/snap"}}, "tags": ["annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "word-segmentation"], "is_gated": false}, "CLUTRR/v1": {"dataset_name": "CLUTRR/v1", "description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.", "downloads": 1686, "configs": {"gen_train23_test2to10": {"config_name": "gen_train23_test2to10", "sample_row": "{\"id\": \"\\\"f4161421-bf6e-4165-9133-07f1dcc4c87e\\\"\", \"story\": \"\\\"[Dorothy]'s brother [Michael] and her went to get...\", \"query\": \"\\\"('Donald', 'Dorothy')\\\"\", \"target\": \"0\", \"target_text\": \"\\\"aunt\\\"\", \"clean_story\": \"\\\"[Michael] is the proud father of the 
lovely [Dona...\", \"proof_state\": \"\\\"[{('Donald', 'aunt', 'Dorothy'): [('Donald', 'fat...\", \"f_comb\": \"\\\"father-sister\\\"\", \"task_name\": \"\\\"task_1.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2)]\\\"\", \"edge_types\": \"\\\"['father', 'sister']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"Donald:male,Michael:male,Dorothy:female\\\"\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}, "gen_train234_test2to10": {"config_name": "gen_train234_test2to10", "sample_row": "{\"id\": \"\\\"0fc660c1-e7d5-41fb-8d72-d2beb2c8d2ef\\\"\", \"story\": \"\\\"[Ashley]'s daughter, [Lillian], asked her mom to ...\", \"query\": \"\\\"('Ashley', 'Nicholas')\\\"\", \"target\": \"15\", \"target_text\": \"\\\"son\\\"\", \"clean_story\": \"\\\"[Ashley]'s daughter, [Lillian], asked her mom to ...\", \"proof_state\": \"\\\"[{('Ashley', 'son', 'Nicholas'): [('Ashley', 'dau...\", \"f_comb\": \"\\\"daughter-brother\\\"\", \"task_name\": \"\\\"task_1.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2)]\\\"\", \"edge_types\": \"\\\"['daughter', 'brother']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": 
\"\\\"Ashley:female,Lillian:female,Nicholas:male\\\"\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}, "rob_train_clean_23_test_all_23": {"config_name": "rob_train_clean_23_test_all_23", "sample_row": "{\"id\": \"\\\"00eabf71-62a0-446b-9c7d-df3896af0eb2\\\"\", \"story\": \"\\\"[Herman] asked his son, [James], to go grocery sh...\", \"query\": \"\\\"('Herman', 'Rosalee')\\\"\", \"target\": \"9\", \"target_text\": \"\\\"daughter-in-law\\\"\", \"clean_story\": \"\\\"[Herman] asked his son, [James], to go grocery sh...\", \"proof_state\": \"\\\"[{('Herman', 'daughter-in-law', 'Rosalee'): [('He...\", \"f_comb\": \"\\\"son-wife\\\"\", \"task_name\": \"\\\"task_1.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2)]\\\"\", \"edge_types\": \"\\\"['son', 'wife']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"Herman:male,James:male,Rosalee:female\\\"\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", 
"query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}, "rob_train_disc_23_test_all_23": {"config_name": "rob_train_disc_23_test_all_23", "sample_row": "{\"id\": \"\\\"e37f6b88-e6e6-403d-a9b3-efb50c2a2044\\\"\", \"story\": \"\\\"[Kathleen] and her son-in-law [William] went to v...\", \"query\": \"\\\"('James', 'John')\\\"\", \"target\": \"3\", \"target_text\": \"\\\"brother\\\"\", \"clean_story\": \"\\\"[Kathryn] likes baking brownies for her son [John...\", \"proof_state\": \"\\\"[{('James', 'brother', 'John'): [('James', 'mothe...\", \"f_comb\": \"\\\"mother-son\\\"\", \"task_name\": \"\\\"task_4.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2), (3, 4)]\\\"\", \"edge_types\": \"\\\"['mother', 'son']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"James:male,Kathryn:female,John:male,Kathleen:fema...\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, 
"dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}, "rob_train_irr_23_test_all_23": {"config_name": "rob_train_irr_23_test_all_23", "sample_row": "{\"id\": \"\\\"3e868d06-78e4-47ec-9272-4538d95214d4\\\"\", \"story\": \"\\\"[Brian] is one of [Geraldine]'s brothers. They ha...\", \"query\": \"\\\"('Geraldine', 'Nancy')\\\"\", \"target\": \"0\", \"target_text\": \"\\\"aunt\\\"\", \"clean_story\": \"\\\"[Preston] took his daughter [Geraldine] to ballet...\", \"proof_state\": \"\\\"[{('Geraldine', 'aunt', 'Nancy'): [('Geraldine', ...\", \"f_comb\": \"\\\"father-sister\\\"\", \"task_name\": \"\\\"task_3.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2), (0, 3)]\\\"\", \"edge_types\": \"\\\"['father', 'sister']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"Geraldine:female,Preston:male,Nancy:female,Brian:...\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": 
"CLUTRR/v1"}, "rob_train_sup_23_test_all_23": {"config_name": "rob_train_sup_23_test_all_23", "sample_row": "{\"id\": \"\\\"2ece8482-1db7-4395-9cb1-fa472b041e7d\\\"\", \"story\": \"\\\"[Darnell] loved his mother, [Theresa]. [Theresa] ...\", \"query\": \"\\\"('Amanda', 'Michelle')\\\"\", \"target\": \"4\", \"target_text\": \"\\\"sister\\\"\", \"clean_story\": \"\\\"[Theresa] was so proud of her daughter [Amanda] f...\", \"proof_state\": \"\\\"[{('Amanda', 'sister', 'Michelle'): [('Amanda', '...\", \"f_comb\": \"\\\"mother-daughter\\\"\", \"task_name\": \"\\\"task_2.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2), (1, 3), (3, 2)]\\\"\", \"edge_types\": \"\\\"['mother', 'daughter']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"Amanda:female,Theresa:female,Michelle:female,Darn...\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "ai4bharat/IndicHeadlineGeneration": {"dataset_name": "ai4bharat/IndicHeadlineGeneration", "description": "This is the new headline generation dataset released as part of IndicNLG Suite. 
Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.", "downloads": 33, "configs": {"as": {"config_name": "as", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u09ac\\\\u09bf\\\\u09b7\\\\u09df \\\\u09a8\\\\u09bf\\\\u09b0\\\\u09cd...\", \"target\": \"\\\"\\\\u09a8\\\\u09bf\\\\u09b0\\\\u09cd\\\\u09ac\\\\u09be\\\\u099a\\\\u09a8 ...\", \"url\": \"\\\"https://bengali.oneindia.com/topic/%E0%A6%A8%E0%A...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "bn": {"config_name": "bn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u099c \\\\u098f.\\\\u098f\\\\u099b.\\\\u098f\\\\u09a8. \\\\u09aa\\\\u...\", \"target\": \"\\\"\\\\u09ac\\\\u0999\\\\u09be\\\\u0987\\\\u0997\\\\u09be\\\\u0981\\\\u09f1\\\\...\", \"url\": \"\\\"https://www.newsasn.com/index.php/node/3523\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. 
The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "gu": {"config_name": "gu", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0a9c\\\\u0abe\\\\u0a82\\\\u0aac\\\\u0ac1\\\\u0aa1\\\\u0ac0\\\\u0aaf\\\\...\", \"target\": \"\\\"\\\\u0aae\\\\u0acb\\\\u0ab0\\\\u0aac\\\\u0ac0\\\\u0aa8\\\\u0abe \\\\u0ab8...\", \"url\": \"\\\"http://abtakmedia.com/seven-villages-of-morbi-are...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0915\\\\u0928\\\\u093e\\\\u0921\\\\u093e \\\\u0905\\\\u092e\\\\u0947...\", \"target\": \"\\\"\\\\u0915\\\\u0928\\\\u093e\\\\u0921\\\\u093e \\\\u0908\\\\u0930\\\\u093e...\", \"url\": \"null\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. 
The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "kn": {"config_name": "kn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0cb0\\\\u0cbe\\\\u0cb7\\\\u0ccd\\\\u0c9f\\\\u0ccd\\\\u0cb0\\\\u0cc0\\\\...\", \"target\": \"\\\"\\\\u0cb0\\\\u0cab\\\\u0cc7\\\\u0cb2\\\\u0ccd \\\\u0c96\\\\u0cb0\\\\u0cc0...\", \"url\": \"\\\"https://vknews.in/352131/?responsive=false\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "ml": {"config_name": "ml", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0d24\\\\u0d3f\\\\u0d30\\\\u0d41\\\\u0d35\\\\u0d28\\\\u0d28\\\\u0d4d\\\\...\", \"target\": \"\\\"\\\\u0d2a\\\\u0d3e\\\\u0d32\\\\u0d41\\\\u0d02 \\\\u0d15\\\\u0d1f\\\\u0d4d...\", \"url\": \"\\\"https://malayalam.oneindia.com/news/kerala/hotels...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. 
The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "mr": {"config_name": "mr", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0909\\\\u092a\\\\u092e\\\\u0941\\\\u0916\\\\u094d\\\\u092f\\\\u092e\\\\...\", \"target\": \"\\\"\\\\u092e\\\\u0924\\\\u092d\\\\u093f\\\\u0928\\\\u094d\\\\u0928\\\\u0924\\\\...\", \"url\": \"\\\"https://www.dainikprabhat.com/despite-differences...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "or": {"config_name": "or", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0b13\\\\u0b21\\\\u0b3f\\\\u0b36\\\\u0b3e , \\\\u0b30\\\\u0b3e\\\\u0b...\", \"target\": \"\\\"\\\\u0b2a\\\\u0b3e\\\\u0b1f\\\\u0b15\\\\u0b41\\\\u0b30\\\\u0b3e\\\\u0b30\\\\...\", \"url\": \"\\\"http://utkalexpress.in/?p=23750\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. 
The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "pa": {"config_name": "pa", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0a28\\\\u0a4b\\\\u0a2c\\\\u0a47\\\\u0a32 \\\\u0a2a\\\\u0a41\\\\u0a30...\", \"target\": \"\\\"\\\\u0a38\\\\u0a3f\\\\u0a39\\\\u0a24 \\\\u0a2e\\\\u0a70\\\\u0a24\\\\u0a30...\", \"url\": \"\\\"https://newsnumber.com/news/story/157908\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "ta": {"config_name": "ta", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0ba4\\\\u0bc2\\\\u0ba4\\\\u0bcd\\\\u0ba4\\\\u0bc1\\\\u0b95\\\\u0bcd\\\\...\", \"target\": \"\\\"\\\\u0b85\\\\u0bae\\\\u0bc8\\\\u0b9a\\\\u0bcd\\\\u0b9a\\\\u0bb0\\\\u0bcd ...\", \"url\": \"\\\"https://tamil.oneindia.com/news/tuticorin/manikar...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. 
The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "te": {"config_name": "te", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0c4d\\\\u0c30\\\\u0c35\\\\u0c30\\\\u0c3f 23, 2019 174 \\\\u0c0...\", \"target\": \"\\\"\\\\u0c2e\\\\u0c39\\\\u0c3e\\\\u0c28\\\\u0c3e\\\\u0c2f\\\\u0c15\\\\u0c41\\\\...\", \"url\": \"\\\"https://manalokam.com/cinema/ntr-mahanayakudu-fir...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}}, "tags": ["annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original for Hindi, and modified [IndicGLUE](https://indicnlp.ai4bharat.org/indic-glue/) for other languages.", "language:as", "language:bn", "language:gu", "language:hi", "language:kn", "language:ml", "language:mr", "language:or", "language:pa", "language:ta", "language:te"], "is_gated": false}, "ai4bharat/IndicSentenceSummarization": {"dataset_name": "ai4bharat/IndicSentenceSummarization", "description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. 
The total\nsize of the dataset is 431K.", "downloads": 77, "configs": {"as": {"config_name": "as", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u09b2\\\\u0996\\\\u09a8\\\\u0989: \\\\u0989\\\\u09a4\\\\u09cd\\\\u09a...\", \"target\": \"\\\"\\\\u09ee\\\\u09ec \\\\u09b2\\\\u0995\\\\u09cd\\\\u09b7 \\\\u0995\\\\u09c...\", \"url\": \"\\\"https://bengali.abplive.com/news/nation/yogi-adit...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "bn": {"config_name": "bn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u099c \\\\u098f.\\\\u098f\\\\u099b.\\\\u098f\\\\u09a8. \\\\u09aa\\\\u...\", \"target\": \"\\\"\\\\u09ac\\\\u0999\\\\u09be\\\\u0987\\\\u0997\\\\u09be\\\\u0981\\\\u09f1\\\\...\", \"url\": \"\\\"https://www.newsasn.com/index.php/node/3523\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. 
The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "gu": {"config_name": "gu", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0a9c\\\\u0abe\\\\u0a82\\\\u0aac\\\\u0ac1\\\\u0aa1\\\\u0ac0\\\\u0aaf\\\\...\", \"target\": \"\\\"\\\\u0aae\\\\u0acb\\\\u0ab0\\\\u0aac\\\\u0ac0\\\\u0aa8\\\\u0abe \\\\u0ab8...\", \"url\": \"\\\"http://abtakmedia.com/seven-villages-of-morbi-are...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0915\\\\u0928\\\\u093e\\\\u0921\\\\u093e \\\\u0905\\\\u092e\\\\u0947...\", \"target\": \"\\\"\\\\u0915\\\\u0928\\\\u093e\\\\u0921\\\\u093e \\\\u0908\\\\u0930\\\\u093e...\", \"url\": \"null\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. 
The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "kn": {"config_name": "kn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0cb0\\\\u0cbe\\\\u0cb7\\\\u0ccd\\\\u0c9f\\\\u0ccd\\\\u0cb0\\\\u0cc0\\\\...\", \"target\": \"\\\"\\\\u0cb0\\\\u0cab\\\\u0cc7\\\\u0cb2\\\\u0ccd \\\\u0c96\\\\u0cb0\\\\u0cc0...\", \"url\": \"\\\"https://vknews.in/352131/?responsive=false\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "ml": {"config_name": "ml", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0d24\\\\u0d3f\\\\u0d30\\\\u0d41\\\\u0d35\\\\u0d28\\\\u0d28\\\\u0d4d\\\\...\", \"target\": \"\\\"\\\\u0d2a\\\\u0d3e\\\\u0d32\\\\u0d41\\\\u0d02 \\\\u0d15\\\\u0d1f\\\\u0d4d...\", \"url\": \"\\\"https://malayalam.oneindia.com/news/kerala/hotels...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. 
The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "mr": {"config_name": "mr", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0909\\\\u092a\\\\u092e\\\\u0941\\\\u0916\\\\u094d\\\\u092f\\\\u092e\\\\...\", \"target\": \"\\\"\\\\u092e\\\\u0924\\\\u092d\\\\u093f\\\\u0928\\\\u094d\\\\u0928\\\\u0924\\\\...\", \"url\": \"\\\"https://www.dainikprabhat.com/despite-differences...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "or": {"config_name": "or", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0b13\\\\u0b21\\\\u0b3f\\\\u0b36\\\\u0b3e , \\\\u0b30\\\\u0b3e\\\\u0b...\", \"target\": \"\\\"\\\\u0b2a\\\\u0b3e\\\\u0b1f\\\\u0b15\\\\u0b41\\\\u0b30\\\\u0b3e\\\\u0b30\\\\...\", \"url\": \"\\\"http://utkalexpress.in/?p=23750\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. 
The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "pa": {"config_name": "pa", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0a28\\\\u0a4b\\\\u0a2c\\\\u0a47\\\\u0a32 \\\\u0a2a\\\\u0a41\\\\u0a30...\", \"target\": \"\\\"\\\\u0a38\\\\u0a3f\\\\u0a39\\\\u0a24 \\\\u0a2e\\\\u0a70\\\\u0a24\\\\u0a30...\", \"url\": \"\\\"https://newsnumber.com/news/story/157908\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "ta": {"config_name": "ta", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0ba4\\\\u0bc2\\\\u0ba4\\\\u0bcd\\\\u0ba4\\\\u0bc1\\\\u0b95\\\\u0bcd\\\\...\", \"target\": \"\\\"\\\\u0b85\\\\u0bae\\\\u0bc8\\\\u0b9a\\\\u0bcd\\\\u0b9a\\\\u0bb0\\\\u0bcd ...\", \"url\": \"\\\"https://tamil.oneindia.com/news/tuticorin/manikar...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. 
The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "te": {"config_name": "te", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0c4d\\\\u0c30\\\\u0c35\\\\u0c30\\\\u0c3f 23, 2019 174 \\\\u0c0...\", \"target\": \"\\\"\\\\u0c2e\\\\u0c39\\\\u0c3e\\\\u0c28\\\\u0c3e\\\\u0c2f\\\\u0c15\\\\u0c41\\\\...\", \"url\": \"\\\"https://manalokam.com/cinema/ntr-mahanayakudu-fir...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}}, "tags": ["annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original for Hindi, and modified [IndicGLUE](https://indicnlp.ai4bharat.org/indic-glue/) for other languages.", "language:as", "language:bn", "language:gu", "language:hi", "language:kn", "language:ml", "language:mr", "language:or", "language:pa", "language:ta", "language:te"], "is_gated": false}, "ai4bharat/IndicWikiBio": {"dataset_name": "ai4bharat/IndicWikiBio", "description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. 
The total\nsize of the dataset is 57,426.", "downloads": 52, "configs": {"as": {"config_name": "as", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u09b6\\\\u09f0\\\\u09ce\\\\u099a\\\\u09a8\\\\u09cd\\\\u09a6...\", \"serialized_infobox\": \"\\\" name \\\\u09b6\\\\u09f0\\\\u09ce\\\\u099a\\\\u09a8\\\\...\", \"summary\": \"\\\"\\\\u09b6\\\\u09f0\\\\u09ce\\\\u099a\\\\u09a8\\\\u09cd\\\\u09a6\\\\u09cd\\\\...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "bn": {"config_name": "bn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"bgcolour_1:#\\\\tbgcolour_2:6495ED\\\\tname_1:\\\\u0987\\\\u0...\", \"serialized_infobox\": \"\\\" bgcolour # 6495ED name ...\", \"summary\": \"\\\"\\\\u0987\\\\u09ae\\\\u09a6\\\\u09be\\\\u09a6 \\\\u09b9\\\\u09cb\\\\u09b8...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. 
The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0905\\\\u091c\\\\u092e\\\\u0947\\\\u0930\\\\tname_2:\\\\u0...\", \"serialized_infobox\": \"\\\" name \\\\u0905\\\\u091c\\\\u092e\\\\u0947\\\\u0930 ...\", \"summary\": \"\\\"\\\\u0905\\\\u091c\\\\u092e\\\\u0947\\\\u0930 \\\\u0930\\\\u094b\\\\u0921...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "kn": {"config_name": "kn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0c86\\\\u0cb0\\\\u0ccd\\\\tname_2:.\\\\tname_3:\\\\u0c8...\", \"serialized_infobox\": \"\\\" name \\\\u0c86\\\\u0cb0\\\\u0ccd . \\\\u0c8e\\\\u0c...\", \"summary\": \"\\\"\\\\u0c86\\\\u0cb0\\\\u0ccd. \\\\u0c8e\\\\u0ca8\\\\u0ccd. \\\\u0c9c\\\\u0...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. 
The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "ml": {"config_name": "ml", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0d35\\\\u0d3e\\\\u0d38\\\\u0d4d\\\\u0d35\\\\u0d4b\\\\tname...\", \"serialized_infobox\": \"\\\" name \\\\u0d35\\\\u0d3e\\\\u0d38\\\\u0d4d\\\\u0d35\\\\...\", \"summary\": \"\\\"\\\\u0d05\\\\u0d2e\\\\u0d47\\\\u0d30\\\\u0d3f\\\\u0d15\\\\u0d4d\\\\u0d15\\\\...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "or": {"config_name": "or", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0b38\\\\u0b4b\\\\u0b28\\\\u0b3f\\\\u0b15\\\\u0b3e\\\\tname...\", \"serialized_infobox\": \"\\\" name \\\\u0b38\\\\u0b4b\\\\u0b28\\\\u0b3f\\\\u0b15\\\\...\", \"summary\": \"\\\"\\\\u0b38\\\\u0b4b\\\\u0b28\\\\u0b3f\\\\u0b15\\\\u0b3e \\\\u0b30\\\\u0b3e...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. 
The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "pa": {"config_name": "pa", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0a38\\\\u0a42\\\\u0a30\\\\u0a1c\\\\tname_2:\\\\u0a2a\\\\u0...\", \"serialized_infobox\": \"\\\" name \\\\u0a38\\\\u0a42\\\\u0a30\\\\u0a1c \\\\u0a2a...\", \"summary\": \"\\\"\\\\u0a38\\\\u0a42\\\\u0a30\\\\u0a1c \\\\u0a2a\\\\u0a3e\\\\u0a32\\\\u0a40...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "ta": {"config_name": "ta", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0ba8\\\\u0b9e\\\\u0bcd\\\\u0b9a\\\\u0bb0\\\\u0bbe\\\\u0b9a...\", \"serialized_infobox\": \"\\\" name \\\\u0ba8\\\\u0b9e\\\\u0bcd\\\\u0b9a\\\\u0bb0\\\\...\", \"summary\": \"\\\"\\\\u0bae\\\\u0b95\\\\u0bbe\\\\u0bb0\\\\u0bbe\\\\u0b9a\\\\u0bbe \\\\u0b9a...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. 
The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "te": {"config_name": "te", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0c2f\\\\u0c02\\\\tname_2:.\\\\tname_3:\\\\u0c2f\\\\u0c0...\", \"serialized_infobox\": \"\\\" name \\\\u0c2f\\\\u0c02 . \\\\u0c2f\\\\u0c02 . \\\\...\", \"summary\": \"\\\"\\\\u0c2f\\\\u0c02. \\\\u0c2f\\\\u0c02. \\\\u0c36\\\\u0c4d\\\\u0c30\\\\u0...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}}, "tags": ["annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:none. Originally generated from www.wikimedia.org.", "language:as", "language:bn", "language:hi", "language:kn", "language:ml", "language:or", "language:pa", "language:ta", "language:te"], "is_gated": false}, "ai4bharat/IndicQuestionGeneration": {"dataset_name": "ai4bharat/IndicQuestionGeneration", "description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. 
\nThe number of examples in each language is 98,027.", "downloads": 37, "configs": {"as": {"config_name": "as", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"\\\\u09e7\\\\u09ee\\\\u09ec\\\\u09e6 \\\\u099a\\\\u09a8\\\\u09a4\\\"\", \"context\": \"\\\"\\\\u09e7\\\\u09ee\\\\u09ec\\\\u09e6 \\\\u099a\\\\u09a8\\\\u09a4 \\\\u099...\", \"question\": \"\\\"\\\\u0995\\\\u09be\\\\u09f0\\\\u09cd\\\\u09b2 \\\\u09b9\\\\u09be\\\\u0987...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "bn": {"config_name": "bn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"\\\\u09e7\\\\u09ee\\\\u09ec\\\\u09e6 \\\\u098f\\\\u09b0 \\\\u09a6\\\\u09b...\", \"context\": \"\\\"\\\\u09e7\\\\u09ee\\\\u09ec\\\\u09e6-\\\\u098f\\\\u09b0 \\\\u09a6\\\\u09b...\", \"question\": \"\\\"\\\\u0995\\\\u09be\\\\u09b0\\\\u09cd\\\\u09b2 \\\\u09b9\\\\u09be\\\\u0987...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. 
We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "gu": {"config_name": "gu", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860 \\\\u0aa8\\\\u0abe \\\\u0aa6\\\\u0abe\\\\u0aaf\\\\u0a95\\\\u0abe\\\\...\", \"context\": \"\\\"1860\\\\u0aa8\\\\u0abe \\\\u0aa6\\\\u0abe\\\\u0aaf\\\\u0a95\\\\u0abe\\\\u...\", \"question\": \"\\\"\\\\u0a95\\\\u0abe\\\\u0ab0\\\\u0acd\\\\u0ab2 \\\\u0ab9\\\\u0ac7\\\\u0aa8...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. 
\nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"\\\\u0967\\\\u096e\\\\u096c\\\\u0966 \\\\u0915\\\\u0947 \\\\u0926\\\\u093...\", \"context\": \"\\\"1860 \\\\u0915\\\\u0947 \\\\u0926\\\\u0936\\\\u0915 \\\\u092e\\\\u0947...\", \"question\": \"\\\"\\\\u0915\\\\u093e\\\\u0930\\\\u094d\\\\u0932 \\\\u0939\\\\u0947\\\\u0928...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "kn": {"config_name": "kn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860 \\\\u0cb0\\\\u0cb2\\\\u0ccd\\\\u0cb2\\\\u0cbf\\\"\", \"context\": \"\\\"1860\\\\u0cb0 \\\\u0ca6\\\\u0cb6\\\\u0c95\\\\u0ca6\\\\u0cb2\\\\u0ccd\\\\u...\", \"question\": \"\\\"\\\\u0c95\\\\u0cbe\\\\u0cb0\\\\u0ccd\\\\u0cb2\\\\u0ccd \\\\u0cb9\\\\u0cc6...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. 
We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "ml": {"config_name": "ml", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860 \\\\u0d15\\\\u0d33\\\\u0d3f\\\\u0d7d\\\"\", \"context\": \"\\\"1860 \\\\u0d15\\\\u0d33\\\\u0d3f\\\\u0d7d \\\\u0d15\\\\u0d3e\\\\u0d7e ...\", \"question\": \"\\\"\\\\u0d15\\\\u0d3e\\\\u0d7e \\\\u0d39\\\\u0d46\\\\u0d7b\\\\u0d31\\\\u0d3f...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. 
\nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "mr": {"config_name": "mr", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"\\\\u0967\\\\u096e\\\\u096c\\\\u0966 \\\\u091a\\\\u094d\\\\u092f\\\\u093e...\", \"context\": \"\\\"\\\\u0967\\\\u096e\\\\u096c\\\\u0966 \\\\u091a\\\\u094d\\\\u092f\\\\u093e...\", \"question\": \"\\\"\\\\u0915\\\\u093e\\\\u0930\\\\u094d\\\\u0932 \\\\u0939\\\\u0947\\\\u0928...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "or": {"config_name": "or", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"\\\\u0b67\\\\u0b6e\\\\u0b6c\\\\u0b66 \\\\u0b2e\\\\u0b38\\\\u0b3f\\\\u0b39...\", \"context\": \"\\\"\\\\u0b67\\\\u0b6e\\\\u0b6c\\\\u0b66 \\\\u0b2e\\\\u0b38\\\\u0b3f\\\\u0b39...\", \"question\": \"\\\"\\\\u0b15\\\\u0b3e\\\\u0b30\\\\u0b4d\\\\u0b32 \\\\u0b39\\\\u0b3e\\\\u0b07...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. 
We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "pa": {"config_name": "pa", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860 \\\\u0a26\\\\u0a47 \\\\u0a26\\\\u0a39\\\\u0a3e\\\\u0a15\\\\u0a47\\\"...\", \"context\": \"\\\"\\\\u0a38\\\\u0a3c\\\\u0a41\\\\u0a30\\\\u0a42\\\\u0a06\\\\u0a24\\\\u0a40 ...\", \"question\": \"\\\"\\\\u0a15\\\\u0a3e\\\\u0a30\\\\u0a32 \\\\u0a39\\\\u0a47\\\\u0a28\\\\u0a30...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. 
\nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "ta": {"config_name": "ta", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860 \\\\u0b95\\\\u0bb3\\\\u0bbf\\\\u0bb2\\\\u0bcd\\\"\", \"context\": \"\\\"1860 \\\\u0b95\\\\u0bb3\\\\u0bbf\\\\u0bb2\\\\u0bcd \\\\u0b95\\\\u0bbe\\\\...\", \"question\": \"\\\"\\\\u0b95\\\\u0bbe\\\\u0bb0\\\\u0bcd\\\\u0bb2\\\\u0bcd \\\\u0bb9\\\\u0bc8...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "te": {"config_name": "te", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860\\\\u0c32\\\\u0c32\\\\u0c4b\\\"\", \"context\": \"\\\"1860\\\\u0c32\\\\u0c32\\\\u0c4b \\\\u0c15\\\\u0c3e\\\\u0c30\\\\u0c4d\\\\u...\", \"question\": \"\\\"\\\\u0c15\\\\u0c3e\\\\u0c30\\\\u0c4d\\\\u0c32\\\\u0c4d \\\\u0c39\\\\u0c46...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. 
We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}}, "tags": ["annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:we start with the SQuAD question answering dataset repurposed to serve as a question generation dataset. We translate this dataset into different Indic languages.", "language:as", "language:bn", "language:gu", "language:hi", "language:kn", "language:ml", "language:mr", "language:or", "language:pa", "language:ta", "language:te"], "is_gated": false}, "ruanchaves/reddit_china": {"dataset_name": "ruanchaves/reddit_china", "description": "Reddit comments with the word 'China' between 2010 and 2022.", "downloads": 106, "configs": {"default": {"config_name": "default", "sample_row": "{\"author\": \"\\\"Grunt08\\\"\", \"author_fullname\": \"\\\"t2_c8dzz\\\"\", \"body\": \"\\\"They also have encrypted radios, but they're made...\", \"created_utc\": \"1646931561\", \"id\": \"\\\"i04kut4\\\"\", \"is_submitter\": \"false\", \"link_id\": \"\\\"t3_tb2v7q\\\"\", \"locked\": \"false\", \"no_follow\": \"true\", \"parent_id\": \"\\\"t1_i04j877\\\"\", \"permalink\": \"\\\"/r/ukraine/comments/tb2v7q/apparently_russian_mil...\", \"retrieved_on\": \"null\", \"score\": \"1\", \"send_replies\": \"true\", \"stickied\": \"false\", \"subreddit\": \"\\\"ukraine\\\"\", \"subreddit_id\": \"\\\"t5_2qqcn\\\"\"}", "columns": ["author", "author_fullname", "body", "created_utc", "id", "is_submitter", "link_id", "locked", "no_follow", "parent_id", "permalink", "retrieved_on", "score", "send_replies", "stickied", "subreddit", "subreddit_id"], "columns_mapping": {"author": "author", "author_fullname": "author_fullname", "body": "body", "created_utc": "created_utc", "id": "id", "is_submitter": 
"is_submitter", "link_id": "link_id", "locked": "locked", "no_follow": "no_follow", "parent_id": "parent_id", "permalink": "permalink", "retrieved_on": "retrieved_on", "score": "score", "send_replies": "send_replies", "stickied": "stickied", "subreddit": "subreddit", "subreddit_id": "subreddit_id"}, "dataset_description": "\nReddit comments with the word 'China' between 2010 and 2022.\n", "dataset_name": "ruanchaves/reddit_china"}}, "tags": [], "is_gated": false}, "wikitablequestions": {"dataset_name": "wikitablequestions", "description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.", "downloads": 1889, "configs": {"random-split-1": {"config_name": "random-split-1", "sample_row": "{\"id\": \"\\\"nt-0\\\"\", \"question\": \"\\\"what was the last year where this team was a part...\", \"answers\": \"[\\\"2004\\\"]\", \"table.header\": \"[\\\"Year\\\", \\\"Division\\\", \\\"League\\\", \\\"Regular Season\\\", \\\"...\", \"table.rows\": \"[[\\\"2001\\\", \\\"2\\\", \\\"USL A-League\\\", \\\"4th, Western\\\", \\\"Qu...\", \"table.name\": \"\\\"csv/204-csv/590.tsv\\\"\"}", "columns": ["id", "question", "answers", "table_header", "table_rows", "table_name"], "columns_mapping": {"id": "id", "question": "question", "answers": "answers", "table.header": "table_header", "table.rows": "table_rows", "table.name": "table_name"}, "dataset_description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.\n", "dataset_name": "wikitablequestions"}, "random-split-2": {"config_name": "random-split-2", "sample_row": "{\"id\": \"\\\"nt-2\\\"\", \"question\": \"\\\"which team won previous to crettyard?\\\"\", \"answers\": \"[\\\"Wolfe Tones\\\"]\", \"table.header\": \"[\\\"Team\\\", \\\"County\\\", \\\"Wins\\\", \\\"Years won\\\"]\", \"table.rows\": \"[[\\\"Greystones\\\", \\\"Wicklow\\\", \\\"1\\\", \\\"2011\\\"], [\\\"Ballymo...\", 
\"table.name\": \"\\\"csv/204-csv/772.tsv\\\"\"}", "columns": ["id", "question", "answers", "table_header", "table_rows", "table_name"], "columns_mapping": {"id": "id", "question": "question", "answers": "answers", "table.header": "table_header", "table.rows": "table_rows", "table.name": "table_name"}, "dataset_description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.\n", "dataset_name": "wikitablequestions"}, "random-split-3": {"config_name": "random-split-3", "sample_row": "{\"id\": \"\\\"nt-0\\\"\", \"question\": \"\\\"what was the last year where this team was a part...\", \"answers\": \"[\\\"2004\\\"]\", \"table.header\": \"[\\\"Year\\\", \\\"Division\\\", \\\"League\\\", \\\"Regular Season\\\", \\\"...\", \"table.rows\": \"[[\\\"2001\\\", \\\"2\\\", \\\"USL A-League\\\", \\\"4th, Western\\\", \\\"Qu...\", \"table.name\": \"\\\"csv/204-csv/590.tsv\\\"\"}", "columns": ["id", "question", "answers", "table_header", "table_rows", "table_name"], "columns_mapping": {"id": "id", "question": "question", "answers": "answers", "table.header": "table_header", "table.rows": "table_rows", "table.name": "table_name"}, "dataset_description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.\n", "dataset_name": "wikitablequestions"}, "random-split-4": {"config_name": "random-split-4", "sample_row": "{\"id\": \"\\\"nt-0\\\"\", \"question\": \"\\\"what was the last year where this team was a part...\", \"answers\": \"[\\\"2004\\\"]\", \"table.header\": \"[\\\"Year\\\", \\\"Division\\\", \\\"League\\\", \\\"Regular Season\\\", \\\"...\", \"table.rows\": \"[[\\\"2001\\\", \\\"2\\\", \\\"USL A-League\\\", \\\"4th, Western\\\", \\\"Qu...\", \"table.name\": \"\\\"csv/204-csv/590.tsv\\\"\"}", "columns": ["id", "question", "answers", "table_header", "table_rows", "table_name"], "columns_mapping": {"id": "id", "question": "question", 
"answers": "answers", "table.header": "table_header", "table.rows": "table_rows", "table.name": "table_name"}, "dataset_description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.\n", "dataset_name": "wikitablequestions"}, "random-split-5": {"config_name": "random-split-5", "sample_row": "{\"id\": \"\\\"nt-0\\\"\", \"question\": \"\\\"what was the last year where this team was a part...\", \"answers\": \"[\\\"2004\\\"]\", \"table.header\": \"[\\\"Year\\\", \\\"Division\\\", \\\"League\\\", \\\"Regular Season\\\", \\\"...\", \"table.rows\": \"[[\\\"2001\\\", \\\"2\\\", \\\"USL A-League\\\", \\\"4th, Western\\\", \\\"Qu...\", \"table.name\": \"\\\"csv/204-csv/590.tsv\\\"\"}", "columns": ["id", "question", "answers", "table_header", "table_rows", "table_name"], "columns_mapping": {"id": "id", "question": "question", "answers": "answers", "table.header": "table_header", "table.rows": "table_rows", "table.name": "table_name"}, "dataset_description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.\n", "dataset_name": "wikitablequestions"}}, "tags": ["task_categories:question-answering", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "table-question-answering"], "is_gated": false}, "GEM/xwikis": {"dataset_name": "GEM/xwikis", "description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. 
It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.", "downloads": 141, "configs": {"en-fr": {"config_name": "en-fr", "sample_row": "{\"gem_id\": \"\\\"en-fr-train-86694\\\"\", \"gem_parent_id\": \"\\\"en-fr-train-86694\\\"\", \"id\": \"\\\"86694\\\"\", \"src_title\": \"\\\"Abstract algebra\\\"\", \"tgt_title\": \"\\\"Alg\\\\u00e8bre g\\\\u00e9n\\\\u00e9rale\\\"\", \"src_document.title\": \"[\\\"History.\\\", \\\"Early group theory.\\\", \\\"Modern algebr...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"As in other parts of mathematics, concrete probl...\", \"src_summary\": \"\\\"In algebra, which is a broad division of mathemat...\", \"tgt_summary\": \"\\\"L'alg\\\\u00e8bre g\\\\u00e9n\\\\u00e9rale, ou alg\\\\u00e8br...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. 
This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "en-cs": {"config_name": "en-cs", "sample_row": "{\"gem_id\": \"\\\"en-cs-train-730484\\\"\", \"gem_parent_id\": \"\\\"en-cs-train-730484\\\"\", \"id\": \"\\\"730484\\\"\", \"src_title\": \"\\\"Astronomy\\\"\", \"tgt_title\": \"\\\"Astronomie\\\"\", \"src_document.title\": \"[\\\"Etymology.\\\", \\\"Use of terms \\\\\\\"astronomy\\\\\\\" and \\\\\\\"a...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"\\\\\\\"Astronomy\\\\\\\" (from the Greek \\\\u1f00\\\\u03c3\\\\u03c4...\", \"src_summary\": \"\\\"Astronomy (from ) is a natural science that studi...\", \"tgt_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": 
"src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "en-de": {"config_name": "en-de", "sample_row": "{\"gem_id\": \"\\\"en-de-train-610175\\\"\", \"gem_parent_id\": \"\\\"en-de-train-610175\\\"\", \"id\": \"\\\"610175\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"Alan Smithee\\\"\", \"src_document.title\": \"[\\\"History.\\\", \\\"Uses.\\\", \\\"Film direction.\\\"]\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"2\\\"]\", \"src_document.content\": \"[\\\"Before 1968, DGA rules did not permit directors ...\", \"src_summary\": \"\\\"Alan Smithee (also Allen Smithee) is an official ...\", \"tgt_summary\": \"\\\"Alan Smithee steht als Pseudonym f\\\\u00fcr einen f...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": 
"gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "en-zh": {"config_name": "en-zh", "sample_row": "{\"gem_id\": \"\\\"en-zh-train-2184703\\\"\", \"gem_parent_id\": \"\\\"en-zh-train-2184703\\\"\", \"id\": \"\\\"2184703\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"\\\\u827e\\\\u4f26\\\\u00b7\\\\u53f2\\\\u5bc6\\\\u897f\\\"\", \"src_document.title\": \"[\\\"History.\\\", \\\"Uses.\\\", \\\"Film direction.\\\"]\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"2\\\"]\", \"src_document.content\": \"[\\\"Before 1968, DGA rules did not permit directors ...\", \"src_summary\": \"\\\"Alan Smithee (also Allen Smithee) is an official ...\", \"tgt_summary\": 
\"\\\"\\\\u827e\\\\u4f26\\\\u00b7\\\\u53f2\\\\u5bc6\\\\u897f(\\\\u82f1\\\\u8bed...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. 
Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "fr-en": {"config_name": "fr-en", "sample_row": "{\"gem_id\": \"\\\"fr-en-train-1566090\\\"\", \"gem_parent_id\": \"\\\"fr-en-train-1566090\\\"\", \"id\": \"\\\"1566090\\\"\", \"src_title\": \"\\\"Antoine Meillet\\\"\", \"tgt_title\": \"\\\"Antoine Meillet\\\"\", \"src_document.title\": \"[\\\"Biographie.\\\", \\\"\\\\u00c9tudes hom\\\\u00e9riques.\\\"]\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"D'origine bourbonnaise, fils d'un notaire de Ch\\\\...\", \"src_summary\": \"\\\"Paul Jules Antoine Meillet, n\\\\u00e9 le \\\\u00e0 Mou...\", \"tgt_summary\": \"\\\"Paul Jules Antoine Meillet (; 11 November 1866, M...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. 
It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "fr-cs": {"config_name": "fr-cs", "sample_row": "{\"gem_id\": \"\\\"fr-cs-train-37687\\\"\", \"gem_parent_id\": \"\\\"fr-cs-train-37687\\\"\", \"id\": \"\\\"37687\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"\\\"Astronomie\\\"\", \"src_document.title\": \"[\\\"Histoire.\\\", \\\"N\\\\u00e9olithique.\\\", \\\"Antiquit\\\\u00e9...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"3\\\", \\\"3\\\", \\\"3\\\", \\\"2\\\", \\\"3\\\", \\\"3\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"L'astronomie est consid\\\\u00e9r\\\\u00e9e comme la p...\", \"src_summary\": \"\\\"L\\\\u2019astronomie est la science de l\\\\u2019observ...\", \"tgt_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, 
"dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "fr-de": {"config_name": "fr-de", "sample_row": "{\"gem_id\": \"\\\"fr-de-train-1275594\\\"\", \"gem_parent_id\": \"\\\"fr-de-train-1275594\\\"\", \"id\": \"\\\"1275594\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"Alan Smithee\\\"\", \"src_document.title\": \"[\\\"Origine.\\\"]\", \"src_document.section_level\": \"[\\\"1\\\"]\", \"src_document.content\": \"[\\\"Dans un pays o\\\\u00f9 l'Oscar du meilleur film es...\", \"src_summary\": \"\\\"Alan Smithee (on rencontre aussi les formes Allen...\", \"tgt_summary\": \"\\\"Alan Smithee steht als Pseudonym f\\\\u00fcr einen f...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": 
"src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "fr-zh": {"config_name": "fr-zh", "sample_row": "{\"gem_id\": \"\\\"fr-zh-train-1595497\\\"\", \"gem_parent_id\": \"\\\"fr-zh-train-1595497\\\"\", \"id\": \"\\\"1595497\\\"\", \"src_title\": \"\\\"Antoine Meillet\\\"\", \"tgt_title\": \"\\\"\\\\u5b89\\\\u4e1c\\\\u5c3c\\\\u00b7\\\\u6885\\\\u8036\\\"\", \"src_document.title\": \"[\\\"Biographie.\\\", \\\"\\\\u00c9tudes hom\\\\u00e9riques.\\\"]\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"D'origine bourbonnaise, fils d'un notaire de Ch\\\\...\", \"src_summary\": \"\\\"Paul Jules Antoine Meillet, n\\\\u00e9 le \\\\u00e0 Mou...\", \"tgt_summary\": \"\\\"\\\\u5b89\\\\u4e1c\\\\u5c3c\\\\u00b7\\\\u6885\\\\u8036(1866\\\\u5e7411...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", 
"src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "cs-en": {"config_name": "cs-en", "sample_row": "{\"gem_id\": \"\\\"cs-en-train-595004\\\"\", \"gem_parent_id\": \"\\\"cs-en-train-595004\\\"\", \"id\": \"\\\"595004\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"\\\"Astronomy\\\"\", \"src_document.title\": \"[\\\"Historie astronomie.\\\", \\\"Antika.\\\", \\\"Novov\\\\u011bk....\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Astronomie se podobn\\\\u011b jako dal\\\\u0161\\\\u0...\", \"src_summary\": 
\"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\", \"tgt_summary\": \"\\\"Astronomy (from ) is a natural science that studi...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. 
Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "cs-fr": {"config_name": "cs-fr", "sample_row": "{\"gem_id\": \"\\\"cs-fr-train-909261\\\"\", \"gem_parent_id\": \"\\\"cs-fr-train-909261\\\"\", \"id\": \"\\\"909261\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"\\\"Astronomie\\\"\", \"src_document.title\": \"[\\\"Historie astronomie.\\\", \\\"Antika.\\\", \\\"Novov\\\\u011bk....\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Astronomie se podobn\\\\u011b jako dal\\\\u0161\\\\u0...\", \"src_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\", \"tgt_summary\": \"\\\"L\\\\u2019astronomie est la science de l\\\\u2019observ...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. 
This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "cs-de": {"config_name": "cs-de", "sample_row": "{\"gem_id\": \"\\\"cs-de-train-38396\\\"\", \"gem_parent_id\": \"\\\"cs-de-train-38396\\\"\", \"id\": \"\\\"38396\\\"\", \"src_title\": \"\\\"Ak\\\\u010dn\\\\u00ed film\\\"\", \"tgt_title\": \"\\\"Actionfilm\\\"\", \"src_document.title\": \"[\\\"Prvn\\\\u00ed ak\\\\u010dn\\\\u00ed filmy.\\\", \\\"N\\\\u00e1stup...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"Za prvn\\\\u00ed ak\\\\u010dn\\\\u00ed sc\\\\u00e9nu b\\\\u00fd...\", \"src_summary\": \"\\\"Ak\\\\u010dn\\\\u00ed film je filmov\\\\u00fd \\\\u017e\\\\u00e1...\", \"tgt_summary\": \"\\\"Der Actionfilm (von engl. 
\\\\\\\"action\\\\\\\": Tat, Handlu...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. 
Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "cs-zh": {"config_name": "cs-zh", "sample_row": "{\"gem_id\": \"\\\"cs-zh-train-241305\\\"\", \"gem_parent_id\": \"\\\"cs-zh-train-241305\\\"\", \"id\": \"\\\"241305\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"\\\"\\\\u5929\\\\u6587\\\\u5b78\\\"\", \"src_document.title\": \"[\\\"Historie astronomie.\\\", \\\"Antika.\\\", \\\"Novov\\\\u011bk....\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Astronomie se podobn\\\\u011b jako dal\\\\u0161\\\\u0...\", \"src_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\", \"tgt_summary\": \"\\\"\\\\u5929\\\\u6587\\\\u5b66\\\\u662f\\\\u4e00\\\\u95e8\\\\u81ea\\\\u7136\\\\...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. 
This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "de-en": {"config_name": "de-en", "sample_row": "{\"gem_id\": \"\\\"de-en-train-1021816\\\"\", \"gem_parent_id\": \"\\\"de-en-train-1021816\\\"\", \"id\": \"\\\"1021816\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"Alan Smithee\\\"\", \"src_document.title\": \"[\\\"Geschichte.\\\", \\\"Entstehung.\\\", \\\"Aufdeckung und Abk...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Das Pseudonym entstand 1968 infolge der Arbe...\", \"src_summary\": \"\\\"Alan Smithee steht als Pseudonym f\\\\u00fcr einen f...\", \"tgt_summary\": \"\\\"Alan Smithee (also Allen Smithee) is an official ...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": 
"tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"gem_id\": \"\\\"de-fr-train-1069456\\\"\", \"gem_parent_id\": \"\\\"de-fr-train-1069456\\\"\", \"id\": \"\\\"1069456\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"Alan Smithee\\\"\", \"src_document.title\": \"[\\\"Geschichte.\\\", \\\"Entstehung.\\\", \\\"Aufdeckung und Abk...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Das Pseudonym entstand 1968 infolge der Arbe...\", \"src_summary\": \"\\\"Alan Smithee steht als Pseudonym f\\\\u00fcr einen f...\", \"tgt_summary\": \"\\\"Alan Smithee (on rencontre aussi les formes Allen...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": 
"tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "de-cs": {"config_name": "de-cs", "sample_row": "{\"gem_id\": \"\\\"de-cs-train-1599234\\\"\", \"gem_parent_id\": \"\\\"de-cs-train-1599234\\\"\", \"id\": \"\\\"1599234\\\"\", \"src_title\": \"\\\"Ang Lee\\\"\", \"tgt_title\": \"\\\"Ang Lee\\\"\", \"src_document.title\": \"[\\\"Leben.\\\", \\\"Filmisches Werk.\\\", \\\"1992\\\\u20131994: Di...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"Ang Lee wurde 1954 in Taiwan geboren. Seine Elte...\", \"src_summary\": \"\\\"Ang Lee (; * 23. Oktober 1954 in Chaozhou, Landkr...\", \"tgt_summary\": \"\\\"Ang Lee (* 23. 
\\\\u0159\\\\u00edjna 1954, Pingtung, Tc...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. 
Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "de-zh": {"config_name": "de-zh", "sample_row": "{\"gem_id\": \"\\\"de-zh-train-387483\\\"\", \"gem_parent_id\": \"\\\"de-zh-train-387483\\\"\", \"id\": \"\\\"387483\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"\\\\u827e\\\\u502b\\\\u00b7\\\\u53f2\\\\u5bc6\\\\u897f\\\"\", \"src_document.title\": \"[\\\"Geschichte.\\\", \\\"Entstehung.\\\", \\\"Aufdeckung und Abk...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Das Pseudonym entstand 1968 infolge der Arbe...\", \"src_summary\": \"\\\"Alan Smithee steht als Pseudonym f\\\\u00fcr einen f...\", \"tgt_summary\": \"\\\"\\\\u827e\\\\u4f26\\\\u00b7\\\\u53f2\\\\u5bc6\\\\u897f(\\\\u82f1\\\\u8bed...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. 
The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "zh-en": {"config_name": "zh-en", "sample_row": "{\"gem_id\": \"\\\"zh-en-train-2183211\\\"\", \"gem_parent_id\": \"\\\"zh-en-train-2183211\\\"\", \"id\": \"\\\"2183211\\\"\", \"src_title\": \"\\\"\\\\u9515\\\"\", \"tgt_title\": \"\\\"Actinium\\\"\", \"src_document.title\": \"[\\\"\\\\u5386\\\\u53f2.\\\", \\\"\\\\u5c5e\\\\u6027.\\\", \\\"\\\\u5316\\\\u5408\\\\u...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\"]...\", \"src_document.content\": \"[\\\"\\\\u6cd5\\\\u56fd\\\\u5316\\\\u5b66\\\\u5bb6\\\\u5b89\\\\u5fb7\\\\u70c8...\", \"src_summary\": \"\\\"\\\\u9515\\\\u662f\\\\u4e00\\\\u79cd\\\\u653e\\\\u5c04\\\\u6027\\\\u91d1\\\\...\", \"tgt_summary\": \"\\\"Actinium is a chemical element with the symbol Ac...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", 
"src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "zh-fr": {"config_name": "zh-fr", "sample_row": "{\"gem_id\": \"\\\"zh-fr-train-1570465\\\"\", \"gem_parent_id\": \"\\\"zh-fr-train-1570465\\\"\", \"id\": \"\\\"1570465\\\"\", \"src_title\": \"\\\"\\\\u62bd\\\\u8c61\\\\u4ee3\\\\u6570\\\"\", \"tgt_title\": \"\\\"Alg\\\\u00e8bre g\\\\u00e9n\\\\u00e9rale\\\"\", \"src_document.title\": \"[\\\"\\\\u5386\\\\u53f2.\\\", \\\"\\\\u65e9\\\\u671f\\\\u7684\\\\u7fa4\\\\u8bba....\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\\u5982\\\\u540c\\\\u5176\\\\u4ed6\\\\u7684\\\\u6570\\\\u5b66\\\\u9886...\", \"src_summary\": \"\\\"\\\\u62bd\\\\u8c61\\\\u4ee3\\\\u6570\\\\u4f5c\\\\u4e3a\\\\u6570\\\\u5b66\\\\...\", \"tgt_summary\": \"\\\"L'alg\\\\u00e8bre g\\\\u00e9n\\\\u00e9rale, ou alg\\\\u00e8br...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", 
"src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "zh-cs": {"config_name": "zh-cs", "sample_row": "{\"gem_id\": \"\\\"zh-cs-train-291204\\\"\", \"gem_parent_id\": \"\\\"zh-cs-train-291204\\\"\", \"id\": \"\\\"291204\\\"\", \"src_title\": \"\\\"\\\\u65c5\\\\u6e38\\\"\", \"tgt_title\": \"\\\"Turistika\\\"\", \"src_document.title\": \"[\\\"\\\\u65c5\\\\u6e38\\\\u4eba\\\\u58eb.\\\", \\\"\\\\u65c5\\\\u6e38\\\\u7406\\\\...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\",...\", \"src_document.content\": 
\"[\\\"\\\\u65c5\\\\u6e38\\\\u4eba\\\\u58eb\\\\u79f0\\\\u4e3a\\\\u65c5\\\\u5ba2...\", \"src_summary\": \"\\\"\\\\u65c5\\\\u6e38\\\\u5c31\\\\u662f\\\\u65c5\\\\u884c\\\\u6e38\\\\u89c8\\\\...\", \"tgt_summary\": \"\\\"Turistika je z\\\\u00e1jmov\\\\u00e1 \\\\u010dinnost (spor...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. 
Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "zh-de": {"config_name": "zh-de", "sample_row": "{\"gem_id\": \"\\\"zh-de-train-709836\\\"\", \"gem_parent_id\": \"\\\"zh-de-train-709836\\\"\", \"id\": \"\\\"709836\\\"\", \"src_title\": \"\\\"\\\\u9515\\\"\", \"tgt_title\": \"\\\"Actinium\\\"\", \"src_document.title\": \"[\\\"\\\\u5386\\\\u53f2.\\\", \\\"\\\\u5c5e\\\\u6027.\\\", \\\"\\\\u5316\\\\u5408\\\\u...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\"]...\", \"src_document.content\": \"[\\\"\\\\u6cd5\\\\u56fd\\\\u5316\\\\u5b66\\\\u5bb6\\\\u5b89\\\\u5fb7\\\\u70c8...\", \"src_summary\": \"\\\"\\\\u9515\\\\u662f\\\\u4e00\\\\u79cd\\\\u653e\\\\u5c04\\\\u6027\\\\u91d1\\\\...\", \"tgt_summary\": \"\\\"Actinium ist ein radioaktives chemisches Element ...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. 
This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "en": {"config_name": "en", "sample_row": "{\"gem_id\": \"\\\"en-train-730484\\\"\", \"gem_parent_id\": \"\\\"en-train-730484\\\"\", \"id\": \"\\\"730484\\\"\", \"src_title\": \"\\\"Astronomy\\\"\", \"tgt_title\": \"null\", \"src_document.title\": \"[\\\"Etymology.\\\", \\\"Use of terms \\\\\\\"astronomy\\\\\\\" and \\\\\\\"a...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"\\\\\\\"Astronomy\\\\\\\" (from the Greek \\\\u1f00\\\\u03c3\\\\u03c4...\", \"src_summary\": \"\\\"Astronomy (from ) is a natural science that studi...\", \"tgt_summary\": \"null\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", 
"tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "fr": {"config_name": "fr", "sample_row": "{\"gem_id\": \"\\\"fr-train-37687\\\"\", \"gem_parent_id\": \"\\\"fr-train-37687\\\"\", \"id\": \"\\\"37687\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"null\", \"src_document.title\": \"[\\\"Histoire.\\\", \\\"N\\\\u00e9olithique.\\\", \\\"Antiquit\\\\u00e9...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"3\\\", \\\"3\\\", \\\"3\\\", \\\"2\\\", \\\"3\\\", \\\"3\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"L'astronomie est consid\\\\u00e9r\\\\u00e9e comme la p...\", \"src_summary\": \"\\\"L\\\\u2019astronomie est la science de l\\\\u2019observ...\", \"tgt_summary\": \"null\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", 
"tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "cs": {"config_name": "cs", "sample_row": "{\"gem_id\": \"\\\"cs-train-909261\\\"\", \"gem_parent_id\": \"\\\"cs-train-909261\\\"\", \"id\": \"\\\"909261\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"null\", \"src_document.title\": \"[\\\"Historie astronomie.\\\", \\\"Antika.\\\", \\\"Novov\\\\u011bk....\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Astronomie se podobn\\\\u011b jako dal\\\\u0161\\\\u0...\", \"src_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\", \"tgt_summary\": \"null\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", 
"src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "de": {"config_name": "de", "sample_row": "{\"gem_id\": \"\\\"de-train-1599234\\\"\", \"gem_parent_id\": \"\\\"de-train-1599234\\\"\", \"id\": \"\\\"1599234\\\"\", \"src_title\": \"\\\"Ang Lee\\\"\", \"tgt_title\": \"null\", \"src_document.title\": \"[\\\"Leben.\\\", \\\"Filmisches Werk.\\\", \\\"1992\\\\u20131994: Di...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"Ang Lee wurde 1954 in Taiwan geboren. 
Seine Elte...\", \"src_summary\": \"\\\"Ang Lee (; * 23. Oktober 1954 in Chaozhou, Landkr...\", \"tgt_summary\": \"null\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. 
Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "zh": {"config_name": "zh", "sample_row": "{\"gem_id\": \"\\\"zh-train-291204\\\"\", \"gem_parent_id\": \"\\\"zh-train-291204\\\"\", \"id\": \"\\\"291204\\\"\", \"src_title\": \"\\\"\\\\u65c5\\\\u6e38\\\"\", \"tgt_title\": \"null\", \"src_document.title\": \"[\\\"\\\\u65c5\\\\u6e38\\\\u4eba\\\\u58eb.\\\", \\\"\\\\u65c5\\\\u6e38\\\\u7406\\\\...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"\\\\u65c5\\\\u6e38\\\\u4eba\\\\u58eb\\\\u79f0\\\\u4e3a\\\\u65c5\\\\u5ba2...\", \"src_summary\": \"\\\"\\\\u65c5\\\\u6e38\\\\u5c31\\\\u662f\\\\u65c5\\\\u884c\\\\u6e38\\\\u89c8\\\\...\", \"tgt_summary\": \"null\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. 
The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}}, "tags": ["task_categories:summarization", "annotations_creators:found", "multilinguality:unknown", "source_datasets:original", "language:de", "language:en", "language:fr", "language:cs"], "is_gated": false}, "cfilt/iwn_wordlists": {"dataset_name": "cfilt/iwn_wordlists", "description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.", "downloads": 34, "configs": {"assamese": {"config_name": "assamese", "sample_row": "{\"word\": \"\\\"\\\\u09b8\\\\u0982\\\\u099c\\\\u09cd\\\\u099e\\\\u09be\\\\u09b6\\\\u09c2\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "bengali": {"config_name": "bengali", "sample_row": "{\"word\": \"\\\"\\\\u09b8\\\\u09c0\\\\u09ae\\\\u09a8\\\\u09cd\\\\u09a4\\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "bodo": {"config_name": "bodo", "sample_row": "{\"word\": \"\\\"\\\\u0928\\\\u0916\\\\u0930_\\\\u0917\\\\u0948\\\\u092f\\\\u093f\\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, 
"dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "gujarati": {"config_name": "gujarati", "sample_row": "{\"word\": \"\\\"\\\\u0aa6\\\\u0abe\\\\u0ab5\\\\u0abe\\\\u0a97\\\\u0acd\\\\u0aa8\\\\u0abf\\\"...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "hindi": {"config_name": "hindi", "sample_row": "{\"word\": \"\\\"\\\\u0916\\\\u093e\\\\u0938 \\\\u0924\\\\u094c\\\\u0930 \\\\u0938\\\\u094...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "kannada": {"config_name": "kannada", "sample_row": "{\"word\": \"\\\"\\\\u0cae\\\\u0cc3\\\\u0ca4\\\\u0ccd\\\\u0caf\\\\u0c82\\\\u0c9c\\\\u0caf\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "kashmiri": {"config_name": "kashmiri", "sample_row": "{\"word\": \"\\\"\\\\u062c\\\\u0654\\\\u0632\\\\u06cc\\\\u0656\\\\u0631\\\\u064f\\\\u06a9_...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "konkani": {"config_name": "konkani", "sample_row": "{\"word\": \"\\\"\\\\u0939\\\\u093f\\\\u0930\\\\u0923\\\\u094d\\\\u092f\\\\u0915\\\\u0936\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "malayalam": {"config_name": "malayalam", "sample_row": "{\"word\": 
\"\\\"\\\\u0d05\\\\u0d7c\\\\u0d39\\\\u0d24\\\\u0d15\\\\u0d3f\\\\u0d1f\\\\u0d4d\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "manipuri": {"config_name": "manipuri", "sample_row": "{\"word\": \"\\\"mmL_yAMlb \\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "marathi": {"config_name": "marathi", "sample_row": "{\"word\": \"\\\"\\\\u0932\\\\u093e\\\\u0935\\\\u093e\\\\u0932\\\\u093e\\\\u0935\\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "meitei": {"config_name": "meitei", "sample_row": "{\"word\": \"\\\"\\\\uabc3\\\\uabe6\\\\uabdb\\\\uabc1\\\\uabe4\\\\uabc0\\\\uabe3\\\\uabed\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "nepali": {"config_name": "nepali", "sample_row": "{\"word\": \"\\\"\\\\u0906\\\\u0930\\\\u094b\\\\u092a\\\\u0923\\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "oriya": {"config_name": "oriya", "sample_row": "{\"word\": \"\\\"\\\\u0b2d\\\\u0b42\\\\u0b24\\\\u0b2a\\\\u0b4d\\\\u0b30\\\\u0b47\\\\u0b24\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "punjabi": {"config_name": "punjabi", 
"sample_row": "{\"word\": \"\\\"\\\\u0a2e\\\\u0a71\\\\u0a16\\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "sanskrit": {"config_name": "sanskrit", "sample_row": "{\"word\": \"\\\"\\\\u092d\\\\u0930\\\\u0926\\\\u094d\\\\u0935\\\\u093e\\\\u091c\\\\u0903\\\"...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "tamil": {"config_name": "tamil", "sample_row": "{\"word\": \"\\\"\\\\u0ba4\\\\u0bc1\\\\u0ba9\\\\u0bcd\\\\u0ba9\\\\u0bc2\\\\u0bb1\\\\u0bc1\\\"...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "telugu": {"config_name": "telugu", "sample_row": "{\"word\": \"\\\"\\\\u0c28\\\\u0c2e\\\\u0c4d\\\\u0c2e\\\\u0c26\\\\u0c17\\\\u0c3f\\\\u0c28\\\"...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "urdu": {"config_name": "urdu", "sample_row": "{\"word\": \"\\\"\\\\u0646\\\\u0627\\\\u06af\\\\u06cc\\\\u0634\\\\u0648\\\\u0631\\\\u060c\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}}, "tags": ["task_categories:token-classification", "annotations_creators:Shivam Mhaskar, Diptesh Kanojia", "multilinguality:monolingual", "source_datasets:original", "language:as", "language:bn", "language:mni", "language:gu", "language:hi", "language:kn", "language:ks", "language:kok", "language:ml", 
"language:mr", "language:or", "language:ne", "language:pa", "language:sa", "language:ta", "language:te", "language:ur", "abbreviation-detection"], "is_gated": false}, "SocialGrep/the-reddit-place-dataset": {"dataset_name": "SocialGrep/the-reddit-place-dataset", "description": "The written history or /r/Place, in posts and comments.", "downloads": 23, "configs": {"posts": {"config_name": "posts", "sample_row": "{\"type\": \"\\\"post\\\"\", \"id\": \"\\\"twh9v4\\\"\", \"subreddit.id\": \"\\\"2sxhs\\\"\", \"subreddit.name\": \"\\\"place\\\"\", \"subreddit.nsfw\": \"false\", \"created_utc\": \"1649116799\", \"permalink\": \"\\\"https://old.reddit.com/r/place/comments/twh9v4/is...\", \"domain\": \"\\\"i.redd.it\\\"\", \"url\": \"\\\"https://i.redd.it/0kyey4qeplr81.jpg\\\"\", \"selftext\": \"\\\"\\\"\", \"title\": \"\\\"Is this a glitch? What is up with r/place?\\\"\", \"score\": \"8\"}", "columns": ["type", "id", "subreddit_id", "subreddit_name", "subreddit_nsfw", "created_utc", "permalink", "domain", "url", "selftext", "title", "score"], "columns_mapping": {"type": "type", "id": "id", "subreddit.id": "subreddit_id", "subreddit.name": "subreddit_name", "subreddit.nsfw": "subreddit_nsfw", "created_utc": "created_utc", "permalink": "permalink", "domain": "domain", "url": "url", "selftext": "selftext", "title": "title", "score": "score"}, "dataset_description": "The written history or /r/Place, in posts and comments.\n", "dataset_name": "SocialGrep/the-reddit-place-dataset"}, "comments": {"config_name": "comments", "sample_row": "{\"type\": \"1\", \"id\": \"\\\"i3f9n12\\\"\", \"subreddit.id\": \"\\\"2sxhs\\\"\", \"subreddit.name\": \"\\\"place\\\"\", \"subreddit.nsfw\": \"false\", \"created_utc\": \"1649116799\", \"permalink\": \"\\\"https://old.reddit.com/r/place/comments/twdn7y/sp...\", \"body\": \"\\\"[removed]\\\"\", \"sentiment\": \"null\", \"score\": \"1\"}", "columns": ["type", "id", "subreddit_id", "subreddit_name", "subreddit_nsfw", "created_utc", "permalink", 
"body", "sentiment", "score"], "columns_mapping": {"type": "type", "id": "id", "subreddit.id": "subreddit_id", "subreddit.name": "subreddit_name", "subreddit.nsfw": "subreddit_nsfw", "created_utc": "created_utc", "permalink": "permalink", "body": "body", "sentiment": "sentiment", "score": "score"}, "dataset_description": "The written history or /r/Place, in posts and comments.\n", "dataset_name": "SocialGrep/the-reddit-place-dataset"}}, "tags": ["annotations_creators:lexyr", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "StanBienaives/french-open-fiscal-texts": {"dataset_name": "StanBienaives/french-open-fiscal-texts", "description": " This dataset is an extraction from the OPENDATA/JADE. A list of case laws from the French court \"Conseil d'Etat\".", "downloads": 11, "configs": {"default": {"config_name": "default", "sample_row": "{\"title\": \"\\\"CAA de PARIS, 9\\\\u00e8me chambre, 08/02/2018, 17PA...\", \"content\": \"\\\"Vu la proc\\\\u00e9dure suivante :\\\\n\\\\n Proc\\\\u0...\", \"summary\": \"\\\"\\\"\", \"solution\": \"\\\"\\\"\", \"numero\": \"\\\"17PA01570\\\"\", \"publi_receuil\": \"\\\"C\\\"\", \"date\": \"\\\"2018-02-08\\\"\"}", "columns": ["title", "content", "summary", "solution", "numero", "publi_receuil", "date"], "columns_mapping": {"title": "title", "content": "content", "summary": "summary", "solution": "solution", "numero": "numero", "publi_receuil": "publi_receuil", "date": "date"}, "dataset_description": " This dataset is an extraction from the OPENDATA/JADE. 
A list of case laws from the French court \"Conseil d'Etat\".\n", "dataset_name": "StanBienaives/french-open-fiscal-texts"}}, "tags": ["task_categories:summarization", "task_categories:feature-extraction", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original"], "is_gated": false}, "McGill-NLP/TopiOCQA": {"dataset_name": "McGill-NLP/TopiOCQA", "description": "TopiOCQA is an information-seeking conversational dataset with challenging topic switching phenomena.", "downloads": 304, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"Conversation_no\": \"1\", \"Turn_no\": \"1\", \"Question\": \"\\\"what was australia's contribution to the battle o...\", \"Answer\": \"\\\"The army personnel and thousands of Australian ai...\", \"Topic\": \"\\\"Australian contribution to the Battle of Normandy...\", \"Topic_section\": \"\\\"Introduction\\\"\", \"Rationale\": \"\\\" The army personnel and thousands of Australian a...\", \"is_nq\": \"false\", \"Context\": \"[]\", \"Additional_answers.Answer\": \"[]\", \"Additional_answers.Topic\": \"[]\", \"Additional_answers.Topic_section\": \"[]\", \"Additional_answers.Rationale\": \"[]\", \"Gold_passage.id\": \"\\\"wiki:5498209\\\"\", \"Gold_passage.title\": \"\\\"Australian contribution to the Battle of Normandy...\", \"Gold_passage.text\": \"\\\"Australian personnel also took part in the invasi...\"}", "columns": ["Conversation_no", "Turn_no", "Question", "Answer", "Topic", "Topic_section", "Rationale", "is_nq", "Context", "Additional_answers_Answer", "Additional_answers_Topic", "Additional_answers_Topic_section", "Additional_answers_Rationale", "Gold_passage_id", "Gold_passage_title", "Gold_passage_text"], "columns_mapping": {"Conversation_no": "Conversation_no", "Turn_no": "Turn_no", "Question": "Question", "Answer": "Answer", "Topic": "Topic", "Topic_section": "Topic_section", "Rationale": "Rationale", "is_nq": "is_nq", "Context": "Context", "Additional_answers.Answer": 
"Additional_answers_Answer", "Additional_answers.Topic": "Additional_answers_Topic", "Additional_answers.Topic_section": "Additional_answers_Topic_section", "Additional_answers.Rationale": "Additional_answers_Rationale", "Gold_passage.id": "Gold_passage_id", "Gold_passage.title": "Gold_passage_title", "Gold_passage.text": "Gold_passage_text"}, "dataset_description": "TopiOCQA is an information-seeking conversational dataset with challenging topic switching phenomena.\n", "dataset_name": "McGill-NLP/TopiOCQA"}}, "tags": ["task_categories:text-retrieval", "task_categories:text-generation", "task_ids:language-modeling", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en", "conversational-question-answering"], "is_gated": false}, "taln-ls2n/inspec": {"dataset_name": "taln-ls2n/inspec", "description": "Inspec benchmark dataset for keyphrase extraction an generation.", "downloads": 71, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"id\": \"\\\"761\\\"\", \"title\": \"\\\"Towards a NMR implementation of a quantum lattice...\", \"abstract\": \"\\\"Recent theoretical results suggest that an array ...\", \"keyphrases\": \"[\\\"NMR implementation\\\", \\\"quantum lattice gas algori...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"M\\\"]\"}", "columns": ["id", "title", "abstract", "keyphrases", "prmu"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "keyphrases": "keyphrases", "prmu": "prmu"}, "dataset_description": "Inspec benchmark dataset for keyphrase extraction an generation.\n", "dataset_name": "taln-ls2n/inspec"}}, "tags": ["task_categories:text-generation", "annotations_creators:unknown", "multilinguality:monolingual", "language:en"], "is_gated": false}, "taln-ls2n/kp20k": {"dataset_name": "taln-ls2n/kp20k", "description": "KP20k dataset for keyphrase extraction and generation in scientific paper.", "downloads": 32, 
"configs": {"raw": {"config_name": "raw", "sample_row": "{\"id\": \"\\\"vXFe8Vy\\\"\", \"title\": \"\\\"virtually enhancing the perception of user action...\", \"abstract\": \"\\\"This paper proposes using virtual reality to enha...\", \"keyphrases\": \"[\\\"animation\\\", \\\"avatars\\\", \\\"telepresence\\\", \\\"applicat...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"R\\\", \\\"M\\\"]\"}", "columns": ["id", "title", "abstract", "keyphrases", "prmu"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "keyphrases": "keyphrases", "prmu": "prmu"}, "dataset_description": "KP20k dataset for keyphrase extraction and generation in scientific paper.\n", "dataset_name": "taln-ls2n/kp20k"}}, "tags": ["task_categories:text-generation", "annotations_creators:unknown", "multilinguality:monolingual", "language:en", "keyphrase-generation", "keyphrase-extraction", "text-mining"], "is_gated": false}, "conceptual_captions": {"dataset_name": "conceptual_captions", "description": "Google's Conceptual Captions dataset has more than 3 million images, paired with natural-language captions.\nIn contrast with the curated style of the MS-COCO images, Conceptual Captions images and their raw descriptions are harvested from the web,\nand therefore represent a wider variety of styles. 
The raw descriptions are harvested from the Alt-text HTML attribute associated with web images.\nThe authors developed an automatic pipeline that extracts, filters, and transforms candidate image/caption pairs, with the goal of achieving a balance of cleanliness,\ninformativeness, fluency, and learnability of the resulting captions.", "downloads": 2447, "configs": {"unlabeled": {"config_name": "unlabeled", "sample_row": "{\"image_url\": \"\\\"http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/AAA...\", \"caption\": \"\\\"a very typical bus station\\\"\"}", "columns": ["image_url", "caption"], "columns_mapping": {"image_url": "image_url", "caption": "caption"}, "dataset_description": "Google's Conceptual Captions dataset has more than 3 million images, paired with natural-language captions.\nIn contrast with the curated style of the MS-COCO images, Conceptual Captions images and their raw descriptions are harvested from the web,\nand therefore represent a wider variety of styles. The raw descriptions are harvested from the Alt-text HTML attribute associated with web images.\nThe authors developed an automatic pipeline that extracts, filters, and transforms candidate image/caption pairs, with the goal of achieving a balance of cleanliness,\ninformativeness, fluency, and learnability of the resulting captions.\n", "dataset_name": "conceptual_captions"}, "labeled": {"config_name": "labeled", "sample_row": "{\"image_url\": \"\\\"https://thumb1.shutterstock.com/display_pic_with_...\", \"caption\": \"\\\"christmas tree on a black background .\\\"\", \"labels\": \"[\\\"christmas tree\\\", \\\"christmas decoration\\\", \\\"font\\\",...\", \"MIDs\": \"[\\\"/m/025nd\\\", \\\"/m/05fc9mj\\\", \\\"/m/03gq5hm\\\", \\\"/m/07s6n...\", \"confidence_scores\": \"[0.9818305373191833, 0.952756941318512, 0.92273795...\"}", "columns": ["image_url", "caption", "labels", "MIDs", "confidence_scores"], "columns_mapping": {"image_url": "image_url", "caption": "caption", "labels": "labels", "MIDs": 
"MIDs", "confidence_scores": "confidence_scores"}, "dataset_description": "Google's Conceptual Captions dataset has more than 3 million images, paired with natural-language captions.\nIn contrast with the curated style of the MS-COCO images, Conceptual Captions images and their raw descriptions are harvested from the web,\nand therefore represent a wider variety of styles. The raw descriptions are harvested from the Alt-text HTML attribute associated with web images.\nThe authors developed an automatic pipeline that extracts, filters, and transforms candidate image/caption pairs, with the goal of achieving a balance of cleanliness,\ninformativeness, fluency, and learnability of the resulting captions.\n", "dataset_name": "conceptual_captions"}}, "tags": ["task_categories:image-to-text", "task_ids:image-captioning", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "conceptual_12m": {"dataset_name": "conceptual_12m", "description": "Conceptual 12M is a large-scale dataset of 12 million\nimage-text pairs specifically meant to be used for visionand-language pre-training.\nIts data collection pipeline is a relaxed version of the one used in Conceptual Captions 3M.", "downloads": 134, "configs": {"default": {"config_name": "default", "sample_row": "{\"image_url\": \"\\\"https://chairish-prod.freetls.fastly.net/image/pr...\", \"caption\": \"\\\"Metal Design Within Reach Ivory Slipper Chairs - ...\"}", "columns": ["image_url", "caption"], "columns_mapping": {"image_url": "image_url", "caption": "caption"}, "dataset_description": "Conceptual 12M is a large-scale dataset of 12 million\nimage-text pairs specifically meant to be used for visionand-language pre-training.\nIts data collection pipeline is a relaxed version of the one used in Conceptual Captions 3M.\n", "dataset_name": "conceptual_12m"}}, "tags": ["task_categories:image-to-text", "task_ids:image-captioning", "annotations_creators:found", 
"multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "surrey-nlp/PLOD-filtered": {"dataset_name": "surrey-nlp/PLOD-filtered", "description": "This is the dataset repository for PLOD Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Abbreviation Detection.", "downloads": 44, "configs": {"PLODfiltered": {"config_name": "PLODfiltered", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Alternatively\\\", \\\",\\\", \\\"fibroblasts\\\", \\\"were\\\", \\\"pla...\", \"pos_tags\": \"[2, 13, 8, 3, 16, 2, 14, 14, 11, 3, 10, 16, 6, 0, ...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "pos_tags", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "ner_tags": "ner_tags"}, "dataset_description": "\nThis is the dataset repository for PLOD Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Abbreviation Detection.\n", "dataset_name": "surrey-nlp/PLOD-filtered"}}, "tags": ["task_categories:token-classification", "annotations_creators:Leonardo Zilio, Hadeel Saadany, Prashant Sharma, Diptesh Kanojia, Constantin Orasan", "multilinguality:monolingual", "source_datasets:original", "language:en", "abbreviation-detection"], "is_gated": false}, "Divyanshu/indicxnli": {"dataset_name": "Divyanshu/indicxnli", "description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. 
As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).", "downloads": 682, "configs": {"hi": {"config_name": "hi", "sample_row": "{\"premise\": \"\\\"\\\\u0905\\\\u0935\\\\u0927\\\\u093e\\\\u0930\\\\u0923\\\\u093e\\\\u0924\\\\...\", \"hypothesis\": \"\\\"\\\\u0909\\\\u0924\\\\u094d\\\\u092a\\\\u093e\\\\u0926 \\\\u0914\\\\u0930...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "bn": {"config_name": "bn", "sample_row": "{\"premise\": \"\\\"\\\\u09a7\\\\u09be\\\\u09b0\\\\u09a3\\\\u09be\\\\u0997\\\\u09a4\\\\u09ad\\\\...\", \"hypothesis\": \"\\\"\\\\u09aa\\\\u09a3\\\\u09cd\\\\u09af \\\\u098f\\\\u09ac\\\\u0982 \\\\u09a...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. 
As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "mr": {"config_name": "mr", "sample_row": "{\"premise\": \"\\\"\\\\u0938\\\\u0902\\\\u0915\\\\u0932\\\\u094d\\\\u092a\\\\u0928\\\\u093e\\\\...\", \"hypothesis\": \"\\\"\\\\u0909\\\\u0924\\\\u094d\\\\u092a\\\\u093e\\\\u0926\\\\u0928 \\\\u0906...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "as": {"config_name": "as", "sample_row": "{\"premise\": \"\\\"\\\\u09ad\\\\u09be\\\\u09f1\\\\u09bf\\\\u0995\\\\u09ad\\\\u09be\\\\u09ac\\\\...\", \"hypothesis\": \"\\\"\\\\u0989\\\\u09ce\\\\u09aa\\\\u09be\\\\u09a6\\\\u09bf\\\\u09a4 \\\\u09ac...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. 
As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "ta": {"config_name": "ta", "sample_row": "{\"premise\": \"\\\"\\\\u0b95\\\\u0bb0\\\\u0bc1\\\\u0ba4\\\\u0bcd\\\\u0ba4\\\\u0bbf\\\\u0baf\\\\...\", \"hypothesis\": \"\\\"\\\\u0ba4\\\\u0baf\\\\u0bbe\\\\u0bb0\\\\u0bbf\\\\u0baa\\\\u0bcd\\\\u0baa\\\\...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "te": {"config_name": "te", "sample_row": "{\"premise\": \"\\\"\\\\u0c15\\\\u0c4d\\\\u0c30\\\\u0c40\\\\u0c2e\\\\u0c4d \\\\u0c38\\\\u0c4d...\", \"hypothesis\": \"\\\"\\\\u0c09\\\\u0c24\\\\u0c4d\\\\u0c2a\\\\u0c24\\\\u0c4d\\\\u0c24\\\\u0c3f ...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. 
As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "or": {"config_name": "or", "sample_row": "{\"premise\": \"\\\"\\\\u0b15\\\\u0b4d\\\\u0b30\\\\u0b3f\\\\u0b2e \\\\u0b38\\\\u0b4d\\\\u0b15...\", \"hypothesis\": \"\\\"\\\\u0b09\\\\u0b24\\\\u0b4d\\\\u0b2a\\\\u0b3e\\\\u0b26 \\\\u0b0f\\\\u0b2c...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "ml": {"config_name": "ml", "sample_row": "{\"premise\": \"\\\"\\\\u0d06\\\\u0d36\\\\u0d2f\\\\u0d2a\\\\u0d30\\\\u0d2e\\\\u0d3e\\\\u0d2f\\\\...\", \"hypothesis\": \"\\\"\\\\u0d09\\\\u0d7d\\\\u0d2a\\\\u0d4d\\\\u0d2a\\\\u0d28\\\\u0d4d\\\\u0d28\\\\...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. 
As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "pa": {"config_name": "pa", "sample_row": "{\"premise\": \"\\\"\\\\u0a38\\\\u0a3f\\\\u0a27\\\\u0a3e\\\\u0a02\\\\u0a24\\\\u0a15 \\\\u0a24...\", \"hypothesis\": \"\\\"\\\\u0a09\\\\u0a24\\\\u0a2a\\\\u0a3e\\\\u0a26 \\\\u0a05\\\\u0a24\\\\u0a47...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "gu": {"config_name": "gu", "sample_row": "{\"premise\": \"\\\"\\\\u0ab5\\\\u0abf\\\\u0aad\\\\u0abe\\\\u0ab5\\\\u0aa8\\\\u0abe\\\\u0aa4\\\\...\", \"hypothesis\": \"\\\"\\\\u0a89\\\\u0aa4\\\\u0acd\\\\u0aaa\\\\u0abe\\\\u0aa6\\\\u0aa8 \\\\u0a85...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. 
As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "kn": {"config_name": "kn", "sample_row": "{\"premise\": \"\\\"\\\\u0caa\\\\u0cb0\\\\u0cbf\\\\u0c95\\\\u0cb2\\\\u0ccd\\\\u0caa\\\\u0ca8\\\\...\", \"hypothesis\": \"\\\"\\\\u0c89\\\\u0ca4\\\\u0ccd\\\\u0caa\\\\u0ca8\\\\u0ccd\\\\u0ca8 \\\\u0cae...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:machine-generated", "multilinguality:multilingual", "source_datasets:original", "language:as", "language:bn", "language:gu", "language:hi", "language:kn", "language:ml", "language:mr", "language:or", "language:pa", "language:ta", "language:te"], "is_gated": false}, "Yaxin/SemEval2016Task5Raw": {"dataset_name": "Yaxin/SemEval2016Task5Raw", "description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.", "downloads": 64, "configs": {"All": {"config_name": "All", "sample_row": "{\"text\": \"\\\"Judging from previous posts this used to be a goo...\", \"opinions\": \"[{\\\"target\\\": \\\"place\\\", \\\"category\\\": \\\"RESTAURANT#GENER...\", \"language\": \"\\\"english\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"1004293\\\"\", \"sentenceId\": \"\\\"1004293:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", 
"reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_english": {"config_name": "restaurants_english", "sample_row": "{\"text\": \"\\\"Judging from previous posts this used to be a goo...\", \"opinions\": \"[{\\\"target\\\": \\\"place\\\", \\\"category\\\": \\\"RESTAURANT#GENER...\", \"language\": \"\\\"english\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"1004293\\\"\", \"sentenceId\": \"\\\"1004293:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_french": {"config_name": "restaurants_french", "sample_row": "{\"text\": \"\\\"Un service passable .. 
Des plats surcuits, des sa...\", \"opinions\": \"[{\\\"target\\\": \\\"service\\\", \\\"category\\\": \\\"SERVICE#GENERA...\", \"language\": \"\\\"french\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"g1079435-d3498474-r271346275\\\"\", \"sentenceId\": \"\\\"g1079435-d3498474-r271346275:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_spanish": {"config_name": "restaurants_spanish", "sample_row": "{\"text\": \"\\\"Nos sentimos muy a gusto.\\\"\", \"opinions\": \"[{\\\"target\\\": \\\"NULL\\\", \\\"category\\\": \\\"RESTAURANT#GENERA...\", \"language\": \"\\\"spanish\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"es_9reinas_10_JordiCollGranell_2014-09-21\\\"\", \"sentenceId\": \"\\\"es_9reinas_10_JordiCollGranell_2014-09-21:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_russian": {"config_name": "restaurants_russian", "sample_row": "{\"text\": \"\\\"\\\\u0414\\\\u043e\\\\u0431\\\\u0440\\\\u044b\\\\u0439 \\\\u0447\\\\u0430...\", \"opinions\": \"[]\", \"language\": \"\\\"russian\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"27925\\\"\", \"sentenceId\": \"\\\"27925:0\\\"\"}", "columns": ["text", "opinions", "language", 
"domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_dutch": {"config_name": "restaurants_dutch", "sample_row": "{\"text\": \"\\\"Lange wachttijd.\\\"\", \"opinions\": \"[{\\\"target\\\": \\\"wachttijd\\\", \\\"category\\\": \\\"SERVICE#GENE...\", \"language\": \"\\\"dutch\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"Review-g1006565-d2066794_1\\\"\", \"sentenceId\": \"\\\"Review-g1006565-d2066794_1:1\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_turkish": {"config_name": "restaurants_turkish", "sample_row": "{\"text\": \"\\\"Manzara sahane evet ama servis rezalet.\\\"\", \"opinions\": \"[{\\\"target\\\": \\\"servis\\\", \\\"category\\\": \\\"SERVICE#GENERAL...\", \"language\": \"\\\"turkish\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"1000\\\"\", \"sentenceId\": \"\\\"1000:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": 
"Yaxin/SemEval2016Task5Raw"}, "hotels_arabic": {"config_name": "hotels_arabic", "sample_row": "{\"text\": \"\\\"\\\\u0623\\\\u0646\\\\u0635\\\\u062d \\\\u0628\\\\u0627\\\\u0644\\\\u0646...\", \"opinions\": \"[{\\\"target\\\": \\\"\\\\u0645\\\\u0648\\\\u0642\\\\u0639\\\", \\\"category\\\"...\", \"language\": \"\\\"arabic\\\"\", \"domain\": \"\\\"hotels\\\"\", \"reviewId\": \"\\\"456\\\"\", \"sentenceId\": \"\\\"456:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "mobilephones_dutch": {"config_name": "mobilephones_dutch", "sample_row": "{\"text\": \"\\\"Ik zou deze gsm ten sterkste aanbevelen ik was la...\", \"opinions\": \"[{\\\"target\\\": \\\"\\\", \\\"category\\\": \\\"BATTERY#OPERATION_PER...\", \"language\": \"\\\"dutch\\\"\", \"domain\": \"\\\"mobilephones\\\"\", \"reviewId\": \"\\\"Huawei_Ascend_G6_4G_4\\\"\", \"sentenceId\": \"\\\"Huawei_Ascend_G6_4G_4:1\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "mobilephones_chinese": {"config_name": "mobilephones_chinese", "sample_row": "{\"text\": \"\\\"\\\\u4eca\\\\u5929\\\\u6709\\\\u5e78\\\\u62ff\\\\u5230\\\\u4e86\\\\u6e2f\\\\...\", \"opinions\": \"[]\", \"language\": \"\\\"chinese\\\"\", \"domain\": \"\\\"mobilephones\\\"\", \"reviewId\": 
\"\\\"1\\\"\", \"sentenceId\": \"\\\"1:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "laptops_english": {"config_name": "laptops_english", "sample_row": "{\"text\": \"\\\"Being a PC user my whole life....\\\"\", \"opinions\": \"[]\", \"language\": \"\\\"english\\\"\", \"domain\": \"\\\"laptops\\\"\", \"reviewId\": \"\\\"79\\\"\", \"sentenceId\": \"\\\"79:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "digitalcameras_chinese": {"config_name": "digitalcameras_chinese", "sample_row": "{\"text\": \"\\\"\\\\u5343\\\\u547c\\\\u4e07\\\\u5524\\\\u59cb\\\\u51fa\\\\u6765\\\\uff0c\\\"...\", \"opinions\": \"[]\", \"language\": \"\\\"chinese\\\"\", \"domain\": \"\\\"digitalcameras\\\"\", \"reviewId\": \"\\\"1\\\"\", \"sentenceId\": \"\\\"1:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}}, "tags": [], "is_gated": false}, 
"Yaxin/SemEval2015Task12Raw": {"dataset_name": "Yaxin/SemEval2015Task12Raw", "description": "A collection of SemEval2015 specifically designed to aid research in Aspect Based Sentiment Analysis.", "downloads": 14, "configs": {"All": {"config_name": "All", "sample_row": "{\"text\": \"\\\"Judging from previous posts this used to be a goo...\", \"opinions\": \"[{\\\"target\\\": \\\"place\\\", \\\"category\\\": \\\"RESTAURANT#GENER...\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"1004293\\\"\", \"sentenceId\": \"\\\"1004293:0\\\"\"}", "columns": ["text", "opinions", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2015 specifically designed to aid research in Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2015Task12Raw"}, "restaurants": {"config_name": "restaurants", "sample_row": "{\"text\": \"\\\"Judging from previous posts this used to be a goo...\", \"opinions\": \"[{\\\"target\\\": \\\"place\\\", \\\"category\\\": \\\"RESTAURANT#GENER...\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"1004293\\\"\", \"sentenceId\": \"\\\"1004293:0\\\"\"}", "columns": ["text", "opinions", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2015 specifically designed to aid research in Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2015Task12Raw"}, "laptops": {"config_name": "laptops", "sample_row": "{\"text\": \"\\\"Being a PC user my whole life....\\\"\", \"opinions\": \"[]\", \"domain\": \"\\\"laptops\\\"\", \"reviewId\": \"\\\"79\\\"\", \"sentenceId\": \"\\\"79:0\\\"\"}", "columns": ["text", "opinions", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": 
"opinions", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2015 specifically designed to aid research in Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2015Task12Raw"}}, "tags": [], "is_gated": false}, "cfilt/HiNER-collapsed": {"dataset_name": "cfilt/HiNER-collapsed", "description": "This is the repository for HiNER - a large Hindi Named Entity Recognition dataset.", "downloads": 61, "configs": {"HiNER-Collapsed": {"config_name": "HiNER-Collapsed", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u0907\\\\u0938\\\", \\\"\\\\u0915\\\\u093c\\\\u093e\\\\u0928\\\\u0942\\\\u...\", \"ner_tags\": \"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nThis is the repository for HiNER - a large Hindi Named Entity Recognition dataset.\n", "dataset_name": "cfilt/HiNER-collapsed"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:hi"], "is_gated": false}, "taln-ls2n/semeval-2010-pre": {"dataset_name": "taln-ls2n/semeval-2010-pre", "description": "Preprocessed SemEval-2010 Benchmark dataset for Keyphrase Generation.", "downloads": 38, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"id\": \"\\\"J-39\\\"\", \"title\": \"\\\"The Sequential Auction Problem on eBay: An Empiri...\", \"abstract\": \"\\\"Bidders on eBay have no dominant bidding strategy...\", \"keyphrases\": \"[\\\"sequenti auction problem\\\", \\\"empir analysi\\\", \\\"bid...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"U\\\", \\\"M\\\",...\", \"lvl-1\": \"\\\"The Sequential Auction Problem on eBay: An Empiri...\", \"lvl-2\": \"\\\"The Sequential 
Auction Problem on eBay: An Empiri...\", \"lvl-3\": \"\\\"The Sequential Auction Problem on eBay: An Empiri...\", \"lvl-4\": \"\\\"The Sequential Auction Problem on eBay: An Empiri...\"}", "columns": ["id", "title", "abstract", "keyphrases", "prmu", "lvl-1", "lvl-2", "lvl-3", "lvl-4"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "keyphrases": "keyphrases", "prmu": "prmu", "lvl-1": "lvl-1", "lvl-2": "lvl-2", "lvl-3": "lvl-3", "lvl-4": "lvl-4"}, "dataset_description": "Preprocessed SemEval-2010 Benchmark dataset for Keyphrase Generation.\n", "dataset_name": "taln-ls2n/semeval-2010-pre"}}, "tags": ["task_categories:text-generation", "annotations_creators:unknown", "multilinguality:monolingual", "language:en"], "is_gated": false}, "McGill-NLP/FaithDial": {"dataset_name": "McGill-NLP/FaithDial", "description": "FaithDial is a new benchmark for hallucination-free dialogues, created by manually editing hallucinated and uncooperative responses in Wizard of Wikipedia.", "downloads": 929, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"dialog_idx\": \"0\", \"response\": \"\\\"Yeah, but once the access to the internet was a r...\", \"original_response\": \"\\\"No I could not! 
I couldn't imagine living when in...\", \"history\": \"[\\\"Can you imagine the world without internet acces...\", \"knowledge\": \"\\\"Internet access was once rare, but has grown rapi...\", \"BEGIN\": \"[\\\"Hallucination\\\"]\", \"VRM\": \"[\\\"Disclosure\\\", \\\"Ack.\\\"]\"}", "columns": ["dialog_idx", "response", "original_response", "history", "knowledge", "BEGIN", "VRM"], "columns_mapping": {"dialog_idx": "dialog_idx", "response": "response", "original_response": "original_response", "history": "history", "knowledge": "knowledge", "BEGIN": "BEGIN", "VRM": "VRM"}, "dataset_description": "FaithDial is a new benchmark for hallucination-free dialogues, created by manually editing hallucinated and uncooperative responses in Wizard of Wikipedia.\n", "dataset_name": "McGill-NLP/FaithDial"}}, "tags": ["task_categories:conversational", "task_categories:text-generation", "task_ids:dialogue-modeling", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en", "faithful-dialogue-modeling", "trustworthy-dialogue-modeling"], "is_gated": false}, "cfilt/HiNER-original": {"dataset_name": "cfilt/HiNER-original", "description": "This is the dataset repository for HiNER Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Named Entity Recognitin for the Hindi language.", "downloads": 266, "configs": {"HiNER": {"config_name": "HiNER", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u0907\\\\u0938\\\", \\\"\\\\u0915\\\\u093c\\\\u093e\\\\u0928\\\\u0942\\\\u...\", \"ner_tags\": \"[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nThis is the dataset repository for HiNER Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Named Entity Recognitin for the Hindi language.\n", 
"dataset_name": "cfilt/HiNER-original"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:hi"], "is_gated": false}, "AmazonScience/massive": {"dataset_name": "AmazonScience/massive", "description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.", "downloads": 9770, "configs": {"af-ZA": {"config_name": "af-ZA", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"af-ZA\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"maak my wakker nege-uur v. m. op vrydag\\\"\", \"annot_utt\": \"\\\"maak my wakker [time : nege-uur v. m.] 
op [date :...\", \"worker_id\": \"\\\"20\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"40\\\", \\\"49\\\", \\\"20\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. 
MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "am-ET": {"config_name": "am-ET", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"am-ET\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u12a0\\\\u122d\\\\u1265 \\\\u12d8\\\\u1320\\\\u129d \\\\u12a4. \\\\u1...\", \"annot_utt\": \"\\\"[date : \\\\u12a0\\\\u122d\\\\u1265] [time : \\\\u12d8\\\\u1320\\\\...\", \"worker_id\": \"\\\"18\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"44\\\", \\\"20\\\", \\\"55\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target|english\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a 
parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ar-SA": {"config_name": "ar-SA", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ar-SA\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0635\\\\u062d\\\\u064a\\\\u0646\\\\u064a \\\\u062a\\\\u0633\\\\u0639...\", \"annot_utt\": \"\\\"\\\\u0635\\\\u062d\\\\u064a\\\\u0646\\\\u064a [time : \\\\u062a\\\\u06...\", \"worker_id\": \"\\\"31\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"31\\\", \\\"19\\\", \\\"20\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", 
"judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "az-AZ": {"config_name": "az-AZ", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"az-AZ\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"c\\\\u00fcm\\\\u0259 g\\\\u00fcn\\\\u00fc s\\\\u0259h\\\\u0259r saa...\", \"annot_utt\": \"\\\"[date : c\\\\u00fcm\\\\u0259 g\\\\u00fcn\\\\u00fc] [time : s\\\\...\", \"worker_id\": \"\\\"14\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"14\\\", \\\"29\\\", \\\"7\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": 
"slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "bn-BD": {"config_name": "bn-BD", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"bn-BD\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0986\\\\u09ae\\\\u09be\\\\u0995\\\\u09c7 \\\\u09b6\\\\u09c1\\\\u0995...\", \"annot_utt\": \"\\\"\\\\u0986\\\\u09ae\\\\u09be\\\\u0995\\\\u09c7 [date : \\\\u09b6\\\\u09...\", \"worker_id\": \"\\\"19\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"0\\\", \\\"12\\\", \\\"1\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 1]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], 
"columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ca-ES": {"config_name": "ca-ES", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ca-ES\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"desperta'm a les nou a. m. del divendres\\\"\", \"annot_utt\": \"\\\"desperta'm a les [time : nou a. m.] 
del [date : d...\", \"worker_id\": \"\\\"42\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"42\\\", \\\"30\\\", \\\"3\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target|english\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. 
MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "cy-GB": {"config_name": "cy-GB", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"cy-GB\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"deffra fi am naw y bore ar dydd gwener\\\"\", \"annot_utt\": \"\\\"deffra fi am [time : naw y bore] ar [date : dydd ...\", \"worker_id\": \"\\\"8\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"4\\\", \\\"1\\\", \\\"5\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[1, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with 
annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "da-DK": {"config_name": "da-DK", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"da-DK\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"v\\\\u00e6k mig klokken ni fredag\\\"\", \"annot_utt\": \"\\\"v\\\\u00e6k mig klokken [time : ni] [date : fredag]\\\"...\", \"worker_id\": \"\\\"6\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"19\\\", \\\"6\\\", \\\"17\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", 
"judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "de-DE": {"config_name": "de-DE", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"de-DE\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"weck mich am freitag um neun uhr auf\\\"\", \"annot_utt\": \"\\\"weck mich am [date : freitag] um [time : neun uhr...\", \"worker_id\": \"\\\"18\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"28\\\", \\\"8\\\", \\\"18\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", 
"judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "el-GR": {"config_name": "el-GR", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"el-GR\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u03be\\\\u03cd\\\\u03c0\\\\u03bd\\\\u03b1 \\\\u03bc\\\\u03b5 \\\\u03c...\", \"annot_utt\": \"\\\"\\\\u03be\\\\u03cd\\\\u03c0\\\\u03bd\\\\u03b1 \\\\u03bc\\\\u03b5 \\\\u03c...\", \"worker_id\": \"\\\"30\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"28\\\", \\\"68\\\", \\\"23\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": 
"annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "en-US": {"config_name": "en-US", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"en-US\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"wake me up at nine am on friday\\\"\", \"annot_utt\": \"\\\"wake me up at [time : nine am] on [date : friday]...\", \"worker_id\": \"\\\"1\\\"\", \"slot_method.slot\": \"[]\", \"slot_method.method\": \"[]\", \"judgments.worker_id\": \"[]\", \"judgments.intent_score\": \"[]\", \"judgments.slots_score\": \"[]\", \"judgments.grammar_score\": \"[]\", \"judgments.spelling_score\": \"[]\", \"judgments.language_identification\": \"[]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": 
"worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "es-ES": {"config_name": "es-ES", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"es-ES\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"despi\\\\u00e9rtame a las nueve de la ma\\\\u00f1ana el...\", \"annot_utt\": \"\\\"despi\\\\u00e9rtame a las [time : nueve de la ma\\\\u00...\", \"worker_id\": \"\\\"5\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"21\\\", \\\"5\\\", \\\"3\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], 
"columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "fa-IR": {"config_name": "fa-IR", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"fa-IR\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0645\\\\u0631\\\\u0627 \\\\u062c\\\\u0645\\\\u0639\\\\u0647 \\\\u063...\", \"annot_utt\": \"\\\"\\\\u0645\\\\u0631\\\\u0627 [date : \\\\u062c\\\\u0645\\\\u0639\\\\u06...\", \"worker_id\": \"\\\"3\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"3\\\", \\\"8\\\", \\\"14\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", 
"slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. 
MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "fi-FI": {"config_name": "fi-FI", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"fi-FI\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"her\\\\u00e4t\\\\u00e4 minut aamuyhdeks\\\\u00e4lt\\\\u00e4 p...\", \"annot_utt\": \"\\\"her\\\\u00e4t\\\\u00e4 minut [time : aamuyhdeks\\\\u00e4lt...\", \"worker_id\": \"\\\"17\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"17\\\", \\\"6\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M 
utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "fr-FR": {"config_name": "fr-FR", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"fr-FR\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"r\\\\u00e9veille-moi \\\\u00e0 neuf heures du matin le ...\", \"annot_utt\": \"\\\"r\\\\u00e9veille-moi \\\\u00e0 [time : neuf heures du m...\", \"worker_id\": \"\\\"22\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"11\\\", \\\"22\\\", \\\"0\\\"]\", \"judgments.intent_score\": \"[2, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", 
"judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "he-IL": {"config_name": "he-IL", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"he-IL\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u05db\\\\u05d5\\\\u05d5\\\\u05df \\\\u05d0\\\\u05ea \\\\u05d4\\\\u05e...\", \"annot_utt\": \"\\\"\\\\u05db\\\\u05d5\\\\u05d5\\\\u05df \\\\u05d0\\\\u05ea \\\\u05d4\\\\u05e...\", \"worker_id\": \"\\\"29\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"16\\\", \\\"3\\\", \\\"0\\\"]\", \"judgments.intent_score\": \"[2, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": 
"slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "hi-IN": {"config_name": "hi-IN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"hi-IN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0936\\\\u0941\\\\u0915\\\\u094d\\\\u0930\\\\u0935\\\\u093e\\\\u0930 ...\", \"annot_utt\": \"\\\"[date : \\\\u0936\\\\u0941\\\\u0915\\\\u094d\\\\u0930\\\\u0935\\\\u093...\", \"worker_id\": \"\\\"45\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"16\\\", \\\"3\\\", \\\"42\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 3]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": 
"locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "hu-HU": {"config_name": "hu-HU", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"hu-HU\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u00e9bressz fel reggel kilenckor p\\\\u00e9nteken\\\"...\", \"annot_utt\": \"\\\"\\\\u00e9bressz fel [time : reggel kilenckor] [date ...\", \"worker_id\": \"\\\"12\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"12\\\", \\\"28\\\", \\\"31\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", 
"judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. 
MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "hy-AM": {"config_name": "hy-AM", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"hy-AM\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0561\\\\u0580\\\\u0569\\\\u0576\\\\u0561\\\\u0581\\\\u0580\\\\u0578\\\\...\", \"annot_utt\": \"\\\"\\\\u0561\\\\u0580\\\\u0569\\\\u0576\\\\u0561\\\\u0581\\\\u0580\\\\u0578\\\\...\", \"worker_id\": \"\\\"39\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"37\\\", \\\"11\\\", \\\"6\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 1]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a 
parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "id-ID": {"config_name": "id-ID", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"id-ID\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"bagunkan saya jam sembilan pagi hari jumat\\\"\", \"annot_utt\": \"\\\"bagunkan saya [time : jam sembilan pagi] hari [da...\", \"worker_id\": \"\\\"21\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"7\\\", \\\"15\\\", \\\"9\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", 
"judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "is-IS": {"config_name": "is-IS", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"is-IS\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"vekja mig klukkan n\\\\u00edu a\\\\u00f0 morgni \\\\u00e1 ...\", \"annot_utt\": \"\\\"vekja mig klukkan [time : n\\\\u00edu a\\\\u00f0 morgni...\", \"worker_id\": \"\\\"8\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"11\\\", \\\"20\\\", \\\"21\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": 
"judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "it-IT": {"config_name": "it-IT", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"it-IT\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"svegliami alle nove di mattina venerd\\\\u00ec\\\"\", \"annot_utt\": \"\\\"svegliami alle [time : nove] di mattina [date : v...\", \"worker_id\": \"\\\"34\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"40\\\", \\\"18\\\", \\\"6\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": 
"utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ja-JP": {"config_name": "ja-JP", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ja-JP\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u91d1\\\\u66dc\\\\u65e5\\\\u306e\\\\u5348\\\\u524d\\\\u4e5d\\\\u6642\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u91d1\\\\u66dc\\\\u65e5] \\\\u306e [time : \\\\u5348...\", \"worker_id\": \"\\\"3\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"8\\\", \\\"5\\\", \\\"16\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", 
"judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "jv-ID": {"config_name": "jv-ID", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"jv-ID\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"gugah aku jam sanga esuk dina jumat\\\"\", \"annot_utt\": \"\\\"gugah aku jam [time : sanga esuk] dina [date : ju...\", \"worker_id\": \"\\\"9\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"16\\\", \\\"10\\\", \\\"17\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", 
"intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. 
MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ka-GE": {"config_name": "ka-GE", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ka-GE\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u10d3\\\\u10d8\\\\u10da\\\\u10d8\\\\u10e1 \\\\u10ea\\\\u10ee\\\\u10e0...\", \"annot_utt\": \"\\\"[time : \\\\u10d3\\\\u10d8\\\\u10da\\\\u10d8\\\\u10e1 \\\\u10ea\\\\u10...\", \"worker_id\": \"\\\"42\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"38\\\", \\\"45\\\", \\\"17\\\"]\", \"judgments.intent_score\": \"[0, 0, 0]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[2, 1, 1]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel 
dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "km-KH": {"config_name": "km-KH", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"km-KH\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u1798\\\\u17c9\\\\u17c4\\\\u1784 \\\\u1794\\\\u17d2\\\\u179a\\\\u17b6...\", \"annot_utt\": \"\\\"\\\\u1798\\\\u17c9\\\\u17c4\\\\u1784 [time : \\\\u1794\\\\u17d2\\\\u17...\", \"worker_id\": \"\\\"20\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"10\\\", \\\"29\\\", \\\"30\\\"]\", \"judgments.intent_score\": \"[1, 0, 1]\", \"judgments.slots_score\": \"[1, 2, 2]\", \"judgments.grammar_score\": \"[3, 0, 3]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", 
"judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "kn-IN": {"config_name": "kn-IN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"kn-IN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0cb6\\\\u0cc1\\\\u0c95\\\\u0ccd\\\\u0cb0\\\\u0cb5\\\\u0cbe\\\\u0cb0 ...\", \"annot_utt\": \"\\\"[date : \\\\u0cb6\\\\u0cc1\\\\u0c95\\\\u0ccd\\\\u0cb0\\\\u0cb5\\\\u0cb...\", \"worker_id\": \"\\\"7\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"14\\\", \\\"7\\\", \\\"6\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": 
"slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ko-KR": {"config_name": "ko-KR", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ko-KR\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\uae08\\\\uc694\\\\uc77c \\\\uc624\\\\uc804 \\\\uc544\\\\ud649 \\\\uc2...\", \"annot_utt\": \"\\\"[date : \\\\uae08\\\\uc694\\\\uc77c] [time : \\\\uc624\\\\uc804 ...\", \"worker_id\": \"\\\"23\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"23\\\", \\\"2\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], 
"columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "lv-LV": {"config_name": "lv-LV", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"lv-LV\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"piektdien pamodini mani devi\\\\u0146os no r\\\\u012bta...\", \"annot_utt\": \"\\\"[date : piektdien] pamodini mani [time : devi\\\\u01...\", \"worker_id\": \"\\\"23\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"localization\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"22\\\", \\\"9\\\", \\\"15\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", 
"slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. 
MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ml-IN": {"config_name": "ml-IN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ml-IN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0d35\\\\u0d46\\\\u0d33\\\\u0d4d\\\\u0d33\\\\u0d3f\\\\u0d2f\\\\u0d3e\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u0d35\\\\u0d46\\\\u0d33\\\\u0d4d\\\\u0d33\\\\u0d3f\\\\u0d2...\", \"worker_id\": \"\\\"26\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"26\\\", \\\"23\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a 
parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "mn-MN": {"config_name": "mn-MN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"mn-MN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u0430\\\\u0434 ...\", \"annot_utt\": \"\\\"[date : \\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u043...\", \"worker_id\": \"\\\"16\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"4\\\", \\\"2\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 3, 4]\", \"judgments.spelling_score\": \"[1, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", 
"judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ms-MY": {"config_name": "ms-MY", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ms-MY\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"kejutkan saya pada pukul sembilan pagi hari jumaa...\", \"annot_utt\": \"\\\"kejutkan saya pada pukul [time : sembilan pagi] h...\", \"worker_id\": \"\\\"12\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"9\\\", \\\"5\\\", \\\"1\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": 
"slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "my-MM": {"config_name": "my-MM", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"my-MM\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u1004\\\\u102b\\\\u1037\\\\u1000\\\\u102d\\\\u102f \\\\u101e\\\\u1031...\", \"annot_utt\": \"\\\"\\\\u1004\\\\u102b\\\\u1037\\\\u1000\\\\u102d\\\\u102f [date : \\\\u10...\", \"worker_id\": \"\\\"33\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"90\\\", \\\"48\\\", \\\"39\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", 
"locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "nb-NO": {"config_name": "nb-NO", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"nb-NO\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"vekk meg ni null null p\\\\u00e5 fredag\\\"\", \"annot_utt\": \"\\\"vekk meg [time : ni null null] p\\\\u00e5 [date : fr...\", \"worker_id\": \"\\\"15\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"10\\\", \\\"19\\\", \\\"11\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", 
"judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. 
MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "nl-NL": {"config_name": "nl-NL", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"nl-NL\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"maakt mijn wakker om negen uur in de ochtend op v...\", \"annot_utt\": \"\\\"maakt mijn wakker om [time : negen uur in de ocht...\", \"worker_id\": \"\\\"22\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"35\\\", \\\"34\\\", \\\"31\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 3]\", \"judgments.spelling_score\": \"[2, 1, 1]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 
languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "pl-PL": {"config_name": "pl-PL", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"pl-PL\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"obud\\\\u017a mnie o dziewi\\\\u0105tej rano w pi\\\\u0105...\", \"annot_utt\": \"\\\"obud\\\\u017a mnie o [time : dziewi\\\\u0105tej rano] w...\", \"worker_id\": \"\\\"9\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"21\\\", \\\"11\\\", \\\"5\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": 
"judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "pt-PT": {"config_name": "pt-PT", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"pt-PT\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"acorda-me \\\\u00e0s nove da manh\\\\u00e3 na sexta-fei...\", \"annot_utt\": \"\\\"acorda-me \\\\u00e0s [time : nove da manh\\\\u00e3] na ...\", \"worker_id\": \"\\\"14\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"localization\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"6\\\", \\\"8\\\", \\\"12\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 2]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[1, 1, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", 
"judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ro-RO": {"config_name": "ro-RO", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ro-RO\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"trezeste-ma vineri la noua dimineata\\\"\", \"annot_utt\": \"\\\"trezeste-ma [date : vineri] la [time : noua dimin...\", \"worker_id\": \"\\\"6\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"3\\\", \\\"63\\\", \\\"10\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", 
"worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ru-RU": {"config_name": "ru-RU", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ru-RU\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0440\\\\u0430\\\\u0437\\\\u0431\\\\u0443\\\\u0434\\\\u0438 \\\\u043c...\", \"annot_utt\": \"\\\"\\\\u0440\\\\u0430\\\\u0437\\\\u0431\\\\u0443\\\\u0434\\\\u0438 \\\\u043c...\", \"worker_id\": \"\\\"11\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"4\\\", \\\"32\\\", \\\"8\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", 
"judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "sl-SL": {"config_name": "sl-SL", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"sl-SL\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"zbudi me ob devetih zjutraj v petek\\\"\", \"annot_utt\": \"\\\"zbudi me ob [time : devetih zjutraj] v [date : pe...\", \"worker_id\": \"\\\"14\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"1\\\", \\\"13\\\", \\\"0\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 4, 3]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", 
"worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. 
MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "sq-AL": {"config_name": "sq-AL", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"sq-AL\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"m\\\\u00eb zgjo t\\\\u00eb premten n\\\\u00eb n\\\\u00ebnt\\\\u0...\", \"annot_utt\": \"\\\"m\\\\u00eb zgjo [date : t\\\\u00eb premten] n\\\\u00eb [ti...\", \"worker_id\": \"\\\"16\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"localization\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"3\\\", \\\"2\\\", \\\"16\\\"]\", \"judgments.intent_score\": \"[1, 1, 2]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target|english\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 
1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "sv-SE": {"config_name": "sv-SE", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"sv-SE\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"v\\\\u00e4ck mig vid nio p\\\\u00e5 fredag\\\"\", \"annot_utt\": \"\\\"v\\\\u00e4ck mig vid [time : nio] p\\\\u00e5 [date : fr...\", \"worker_id\": \"\\\"20\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"11\\\", \\\"20\\\", \\\"0\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": 
"judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "sw-KE": {"config_name": "sw-KE", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"sw-KE\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"niamshe saa tatu asubuhi ijumaa\\\"\", \"annot_utt\": \"\\\"niamshe [time : saa tatu asubuhi] [date : ijumaa]...\", \"worker_id\": \"\\\"59\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"33\\\", \\\"1\\\", \\\"52\\\"]\", \"judgments.intent_score\": \"[1, 1, 2]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": 
"judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ta-IN": {"config_name": "ta-IN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ta-IN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0bb5\\\\u0bc6\\\\u0bb3\\\\u0bcd\\\\u0bb3\\\\u0bbf\\\\u0b95\\\\u0bcd\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u0bb5\\\\u0bc6\\\\u0bb3\\\\u0bcd\\\\u0bb3\\\\u0bbf\\\\u0b9...\", \"worker_id\": \"\\\"23\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"23\\\", \\\"17\\\", \\\"13\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": 
"utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "te-IN": {"config_name": "te-IN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"te-IN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0c36\\\\u0c41\\\\u0c15\\\\u0c4d\\\\u0c30\\\\u0c35\\\\u0c3e\\\\u0c30\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u0c36\\\\u0c41\\\\u0c15\\\\u0c4d\\\\u0c30\\\\u0c35\\\\u0c3...\", \"worker_id\": \"\\\"21\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"2\\\", \\\"15\\\", \\\"1\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 4, 4]\", \"judgments.spelling_score\": \"[1, 2, 1]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", 
"judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "th-TH": {"config_name": "th-TH", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"th-TH\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0e1b\\\\u0e25\\\\u0e38\\\\u0e01\\\\u0e09\\\\u0e31\\\\u0e19 \\\\u0e15...\", \"annot_utt\": \"\\\"\\\\u0e1b\\\\u0e25\\\\u0e38\\\\u0e01\\\\u0e09\\\\u0e31\\\\u0e19 \\\\u0e15...\", \"worker_id\": \"\\\"24\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"24\\\", \\\"35\\\", \\\"0\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", 
"columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. 
MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "tl-PH": {"config_name": "tl-PH", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"tl-PH\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"gisingin mo ako ng alas nuwebe ng umaga sa biyern...\", \"annot_utt\": \"\\\"gisingin mo ako ng [time : alas nuwebe ng umaga] ...\", \"worker_id\": \"\\\"17\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"19\\\", \\\"6\\\", \\\"1\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages 
with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "tr-TR": {"config_name": "tr-TR", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"tr-TR\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"beni cuma g\\\\u00fcn\\\\u00fc sabah dokuzda uyand\\\\u013...\", \"annot_utt\": \"\\\"beni [date : cuma] g\\\\u00fcn\\\\u00fc [time : sabah d...\", \"worker_id\": \"\\\"12\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"2\\\", \\\"4\\\", \\\"9\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": 
"judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ur-PK": {"config_name": "ur-PK", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ur-PK\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0645\\\\u062c\\\\u06be\\\\u06d2 \\\\u062c\\\\u0645\\\\u0639\\\\u06c1...\", \"annot_utt\": \"\\\"\\\\u0645\\\\u062c\\\\u06be\\\\u06d2 [date : \\\\u062c\\\\u0645\\\\u06...\", \"worker_id\": \"\\\"13\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"9\\\", \\\"13\\\", \\\"10\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 3]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", 
"judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "vi-VN": {"config_name": "vi-VN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"vi-VN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"g\\\\u1ecdi t\\\\u00f4i d\\\\u1eady l\\\\u00fac ch\\\\u00edn gi\\\\...\", \"annot_utt\": \"\\\"g\\\\u1ecdi t\\\\u00f4i d\\\\u1eady l\\\\u00fac [time : ch\\\\u0...\", \"worker_id\": \"\\\"36\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"3\\\", \\\"36\\\", \\\"37\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": 
"partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "zh-CN": {"config_name": "zh-CN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"zh-CN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u661f\\\\u671f\\\\u4e94\\\\u65e9\\\\u4e0a\\\\u4e5d\\\\u70b9\\\\u53eb\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u661f\\\\u671f\\\\u4e94] \\\\u65e9\\\\u4e0a [time : ...\", \"worker_id\": \"\\\"5\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"36\\\", \\\"4\\\", \\\"12\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", 
"judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. 
MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "zh-TW": {"config_name": "zh-TW", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"zh-TW\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u79ae\\\\u62dc\\\\u4e94\\\\u65e9\\\\u4e0a\\\\u4e5d\\\\u9ede\\\\u53eb\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u79ae\\\\u62dc\\\\u4e94] [time : \\\\u65e9\\\\u4e0a\\\\...\", \"worker_id\": \"\\\"49\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"6\\\", \\\"17\\\", \\\"44\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 3]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel 
dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "all": {"config_name": "all", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"mn-MN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u0430\\\\u0434 ...\", \"annot_utt\": \"\\\"[date : \\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u043...\", \"worker_id\": \"\\\"16\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"4\\\", \\\"2\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 3, 4]\", \"judgments.spelling_score\": \"[1, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": 
"judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "all_1.1": {"config_name": "all_1.1", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"mn-MN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u0430\\\\u0434 ...\", \"annot_utt\": \"\\\"[date : \\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u043...\", \"worker_id\": \"\\\"16\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"4\\\", \\\"2\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 3, 4]\", \"judgments.spelling_score\": \"[1, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", 
"slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}}, "tags": ["task_categories:text-classification", "task_ids:intent-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "multilinguality:af-ZA", "multilinguality:am-ET", "multilinguality:ar-SA", "multilinguality:az-AZ", "multilinguality:bn-BD", "multilinguality:ca-ES", "multilinguality:cy-GB", "multilinguality:da-DK", "multilinguality:de-DE", "multilinguality:el-GR", "multilinguality:en-US", "multilinguality:es-ES", "multilinguality:fa-IR", "multilinguality:fi-FI", "multilinguality:fr-FR", "multilinguality:he-IL", "multilinguality:hi-IN", "multilinguality:hu-HU", "multilinguality:hy-AM", "multilinguality:id-ID", "multilinguality:is-IS", "multilinguality:it-IT", "multilinguality:ja-JP", "multilinguality:jv-ID", "multilinguality:ka-GE", "multilinguality:km-KH", "multilinguality:kn-IN", "multilinguality:ko-KR", "multilinguality:lv-LV", "multilinguality:ml-IN", "multilinguality:mn-MN", "multilinguality:ms-MY", "multilinguality:my-MM", "multilinguality:nb-NO", "multilinguality:nl-NL", "multilinguality:pl-PL", "multilinguality:pt-PT", "multilinguality:ro-RO", "multilinguality:ru-RU", "multilinguality:sl-SL", "multilinguality:sq-AL", 
"multilinguality:sv-SE", "multilinguality:sw-KE", "multilinguality:ta-IN", "multilinguality:te-IN", "multilinguality:th-TH", "multilinguality:tl-PH", "multilinguality:tr-TR", "multilinguality:ur-PK", "multilinguality:vi-VN", "multilinguality:zh-CN", "multilinguality:zh-TW", "source_datasets:original", "natural-language-understanding"], "is_gated": false}, "aps/dynahate": {"dataset_name": "aps/dynahate", "description": "We present a human-and-model-in-the-loop process for dynamically generating datasets and training better performing and more robust hate detection models. We provide a new dataset of ~40,000 entries, generated and labelled by trained annotators over four rounds of dynamic data creation. It includes ~15,000 challenging perturbations and each hateful entry has fine-grained labels for the type and target of hate. Hateful entries make up 54% of the dataset, which is substantially higher than comparable datasets. We show that model performance is substantially improved using this approach. Models trained on later rounds of data collection perform better on test sets and are harder for annotators to trick. They also perform better on HATECHECK, a suite of functional tests for online hate detection. 
See https://arxiv.org/abs/2012.15761 for more details.", "downloads": 122, "configs": {"0.2.3": {"config_name": "0.2.3", "sample_row": "{\"acl.id\": \"\\\"acl1\\\"\", \"label\": \"1\", \"text\": \"\\\"dalits are lowlives\\\"\", \"X1\": \"1\", \"type\": \"\\\"notgiven\\\"\", \"target\": \"\\\"notgiven\\\"\", \"level\": \"\\\"original\\\"\", \"split\": \"\\\"train\\\"\", \"round.base\": \"1\", \"annotator\": \"\\\"GNZuCtwed3\\\"\", \"round\": \"\\\"1\\\"\", \"acl.id.matched\": \"\\\"NA\\\"\"}", "columns": ["acl_id", "label", "text", "X1", "type", "target", "level", "split", "round_base", "annotator", "round", "acl_id_matched"], "columns_mapping": {"acl.id": "acl_id", "label": "label", "text": "text", "X1": "X1", "type": "type", "target": "target", "level": "level", "split": "split", "round.base": "round_base", "annotator": "annotator", "round": "round", "acl.id.matched": "acl_id_matched"}, "dataset_description": "We present a human-and-model-in-the-loop process for dynamically generating datasets and training better performing and more robust hate detection models. We provide a new dataset of ~40,000 entries, generated and labelled by trained annotators over four rounds of dynamic data creation. It includes ~15,000 challenging perturbations and each hateful entry has fine-grained labels for the type and target of hate. Hateful entries make up 54% of the dataset, which is substantially higher than comparable datasets. We show that model performance is substantially improved using this approach. Models trained on later rounds of data collection perform better on test sets and are harder for annotators to trick. They also perform better on HATECHECK, a suite of functional tests for online hate detection. 
See https://arxiv.org/abs/2012.15761 for more details.\n", "dataset_name": "aps/dynahate"}, "0.2.2": {"config_name": "0.2.2", "sample_row": "{\"acl.id\": \"\\\"acl1\\\"\", \"label\": \"1\", \"text\": \"\\\"dalits are lowlives\\\"\", \"X1\": \"1\", \"type\": \"\\\"notgiven\\\"\", \"target\": \"\\\"notgiven\\\"\", \"level\": \"\\\"original\\\"\", \"split\": \"\\\"train\\\"\", \"round.base\": \"1\", \"annotator\": \"\\\"GNZuCtwed3\\\"\", \"round\": \"\\\"1\\\"\", \"acl.id.matched\": \"\\\"NA\\\"\"}", "columns": ["acl_id", "label", "text", "X1", "type", "target", "level", "split", "round_base", "annotator", "round", "acl_id_matched"], "columns_mapping": {"acl.id": "acl_id", "label": "label", "text": "text", "X1": "X1", "type": "type", "target": "target", "level": "level", "split": "split", "round.base": "round_base", "annotator": "annotator", "round": "round", "acl.id.matched": "acl_id_matched"}, "dataset_description": "We present a human-and-model-in-the-loop process for dynamically generating datasets and training better performing and more robust hate detection models. We provide a new dataset of ~40,000 entries, generated and labelled by trained annotators over four rounds of dynamic data creation. It includes ~15,000 challenging perturbations and each hateful entry has fine-grained labels for the type and target of hate. Hateful entries make up 54% of the dataset, which is substantially higher than comparable datasets. We show that model performance is substantially improved using this approach. Models trained on later rounds of data collection perform better on test sets and are harder for annotators to trick. They also perform better on HATECHECK, a suite of functional tests for online hate detection. 
See https://arxiv.org/abs/2012.15761 for more details.\n", "dataset_name": "aps/dynahate"}}, "tags": [], "is_gated": false}, "Filippo/osdg_cd": {"dataset_name": "Filippo/osdg_cd", "description": "The OSDG Community Dataset (OSDG-CD) is a public dataset of thousands of text excerpts, which were validated by approximately 1,000 OSDG Community Platform (OSDG-CP) citizen scientists from over 110 countries, with respect to the Sustainable Development Goals (SDGs).", "downloads": 33, "configs": {"main_config": {"config_name": "main_config", "sample_row": "{\"doi\": \"\\\"10.6027/9789289342698-7-en\\\"\", \"text_id\": \"\\\"00021941702cd84171ff33962197ca1f\\\"\", \"text\": \"\\\"From a gender perspective, Paulgaard points out t...\", \"sdg\": \"5\", \"label\": \"4\", \"labels_negative\": \"1\", \"labels_positive\": \"8\", \"agreement\": \"0.7777777777777778\"}", "columns": ["doi", "text_id", "text", "sdg", "label", "labels_negative", "labels_positive", "agreement"], "columns_mapping": {"doi": "doi", "text_id": "text_id", "text": "text", "sdg": "sdg", "label": "label", "labels_negative": "labels_negative", "labels_positive": "labels_positive", "agreement": "agreement"}, "dataset_description": "The OSDG Community Dataset (OSDG-CD) is a public dataset of thousands of text excerpts, which were validated by approximately 1,000 OSDG Community Platform (OSDG-CP) citizen scientists from over 110 countries, with respect to the Sustainable Development Goals (SDGs).\n", "dataset_name": "Filippo/osdg_cd"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en"], "is_gated": false}, "google/wit": {"dataset_name": "google/wit", "description": "Wikipedia-based Image Text (WIT) Dataset is a large multimodal multilingual dataset.\nWIT is composed of a curated set of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia 
languages.\nIts size enables WIT to be used as a pretraining dataset for multimodal machine learning models.", "downloads": 30, "configs": {"default": {"config_name": "default", "sample_row": "{\"language\": \"\\\"en\\\"\", \"page_url\": \"\\\"https://en.wikipedia.org/wiki/Oxydactylus\\\"\", \"image_url\": \"\\\"https://upload.wikimedia.org/wikipedia/commons/5/...\", \"page_title\": \"\\\"Oxydactylus\\\"\", \"section_title\": \"null\", \"hierarchical_section_title\": \"\\\"Oxydactylus\\\"\", \"caption_reference_description\": \"null\", \"caption_attribution_description\": \"\\\"English: Mounted skeleton of Oxydactylus longipes...\", \"caption_alt_text_description\": \"null\", \"mime_type\": \"\\\"image/jpeg\\\"\", \"original_height\": \"3564\", \"original_width\": \"2748\", \"is_main_image\": \"true\", \"attribution_passes_lang_id\": \"true\", \"page_changed_recently\": \"true\", \"context_page_description\": \"\\\"Oxydactylus is an extinct genus of camelid endemi...\", \"context_section_description\": \"\\\"Oxydactylus is an extinct genus of camelid endemi...\"}", "columns": ["language", "page_url", "image_url", "page_title", "section_title", "hierarchical_section_title", "caption_reference_description", "caption_attribution_description", "caption_alt_text_description", "mime_type", "original_height", "original_width", "is_main_image", "attribution_passes_lang_id", "page_changed_recently", "context_page_description", "context_section_description"], "columns_mapping": {"language": "language", "page_url": "page_url", "image_url": "image_url", "page_title": "page_title", "section_title": "section_title", "hierarchical_section_title": "hierarchical_section_title", "caption_reference_description": "caption_reference_description", "caption_attribution_description": "caption_attribution_description", "caption_alt_text_description": "caption_alt_text_description", "mime_type": "mime_type", "original_height": "original_height", "original_width": "original_width", 
"is_main_image": "is_main_image", "attribution_passes_lang_id": "attribution_passes_lang_id", "page_changed_recently": "page_changed_recently", "context_page_description": "context_page_description", "context_section_description": "context_section_description"}, "dataset_description": "Wikipedia-based Image Text (WIT) Dataset is a large multimodal multilingual dataset.\nWIT is composed of a curated set of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia languages.\nIts size enables WIT to be used as a pretraining dataset for multimodal machine learning models.\n", "dataset_name": "google/wit"}}, "tags": ["task_categories:text-retrieval", "task_categories:image-to-text", "task_ids:image-captioning", "annotations_creators:machine-generated", "multilinguality:multilingual", "source_datasets:original", "source_datasets:extended|wikipedia", "language:af", "language:ar", "language:ast", "language:azb", "language:be", "language:bg", "language:bn", "language:br", "language:ca", "language:cs", "language:cy", "language:da", "language:de", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:fi", "language:fr", "language:fy", "language:ga", "language:gl", "language:hr", "language:hu", "language:hy", "language:id", "language:it", "language:iw", "language:ja", "language:ka", "language:ko", "language:la", "language:lt", "language:lv", "language:mk", "language:ml", "language:ms", "language:nl", "language:nn", "language:no", "language:pl", "language:pt", "language:ro", "language:ru", "language:sk", "language:sl", "language:sr", "language:sv", "language:th", "language:tr", "language:uk", "language:ur", "language:vi", "language:vo", "language:zh"], "is_gated": false}, "arbml/masader": {"dataset_name": "arbml/masader", "description": "Masader is the largest public catalogue for Arabic NLP datasets, which consists of more than 200 datasets annotated with 25 attributes.", 
"downloads": 13, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"Name\": \"\\\"Shami\\\"\", \"Subsets\": \"[{\\\"Name\\\": \\\"Jordanian\\\", \\\"Dialect\\\": \\\"ar-JO: (Arabic ...\", \"HF Link\": \"\\\"https://huggingface.co/datasets/arbml/Shami\\\"\", \"Link\": \"\\\"https://github.com/GU-CLASP/shami-corpus\\\"\", \"License\": \"\\\"Apache-2.0\\\"\", \"Year\": \"2018\", \"Language\": \"\\\"ar\\\"\", \"Dialect\": \"\\\"ar-LEV: (Arabic(Levant))\\\"\", \"Domain\": \"\\\"social media\\\"\", \"Form\": \"\\\"text\\\"\", \"Collection Style\": \"\\\"crawling and annotation(other)\\\"\", \"Description\": \"\\\"the first Levantine Dialect Corpus (SDC) covering...\", \"Volume\": \"\\\"117,805\\\"\", \"Unit\": \"\\\"sentences\\\"\", \"Ethical Risks\": \"\\\"Medium\\\"\", \"Provider\": \"\\\"Multiple institutions \\\"\", \"Derived From\": \"\\\"nan\\\"\", \"Paper Title\": \"\\\"Shami: A Corpus of Levantine Arabic Dialects\\\"\", \"Paper Link\": \"\\\"https://aclanthology.org/L18-1576.pdf\\\"\", \"Script\": \"\\\"Arab\\\"\", \"Tokenized\": \"\\\"No\\\"\", \"Host\": \"\\\"GitHub\\\"\", \"Access\": \"\\\"Free\\\"\", \"Cost\": \"\\\"nan\\\"\", \"Test Split\": \"\\\"No\\\"\", \"Tasks\": \"\\\"dialect identification\\\"\", \"Venue Title\": \"\\\"LREC\\\"\", \"Citations\": \"\\\"25.0\\\"\", \"Venue Type\": \"\\\"conference\\\"\", \"Venue Name\": \"\\\"International Conference on Language Resources an...\", \"Authors\": \"\\\"Chatrine Qwaider,Motaz Saad,S. 
Chatzikyriakidis,S...\", \"Affiliations\": \"\\\",The Islamic University of Gaza,,\\\"\", \"Abstract\": \"\\\"Modern Standard Arabic (MSA) is the official lang...\", \"Added By\": \"\\\"nan\\\"\"}", "columns": ["Name", "Subsets", "HF Link", "Link", "License", "Year", "Language", "Dialect", "Domain", "Form", "Collection Style", "Description", "Volume", "Unit", "Ethical Risks", "Provider", "Derived From", "Paper Title", "Paper Link", "Script", "Tokenized", "Host", "Access", "Cost", "Test Split", "Tasks", "Venue Title", "Citations", "Venue Type", "Venue Name", "Authors", "Affiliations", "Abstract", "Added By"], "columns_mapping": {"Name": "Name", "Subsets": "Subsets", "HF Link": "HF Link", "Link": "Link", "License": "License", "Year": "Year", "Language": "Language", "Dialect": "Dialect", "Domain": "Domain", "Form": "Form", "Collection Style": "Collection Style", "Description": "Description", "Volume": "Volume", "Unit": "Unit", "Ethical Risks": "Ethical Risks", "Provider": "Provider", "Derived From": "Derived From", "Paper Title": "Paper Title", "Paper Link": "Paper Link", "Script": "Script", "Tokenized": "Tokenized", "Host": "Host", "Access": "Access", "Cost": "Cost", "Test Split": "Test Split", "Tasks": "Tasks", "Venue Title": "Venue Title", "Citations": "Citations", "Venue Type": "Venue Type", "Venue Name": "Venue Name", "Authors": "Authors", "Affiliations": "Affiliations", "Abstract": "Abstract", "Added By": "Added By"}, "dataset_description": "Masader is the largest public catalogue for Arabic NLP datasets, which consists of more than 200 datasets annotated with 25 attributes. 
\n", "dataset_name": "arbml/masader"}}, "tags": [], "is_gated": false}, "searle-j/kote": {"dataset_name": "searle-j/kote", "description": "50k Korean online comments labeled for 44 emotion categories.", "downloads": 42, "configs": {"dichotomized": {"config_name": "dichotomized", "sample_row": "{\"ID\": \"\\\"39087\\\"\", \"text\": \"\\\"\\\\ub0b4\\\\uac00 \\\\ud1b0\\\\ud589\\\\ud06c\\\\uc2a4\\\\ub97c \\\\uc88...\", \"labels\": \"[2, 13, 15, 16, 29, 39]\"}", "columns": ["ID", "text", "labels"], "columns_mapping": {"ID": "ID", "text": "text", "labels": "labels"}, "dataset_description": "50k Korean online comments labeled for 44 emotion categories.\n", "dataset_name": "searle-j/kote"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:kor"], "is_gated": false}, "taln-ls2n/kptimes": {"dataset_name": "taln-ls2n/kptimes", "description": "KPTimes benchmark dataset for keyphrase extraction an generation.", "downloads": 13, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"id\": \"\\\"ny0282969\\\"\", \"title\": \"\\\"For Donald Trump\\\\u2019s Big Speech, an Added Pres...\", \"abstract\": \"\\\"CLEVELAND \\\\u2014 Until Monday night, Donald J. 
Tr...\", \"keyphrases\": \"[\\\"Donald Trump\\\", \\\"Speeches\\\", \\\"Plagiarism\\\", \\\"Melani...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"R\\\", \\\"M\\\"]\", \"date\": \"\\\"2016/07/21\\\"\", \"categories\": \"[\\\"us\\\", \\\"politics\\\"]\"}", "columns": ["id", "title", "abstract", "keyphrases", "prmu", "date", "categories"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "keyphrases": "keyphrases", "prmu": "prmu", "date": "date", "categories": "categories"}, "dataset_description": "KPTimes benchmark dataset for keyphrase extraction an generation.\n", "dataset_name": "taln-ls2n/kptimes"}}, "tags": ["task_categories:text-generation", "annotations_creators:unknown", "multilinguality:monolingual", "language:en"], "is_gated": false}, "strombergnlp/rustance": {"dataset_name": "strombergnlp/rustance", "description": "This is a stance prediction dataset in Russian. The dataset contains comments on news articles,\nand rows are a comment, the title of the news article it responds to, and the stance of the comment\ntowards the article.", "downloads": 15, "configs": {"rustance": {"config_name": "rustance", "sample_row": "{\"id\": \"\\\"0\\\"\", \"text\": \"\\\"\\\\u0412\\\\u043e\\\\u043b\\\\u043a\\\\u0438, \\\\u0432\\\\u043e\\\\u043...\", \"title\": \"\\\"\\\\u041c\\\\u0438\\\\u043d\\\\u043e\\\\u0431\\\\u043e\\\\u0440\\\\u043e\\\\...\", \"stance\": \"3\"}", "columns": ["id", "text", "title", "stance"], "columns_mapping": {"id": "id", "text": "text", "title": "title", "stance": "stance"}, "dataset_description": "This is a stance prediction dataset in Russian. 
The dataset contains comments on news articles,\nand rows are a comment, the title of the news article it responds to, and the stance of the comment\ntowards the article.\n", "dataset_name": "strombergnlp/rustance"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ru", "stance-detection"], "is_gated": false}, "ccdv/WCEP-10": {"dataset_name": "ccdv/WCEP-10", "description": "WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. Xiao et al.\"", "downloads": 30, "configs": {"newline": {"config_name": "newline", "sample_row": "{\"document\": \"\\\"Rodrigo Duterte, the new president of the Philipp...\", \"summary\": \"\\\"The death toll in Rodrigo Duterte's war on drugs ...\"}", "columns": ["document", "summary"], "columns_mapping": {"document": "document", "summary": "summary"}, "dataset_description": "\n WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. 
Xiao et al.\"\n\n", "dataset_name": "ccdv/WCEP-10"}, "roberta": {"config_name": "roberta", "sample_row": "{\"document\": \"\\\"Rodrigo Duterte, the new president of the Philipp...\", \"summary\": \"\\\"The death toll in Rodrigo Duterte's war on drugs ...\"}", "columns": ["document", "summary"], "columns_mapping": {"document": "document", "summary": "summary"}, "dataset_description": "\n WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. Xiao et al.\"\n\n", "dataset_name": "ccdv/WCEP-10"}, "bert": {"config_name": "bert", "sample_row": "{\"document\": \"\\\"Rodrigo Duterte, the new president of the Philipp...\", \"summary\": \"\\\"The death toll in Rodrigo Duterte's war on drugs ...\"}", "columns": ["document", "summary"], "columns_mapping": {"document": "document", "summary": "summary"}, "dataset_description": "\n WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. Xiao et al.\"\n\n", "dataset_name": "ccdv/WCEP-10"}, "list": {"config_name": "list", "sample_row": "{\"document\": \"[\\\"Rodrigo Duterte, the new president of the Philip...\", \"summary\": \"\\\"The death toll in Rodrigo Duterte's war on drugs ...\"}", "columns": ["document", "summary"], "columns_mapping": {"document": "document", "summary": "summary"}, "dataset_description": "\n WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. 
Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. Xiao et al.\"\n\n", "dataset_name": "ccdv/WCEP-10"}}, "tags": ["task_categories:summarization", "task_categories:text2text-generation", "multilinguality:monolingual", "language:en", "conditional-text-generation"], "is_gated": false}, "strombergnlp/nordic_langid": {"dataset_name": "strombergnlp/nordic_langid", "description": "Automatic language identification is a challenging problem. Discriminating\nbetween closely related languages is especially difficult. This paper presents\na machine learning approach for automatic language identification for the\nNordic languages, which often suffer miscategorisation by existing \nstate-of-the-art tools. Concretely we will focus on discrimination between six \nNordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm\u00e5l), \nFaroese and Icelandic.\n\nThis is the data for the tasks. Two variants are provided: 10K and 50K, with\nholding 10,000 and 50,000 examples for each language respectively.", "downloads": 101, "configs": {"10k": {"config_name": "10k", "sample_row": "{\"id\": \"\\\"0\\\"\", \"sentence\": \"\\\"den ble gitt charter av ranulf de blondeville\\\"\", \"language\": \"2\"}", "columns": ["id", "sentence", "language"], "columns_mapping": {"id": "id", "sentence": "sentence", "language": "language"}, "dataset_description": "Automatic language identification is a challenging problem. Discriminating\nbetween closely related languages is especially difficult. This paper presents\na machine learning approach for automatic language identification for the\nNordic languages, which often suffer miscategorisation by existing \nstate-of-the-art tools. Concretely we will focus on discrimination between six \nNordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm\u00e5l), \nFaroese and Icelandic.\n\nThis is the data for the tasks. 
Two variants are provided: 10K and 50K, with\nholding 10,000 and 50,000 examples for each language respectively.\n\n", "dataset_name": "strombergnlp/nordic_langid"}, "50k": {"config_name": "50k", "sample_row": "{\"id\": \"\\\"0\\\"\", \"sentence\": \"\\\"jackson er anerkjent som den mest suksessrike art...\", \"language\": \"2\"}", "columns": ["id", "sentence", "language"], "columns_mapping": {"id": "id", "sentence": "sentence", "language": "language"}, "dataset_description": "Automatic language identification is a challenging problem. Discriminating\nbetween closely related languages is especially difficult. This paper presents\na machine learning approach for automatic language identification for the\nNordic languages, which often suffer miscategorisation by existing \nstate-of-the-art tools. Concretely we will focus on discrimination between six \nNordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm\u00e5l), \nFaroese and Icelandic.\n\nThis is the data for the tasks. Two variants are provided: 10K and 50K, with\nholding 10,000 and 50,000 examples for each language respectively.\n\n", "dataset_name": "strombergnlp/nordic_langid"}}, "tags": ["task_categories:text-classification", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:da", "language:nn", "language:nb", "language:fo", "language:is", "language:sv", "language-identification"], "is_gated": false}, "strombergnlp/bornholmsk_parallel": {"dataset_name": "strombergnlp/bornholmsk_parallel", "description": "This dataset is parallel text for Bornholmsk and Danish. 
\n\nFor more details, see the paper [Bornholmsk Natural Language Processing: Resources and Tools](https://aclanthology.org/W19-6138/).", "downloads": 116, "configs": {"BornholmskParallel": {"config_name": "BornholmskParallel", "sample_row": "{\"id\": \"\\\"0\\\"\", \"da_bornholm\": \"\\\"Hanj va ful \\\\u00e5 allera\\\"\", \"da\": \"\\\"Han var fuld af beundring\\\"\"}", "columns": ["id", "da_bornholm", "da"], "columns_mapping": {"id": "id", "da_bornholm": "da_bornholm", "da": "da"}, "dataset_description": "This dataset is parallel text for Bornholmsk and Danish. \n\nFor more details, see the paper [Bornholmsk Natural Language Processing: Resources and Tools](https://aclanthology.org/W19-6138/).\n", "dataset_name": "strombergnlp/bornholmsk_parallel"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original"], "is_gated": false}, "lmqg/qg_subjqa": {"dataset_name": "lmqg/qg_subjqa", "description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "downloads": 28, "configs": {"all": {"config_name": "all", "sample_row": "{\"answer\": \"\\\"any book that takes me 3 months and 20 different ...\", \"paragraph_question\": \"\\\"question: How is book?, context: I am giving \\\\\\\"Go...\", \"question\": \"\\\"How is book?\\\"\", \"sentence\": \"\\\"In my mind, any book that takes me 3 months and 2...\", \"paragraph\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"sentence_answer\": \"\\\"In my mind, any book that takes me 3 months ...\", \"paragraph_answer\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"paragraph_sentence\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"paragraph_id\": \"\\\"1b7cc3db9ec681edd253a41a2785b5a9\\\"\", \"question_subj_level\": \"2\", \"answer_subj_level\": \"2\", \"domain\": \"\\\"books\\\"\"}", "columns": ["answer", 
"paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "books": {"config_name": "books", "sample_row": "{\"answer\": \"\\\"any book that takes me 3 months and 20 different ...\", \"paragraph_question\": \"\\\"question: How is book?, context: I am giving \\\\\\\"Go...\", \"question\": \"\\\"How is book?\\\"\", \"sentence\": \"\\\"In my mind, any book that takes me 3 months and 2...\", \"paragraph\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"sentence_answer\": \"\\\"In my mind, any book that takes me 3 months ...\", \"paragraph_answer\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"paragraph_sentence\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"paragraph_id\": \"\\\"1b7cc3db9ec681edd253a41a2785b5a9\\\"\", \"question_subj_level\": \"2\", \"answer_subj_level\": \"2\", \"domain\": \"\\\"books\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", 
"sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "electronics": {"config_name": "electronics", "sample_row": "{\"answer\": \"\\\"the keyboard and its difficult to adjust the arm ...\", \"paragraph_question\": \"\\\"question: How would you describe the keyboard?, c...\", \"question\": \"\\\"How would you describe the keyboard?\\\"\", \"sentence\": \"\\\"First, when you try and open it you have to fight...\", \"paragraph\": \"\\\"The concept was good, the execution was terrible....\", \"sentence_answer\": \"\\\"First, when you try and open it you have to fight...\", \"paragraph_answer\": \"\\\"The concept was good, the execution was terrible....\", \"paragraph_sentence\": \"\\\"The concept was good, the execution was terrible....\", \"paragraph_id\": \"\\\"8d0cdd656a9e45b9acf198638711c4f6\\\"\", \"question_subj_level\": \"2\", \"answer_subj_level\": \"2\", \"domain\": \"\\\"electronics\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question 
generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "grocery": {"config_name": "grocery", "sample_row": "{\"answer\": \"\\\"I love a deep, bold coffee but don't like acidic ...\", \"paragraph_question\": \"\\\"question: How do you like the coffee?, context: I...\", \"question\": \"\\\"How do you like the coffee?\\\"\", \"sentence\": \"\\\"I love a deep, bold coffee but don't like acidic ...\", \"paragraph\": \"\\\"I usually like to grind my own beans but tried th...\", \"sentence_answer\": \"\\\" I love a deep, bold coffee but don't like ac...\", \"paragraph_answer\": \"\\\"I usually like to grind my own beans but tried th...\", \"paragraph_sentence\": \"\\\"I usually like to grind my own beans but tried th...\", \"paragraph_id\": \"\\\"bf7314a2f905b2b72c358bfe556200f4\\\"\", \"question_subj_level\": \"1\", \"answer_subj_level\": \"1\", \"domain\": \"\\\"grocery\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "movies": {"config_name": "movies", "sample_row": "{\"answer\": \"\\\"when this movie first came out\\\"\", \"paragraph_question\": \"\\\"question: Is this movie recommended?, context: To...\", \"question\": \"\\\"Is this movie recommended?\\\"\", \"sentence\": \"\\\"To be honest, when this movie first came out , I 
...\", \"paragraph\": \"\\\"To be honest, when this movie first came out, I r...\", \"sentence_answer\": \"\\\"To be honest, when this movie first came out...\", \"paragraph_answer\": \"\\\"To be honest, when this movie first came out...\", \"paragraph_sentence\": \"\\\" To be honest, when this movie first came out...\", \"paragraph_id\": \"\\\"5901dbf09ed091190bf05b54ce8d9d95\\\"\", \"question_subj_level\": \"1\", \"answer_subj_level\": \"1\", \"domain\": \"\\\"movies\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "restaurants": {"config_name": "restaurants", "sample_row": "{\"answer\": \"\\\"My wife's salad looked like it was fished out of ...\", \"paragraph_question\": \"\\\"question: Does this food stink others?, context: ...\", \"question\": \"\\\"Does this food stink others?\\\"\", \"sentence\": \"\\\"My wife's salad looked like it was fished out of ...\", \"paragraph\": \"\\\"We went here with our expectations quite low. Aft...\", \"sentence_answer\": \"\\\" My wife's salad looked like it was fished ou...\", \"paragraph_answer\": \"\\\"We went here with our expectations quite low. Aft...\", \"paragraph_sentence\": \"\\\"We went here with our expectations quite low. 
Aft...\", \"paragraph_id\": \"\\\"nQj2DGkomIWsKL6SRu8GGg\\\"\", \"question_subj_level\": \"1\", \"answer_subj_level\": \"1\", \"domain\": \"\\\"restaurants\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "tripadvisor": {"config_name": "tripadvisor", "sample_row": "{\"answer\": \"\\\"The lobby is Great\\\"\", \"paragraph_question\": \"\\\"question: How's the hotel lobby?, context: The lo...\", \"question\": \"\\\"How's the hotel lobby?\\\"\", \"sentence\": \"\\\"The lobby is Great , but it all ends there.\\\"\", \"paragraph\": \"\\\"The lobby is Great, but it all ends there. I was ...\", \"sentence_answer\": \"\\\" The lobby is Great , but it all ends th...\", \"paragraph_answer\": \"\\\" The lobby is Great , but it all ends the...\", \"paragraph_sentence\": \"\\\" The lobby is Great , but it all ends there. 
...\", \"paragraph_id\": \"\\\"tripadvisor_review_3303\\\"\", \"question_subj_level\": \"1\", \"answer_subj_level\": \"1\", \"domain\": \"\\\"tripadvisor\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:subjqa", "language:en", "question-generation"], "is_gated": false}, "ncats/EpiSet4NER-v2": {"dataset_name": "ncats/EpiSet4NER-v2", "description": "**REWRITE*\nEpiSet4NER-2 is a dataset generated from 620 rare disease abstracts labeled using statistical and rule-base methods. 
\nFor more details see *INSERT PAPER* and https://github.com/ncats/epi4GARD/tree/master/EpiExtract4GARD#epiextract4gard", "downloads": 11, "configs": {"EpiSet4NER": {"config_name": "EpiSet4NER", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Background\\\", \\\"Chemotherapy\\\", \\\"-\\\", \\\"induced\\\", \\\"ca...\", \"ner_tags\": \"[0, 1, 2, 2, 2, 0, 3, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "**REWRITE*\nEpiSet4NER-2 is a dataset generated from 620 rare disease abstracts labeled using statistical and rule-base methods. \nFor more details see *INSERT PAPER* and https://github.com/ncats/epi4GARD/tree/master/EpiExtract4GARD#epiextract4gard\n", "dataset_name": "ncats/EpiSet4NER-v2"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:machine-generated", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "epidemiology", "rare disease", "named entity recognition", "NER", "NIH"], "is_gated": false}, "strombergnlp/rumoureval_2019": {"dataset_name": "strombergnlp/rumoureval_2019", "description": "\nStance prediction task in English. The goal is to predict whether a given reply to a claim either supports, denies, questions, or simply comments on the claim. Ran as a SemEval task in 2019.", "downloads": 10, "configs": {"RumourEval2019": {"config_name": "RumourEval2019", "sample_row": "{\"id\": \"\\\"0\\\"\", \"source_text\": \"\\\"France: 10 people dead after shooting at HQ of sa...\", \"reply_text\": \"\\\"MT @euronews France: 10 dead after shooting at HQ...\", \"label\": \"3\"}", "columns": ["id", "source_text", "reply_text", "label"], "columns_mapping": {"id": "id", "source_text": "source_text", "reply_text": "reply_text", "label": "label"}, "dataset_description": "\nStance prediction task in English. 
The goal is to predict whether a given reply to a claim either supports, denies, questions, or simply comments on the claim. Ran as a SemEval task in 2019.\n", "dataset_name": "strombergnlp/rumoureval_2019"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en", "stance-detection"], "is_gated": false}, "HuggingFaceM4/webvid": {"dataset_name": "HuggingFaceM4/webvid", "description": "WebVid is a large-scale dataset of video clips with textual descriptions sourced from the web. The videos are diverse and rich in their content.", "downloads": 986, "configs": {"2M": {"config_name": "2M", "sample_row": "{\"videoid\": \"31353427\", \"name\": \"\\\"Merida, mexico - may 23, 2017: tourists are walki...\", \"page_dir\": \"\\\"016401_016450\\\"\", \"duration\": \"15\", \"contentUrl\": \"\\\"https://ak.picdn.net/shutterstock/videos/31353427...\"}", "columns": ["videoid", "dataset_name", "page_dir", "duration", "contentUrl"], "columns_mapping": {"videoid": "videoid", "dataset_name": "dataset_name", "page_dir": "page_dir", "duration": "duration", "contentUrl": "contentUrl"}, "dataset_description": "WebVid is a large-scale dataset of video clips with textual descriptions sourced from the web. 
The videos are diverse and rich in their content.\n", "dataset_name": "HuggingFaceM4/webvid"}, "10M": {"config_name": "10M", "sample_row": "{\"videoid\": \"21179416\", \"name\": \"\\\"Aerial shot winter forest\\\"\", \"page_dir\": \"\\\"006001_006050\\\"\", \"duration\": \"11\", \"contentUrl\": \"\\\"https://ak.picdn.net/shutterstock/videos/21179416...\"}", "columns": ["videoid", "dataset_name", "page_dir", "duration", "contentUrl"], "columns_mapping": {"videoid": "videoid", "dataset_name": "dataset_name", "page_dir": "page_dir", "duration": "duration", "contentUrl": "contentUrl"}, "dataset_description": "WebVid is a large-scale dataset of video clips with textual descriptions sourced from the web. The videos are diverse and rich in their content.\n", "dataset_name": "HuggingFaceM4/webvid"}}, "tags": [], "is_gated": false}, "HuggingFaceM4/vatex": {"dataset_name": "HuggingFaceM4/vatex", "description": "VATEX is a large-scale multilingual video description dataset, which contains over 41,250 videos and 825,000 captions\nin both English and Chinese. VATEX is characterized by the following major unique properties.\nFirst, it contains both English and Chinese descriptions at scale, which can support many multilingual studies\nthat are constrained by monolingual datasets. Secondly, VATEX has a high number of clip-sentence pairs\nwith each video clip annotated with multiple unique sentences, and every caption is unique in\nthe whole corpus. Third, VATEX contains more comprehensive yet representative video content,\ncovering 600 human activities in total. 
Furthermore, both the English and Chinese corpora in\nVATEX are lexically richer and thus allow more natural and diverse caption generation.", "downloads": 88, "configs": {"v1.1": {"config_name": "v1.1", "sample_row": "{\"videoID\": \"\\\"Ptf_2VRj-V0\\\"\", \"path\": \"\\\"https://www.youtube.com/watch?v=Ptf_2VRj-V0\\\"\", \"start\": \"122\", \"end\": \"132\", \"enCap\": \"[\\\"People wearing harnesses using ropes to climb up...\", \"chCap\": \"[\\\"\\\\u4e00\\\\u4e2a\\\\u5e26\\\\u7740\\\\u767d\\\\u8272\\\\u5b89\\\\u5168...\"}", "columns": ["videoID", "path", "start", "end", "enCap", "chCap"], "columns_mapping": {"videoID": "videoID", "path": "path", "start": "start", "end": "end", "enCap": "enCap", "chCap": "chCap"}, "dataset_description": "VATEX is a large-scale multilingual video description dataset, which contains over 41,250 videos and 825,000 captions\nin both English and Chinese. VATEX is characterized by the following major unique properties.\nFirst, it contains both English and Chinese descriptions at scale, which can support many multilingual studies\nthat are constrained by monolingual datasets. Secondly, VATEX has a high number of clip-sentence pairs\nwith each video clip annotated with multiple unique sentences, and every caption is unique in\nthe whole corpus. Third, VATEX contains more comprehensive yet representative video content,\ncovering 600 human activities in total. 
Furthermore, both the English and Chinese corpora in\nVATEX are lexically richer and thus allow more natural and diverse caption generation.\n", "dataset_name": "HuggingFaceM4/vatex"}, "v1.0": {"config_name": "v1.0", "sample_row": "{\"videoID\": \"\\\"Ptf_2VRj-V0\\\"\", \"path\": \"\\\"https://www.youtube.com/watch?v=Ptf_2VRj-V0\\\"\", \"start\": \"122\", \"end\": \"132\", \"enCap\": \"[\\\"People wearing harnesses using ropes to climb up...\", \"chCap\": \"[\\\"\\\\u4e00\\\\u4e2a\\\\u5e26\\\\u7740\\\\u767d\\\\u8272\\\\u5b89\\\\u5168...\"}", "columns": ["videoID", "path", "start", "end", "enCap", "chCap"], "columns_mapping": {"videoID": "videoID", "path": "path", "start": "start", "end": "end", "enCap": "enCap", "chCap": "chCap"}, "dataset_description": "VATEX is a large-scale multilingual video description dataset, which contains over 41,250 videos and 825,000 captions\nin both English and Chinese. VATEX is characterized by the following major unique properties.\nFirst, it contains both English and Chinese descriptions at scale, which can support many multilingual studies\nthat are constrained by monolingual datasets. Secondly, VATEX has a high number of clip-sentence pairs\nwith each video clip annotated with multiple unique sentences, and every caption is unique in\nthe whole corpus. Third, VATEX contains more comprehensive yet representative video content,\ncovering 600 human activities in total. 
Furthermore, both the English and Chinese corpora in\nVATEX are lexically richer and thus allow more natural and diverse caption generation.\n", "dataset_name": "HuggingFaceM4/vatex"}}, "tags": [], "is_gated": false}, "mwritescode/slither-audited-smart-contracts": {"dataset_name": "mwritescode/slither-audited-smart-contracts", "description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.", "downloads": 1980, "configs": {"all-plain-text": {"config_name": "all-plain-text", "sample_row": "{\"address\": \"\\\"0x006699d34AA3013605d468d2755A2Fe59A16B12B\\\"\", \"source_code\": \"\\\"pragma solidity 0.5.4;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n\\\\...\", \"bytecode\": \"\\\"0x608060405234801561001057600080fd5b5060043610610...\", \"slither\": \"\\\"{\\\\\\\"success\\\\\\\": true, \\\\\\\"error\\\\\\\": null, \\\\\\\"results\\\\\\\":...\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}, "all-multilabel": {"config_name": "all-multilabel", "sample_row": "{\"address\": \"\\\"0x006699d34AA3013605d468d2755A2Fe59A16B12B\\\"\", \"source_code\": \"\\\"pragma solidity 0.5.4;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n\\\\...\", \"bytecode\": \"\\\"0x608060405234801561001057600080fd5b5060043610610...\", \"slither\": \"[4]\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", 
"bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}, "big-plain-text": {"config_name": "big-plain-text", "sample_row": "{\"address\": \"\\\"0x006699d34AA3013605d468d2755A2Fe59A16B12B\\\"\", \"source_code\": \"\\\"pragma solidity 0.5.4;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n\\\\...\", \"bytecode\": \"\\\"0x608060405234801561001057600080fd5b5060043610610...\", \"slither\": \"\\\"{\\\\\\\"success\\\\\\\": true, \\\\\\\"error\\\\\\\": null, \\\\\\\"results\\\\\\\":...\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}, "big-multilabel": {"config_name": "big-multilabel", "sample_row": "{\"address\": \"\\\"0x006699d34AA3013605d468d2755A2Fe59A16B12B\\\"\", \"source_code\": \"\\\"pragma solidity 0.5.4;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n\\\\...\", \"bytecode\": \"\\\"0x608060405234801561001057600080fd5b5060043610610...\", \"slither\": \"[1]\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their 
vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}, "small-plain-text": {"config_name": "small-plain-text", "sample_row": "{\"address\": \"\\\"0x01b23286ff60a543ec29366ae8d6b6274ca20541\\\"\", \"source_code\": \"\\\"pragma solidity 0.4.26;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n...\", \"bytecode\": \"\\\"0x608060405260043610610112576000357c0100000000000...\", \"slither\": \"\\\"{\\\\\\\"success\\\\\\\": true, \\\\\\\"error\\\\\\\": null, \\\\\\\"results\\\\\\\":...\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}, "small-multilabel": {"config_name": "small-multilabel", "sample_row": "{\"address\": \"\\\"0x01b23286ff60a543ec29366ae8d6b6274ca20541\\\"\", \"source_code\": \"\\\"pragma solidity 0.4.26;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n...\", \"bytecode\": \"\\\"0x608060405260043610610112576000357c0100000000000...\", \"slither\": \"[6]\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}}, "tags": ["task_categories:text-classification", "task_categories:text-generation", 
"task_ids:multi-label-classification", "task_ids:multi-input-text-classification", "task_ids:language-modeling", "annotations_creators:other", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "wdc/products-2017": {"dataset_name": "wdc/products-2017", "description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.", "downloads": 265, "configs": {"computers_xlarge": {"config_name": "computers_xlarge", "sample_row": "{\"pair_id\": \"\\\"2551242#16272671\\\"\", \"label\": \"1\", \"id_left\": \"2551242\", \"category_left\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_left\": \"79457\", \"brand_left\": \"\\\"\\\\\\\"Corsair\\\\\\\"@en\\\"\", \"title_left\": \"\\\" \\\\\\\"Corsair Vengeance LPX Black 64GB (4x16GB) DDR4...\", \"description_left\": \"\\\"\\\\\\\"DDR4, 2666MHz, CL16, 1.2v, XMP 2.0, Lifetime Wa...\", \"price_left\": \"null\", \"specTableContent_left\": \"\\\" Memory Type DDR4 (PC4-21300) Capacity 64GB (4 x ...\", \"id_right\": \"16272671\", \"category_right\": 
\"\\\"Computers_and_Accessories\\\"\", \"cluster_id_right\": \"79457\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Corsair Vengeance LPX CMK64GX4M4A2666C16 - Pri...\", \"description_right\": \"null\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" Categorie Geheugen intern Merk Corsair Productse...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. 
The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "computers_large": {"config_name": "computers_large", "sample_row": "{\"pair_id\": \"\\\"10350670#11790323\\\"\", \"label\": \"0\", \"id_left\": \"10350670\", \"category_left\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_left\": \"95342\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"SilverStone ECM20 Adaptador PCIe a M.2\\\\\\\"@es M....\", \"description_left\": \"null\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"11790323\", \"category_right\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_right\": \"1450313\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Samsung 960 Pro 2TB - Prijzen \\\\\\\"@NL Tweakers\\\\\\\"...\", \"description_right\": \"null\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" Categorie Solid state drives Merk Samsung Produc...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": 
"brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "computers_medium": {"config_name": "computers_medium", "sample_row": "{\"pair_id\": \"\\\"14219585#11723285\\\"\", \"label\": \"0\", \"id_left\": \"14219585\", \"category_left\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_left\": \"521249\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Apple - Mac Pro Desktop Computer 6-Core Intel\\\\...\", \"description_left\": \"\\\"\\\\\\\"Apple Mac Pro MD878LL/A Desktop: Designed for p...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"11723285\", \"category_right\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_right\": \"9835048\", \"brand_right\": \"\\\"\\\\\\\"HP Enterprise\\\\\\\"\\\"\", \"title_right\": \"\\\" \\\\\\\"COMPAQ PL ML530R G3 Xeon 
3.0GHz 1GB\\\\\\\", \\\\\\\"Null\\\\...\", \"description_right\": \"\\\"\\\\\\\"Description:\\\\n271246-001 Proliant Xeon 3.0GHz ...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. 
The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "computers_small": {"config_name": "computers_small", "sample_row": "{\"pair_id\": \"\\\"15745640#14832469\\\"\", \"label\": \"0\", \"id_left\": \"15745640\", \"category_left\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_left\": \"914299\", \"brand_left\": \"\\\"\\\\\\\"HP Enterprise\\\\\\\"\\\"\", \"title_left\": \"\\\" \\\\\\\"631674-B21 HP Smart Array P421/2GB Controller\\\\...\", \"description_left\": \"\\\" \\\\\\\"Description:HP Smart Array P421/2GB FBWC 6Gb2-...\", \"price_left\": \"\\\"\\\\\\\"CAD\\\\\\\", \\\\\\\"$605.74 CAD\\\\\\\"\\\"\", \"specTableContent_left\": \"\\\" Specifications: Category Proliant Controller Sub...\", \"id_right\": \"14832469\", \"category_right\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_right\": \"4293688\", \"brand_right\": \"\\\"\\\\\\\"HP Enterprise\\\\\\\"\\\"\", \"title_right\": \"\\\" \\\\\\\"300680-B21 HP 2GB (2x1GB) 266MHz SDRAM Kit\\\\\\\", ...\", \"description_right\": \"\\\"\\\\\\\"Description:Genuine HPE 2GB (2x1GB) Registered ...\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" 300680-B21\\\\u00a0Compatible Servers: BL20p G2 BL3...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", 
"brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. 
The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "cameras_xlarge": {"config_name": "cameras_xlarge", "sample_row": "{\"pair_id\": \"\\\"11933246#14836018\\\"\", \"label\": \"0\", \"id_left\": \"11933246\", \"category_left\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_left\": \"1041874\", \"brand_left\": \"\\\"\\\\\\\"Canon\\\\\\\"@en\\\"\", \"title_left\": \"\\\" \\\\\\\"Canon EOS 5D Mark IV DSLR Camera with 24-105mm...\", \"description_left\": \"\\\"\\\\\\\"\\\\n30.4MP Full-Frame CMOS Sensor\\\\nEF 24-105mm f/...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"14836018\", \"category_right\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_right\": \"197207\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\" C\\\\u00e1mara CANON EOS 6D + Lente EF 24-105L \\\\...\", \"description_right\": \"\\\"\\\\\\\"\\\\n C\\\\u00e1mara CANON...\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" Marca CANON Megapixeles 20.2 MP TAMA\\\\u00d1O DE P...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": 
"id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "cameras_large": {"config_name": "cameras_large", "sample_row": "{\"pair_id\": \"\\\"16965715#5931545\\\"\", \"label\": \"1\", \"id_left\": \"16965715\", \"category_left\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_left\": \"9309675\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Veho VCC-005 MUVI HD NPNG Body Camera/Action C...\", \"description_left\": \"\\\"\\\\\\\"\\\\n\\\\tHD video at 30fps & Up to 8MP Stills\\\\n\\\\t170...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"5931545\", \"category_right\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_right\": \"9309675\", \"brand_right\": 
\"\\\"\\\\\\\"Veho\\\\\\\"@en-US\\\"\", \"title_right\": \"\\\" \\\\\\\"Veho VCC-005-MUVI-NPNG MUVI HD Mini Handsfree ...\", \"description_right\": \"\\\"\\\\\\\"Veho are pleased to announce the partnership wi...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. 
The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "cameras_medium": {"config_name": "cameras_medium", "sample_row": "{\"pair_id\": \"\\\"16965715#5931545\\\"\", \"label\": \"1\", \"id_left\": \"16965715\", \"category_left\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_left\": \"9309675\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Veho VCC-005 MUVI HD NPNG Body Camera/Action C...\", \"description_left\": \"\\\"\\\\\\\"\\\\n\\\\tHD video at 30fps & Up to 8MP Stills\\\\n\\\\t170...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"5931545\", \"category_right\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_right\": \"9309675\", \"brand_right\": \"\\\"\\\\\\\"Veho\\\\\\\"@en-US\\\"\", \"title_right\": \"\\\" \\\\\\\"Veho VCC-005-MUVI-NPNG MUVI HD Mini Handsfree ...\", \"description_right\": \"\\\"\\\\\\\"Veho are pleased to announce the partnership wi...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": 
"category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "cameras_small": {"config_name": "cameras_small", "sample_row": "{\"pair_id\": \"\\\"2900433#6082212\\\"\", \"label\": \"0\", \"id_left\": \"2900433\", \"category_left\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_left\": \"387759\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Transcend 64GB microSDXC UHS-I 300x, Class 10\\\\...\", \"description_left\": \"null\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"6082212\", \"category_right\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_right\": \"368922\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"TARJETA SD 32GB SDHC CLASE 10 300X\\\\\\\" 300X | Tr...\", 
\"description_right\": \"\\\"\\\\\\\"Tipolog\\\\u00eda: Secure Digital analogico; Capac...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. 
The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "watches_xlarge": {"config_name": "watches_xlarge", "sample_row": "{\"pair_id\": \"\\\"2679850#3297990\\\"\", \"label\": \"0\", \"id_left\": \"2679850\", \"category_left\": \"\\\"Luggage_and_Travel_Gear\\\"\", \"cluster_id_left\": \"1719439\", \"brand_left\": \"\\\"\\\\\\\"\\\\n\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\tPrada\\\\n\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t...\", \"title_left\": \"\\\" \\\\\\\" Prada Papaya Saffiano Lux Leather Parabole To...\", \"description_left\": \"\\\"\\\\\\\"\\\\n This stunning Prada Papaya Saffiano L...\", \"price_left\": \"null\", \"specTableContent_left\": \"\\\" Shipping Method Estimated Transit Time Fee per O...\", \"id_right\": \"3297990\", \"category_right\": \"\\\"Sports_and_Outdoors\\\"\", \"cluster_id_right\": \"1631615\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"TomTom Runner 2 Cardio Large Zwart (Zwart) - P...\", \"description_right\": \"null\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" Categorie Wearables Merk TomTom Product TomTom R...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": 
"description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "watches_large": {"config_name": "watches_large", "sample_row": "{\"pair_id\": \"\\\"50240#16579903\\\"\", \"label\": \"0\", \"id_left\": \"50240\", \"category_left\": \"\\\"Jewelry\\\"\", \"cluster_id_left\": \"8861668\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Bvlgari Bvlgari Watch BBL33WSPGD\\\\\\\" BBL37WSPG 1...\", \"description_left\": \"\\\"\\\\n\\\\nA fine watch makes a brilliant statement of...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"16579903\", \"category_right\": \"\\\"Jewelry\\\"\", \"cluster_id_right\": \"12081440\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Bvlgari Tubogas Watch SP35BDSDS.1T\\\\\\\" SP35BSPGD...\", \"description_right\": \"\\\"\\\\n\\\\nA fine watch makes a magnificent statement ...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. 
The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "watches_medium": {"config_name": "watches_medium", "sample_row": "{\"pair_id\": \"\\\"8902994#16287862\\\"\", \"label\": \"0\", \"id_left\": \"8902994\", \"category_left\": \"\\\"Jewelry\\\"\", \"cluster_id_left\": \"1084360\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Mens Visodate Automatic Watch \\\\\\\"@de \\\\\\\"Tissot T...\", \"description_left\": \"null\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"16287862\", \"category_right\": \"\\\"Jewelry\\\"\", \"cluster_id_right\": \"871120\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Tissot T048.417.27.057.01 T-Race\\\\\\\"@es \\\\\\\"Reloj ...\", \"description_right\": \"\\\"\\\\\\\"Reloj Tissot T-Sport T-Race\\\\u00a0T0484172705701...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", 
"title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "watches_small": {"config_name": "watches_small", "sample_row": "{\"pair_id\": \"\\\"17014053#13812379\\\"\", \"label\": \"0\", \"id_left\": \"17014053\", \"category_left\": \"\\\"Jewelry\\\"\", \"cluster_id_left\": \"11012750\", \"brand_left\": \"\\\"\\\\\\\"Rolex\\\\\\\"\\\"\", \"title_left\": \"\\\" \\\\\\\"Rolex Milgauss 116400 GV\\\\\\\" GV Watch | Watchfin...\", \"description_left\": \"\\\"\\\\\\\"This Rolex has undergone a thorough inspection ...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"13812379\", \"category_right\": \"\\\"Jewelry\\\"\", \"cluster_id_right\": \"3893035\", \"brand_right\": \"\\\"\\\\\\\"Cartier\\\\\\\"\\\"\", \"title_right\": \"\\\" \\\\\\\"Cartier Roadster W62004V3\\\\\\\" W62004V3 Watch | W...\", 
\"description_right\": \"\\\"\\\\\\\"This Cartier has undergone a thorough inspectio...\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" Name Monthly Repayment Total Amount Cost of Cred...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. 
The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "shoes_xlarge": {"config_name": "shoes_xlarge", "sample_row": "{\"pair_id\": \"\\\"9725423#5777153\\\"\", \"label\": \"0\", \"id_left\": \"9725423\", \"category_left\": \"\\\"Shoes\\\"\", \"cluster_id_left\": \"16023037\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Concave Volt + SG - Silver/Blue\\\\\\\"@en \\\\\\\" Concav...\", \"description_left\": \"\\\"\\\\\\\"\\\\n With a pu...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"5777153\", \"category_right\": \"\\\"Shoes\\\"\", \"cluster_id_right\": \"12588487\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Nike Tiempo Legend V FG - Blu Laguna/Bianco/Vo...\", \"description_right\": \"\\\"\\\\\\\"\\\\n Con una t...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", 
"description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "shoes_large": {"config_name": "shoes_large", "sample_row": "{\"pair_id\": \"\\\"1933376#3418973\\\"\", \"label\": \"0\", \"id_left\": \"1933376\", \"category_left\": \"\\\"Shoes\\\"\", \"cluster_id_left\": \"14154487\", \"brand_left\": \"\\\"\\\\\\\"Nike\\\\\\\"@en\\\"\", \"title_left\": \"\\\" \\\\\\\"Nike Air Max 90 Essential\\\\\\\"@en Essential Black...\", \"description_left\": \"\\\"\\\\\\\"Black/Wolf Grey-White\\\\n537384-053\\\\nFirst introd...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"3418973\", \"category_right\": \"\\\"Shoes\\\"\", \"cluster_id_right\": \"14153817\", \"brand_right\": \"\\\"\\\\\\\"Nike\\\\\\\"@en\\\"\", \"title_right\": \"\\\" \\\\\\\"Nike Air Max 90 Essential\\\\\\\"@en Essential Unive...\", \"description_right\": 
\"\\\"\\\\\\\"University Blue/Pure Platinum-Obsidian-White\\\\n5...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. 
The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "shoes_medium": {"config_name": "shoes_medium", "sample_row": "{\"pair_id\": \"\\\"8203003#16629600\\\"\", \"label\": \"0\", \"id_left\": \"8203003\", \"category_left\": \"\\\"Shoes\\\"\", \"cluster_id_left\": \"8515872\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"NIKE AIR MAX 90 ULTRA 2.0 LTR\\\\\\\"@pl \\\\\\\"Nowo\\\\u015...\", \"description_left\": \"\\\"\\\\\\\"Null\\\\\\\"@pl \\\"\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"16629600\", \"category_right\": \"\\\"Shoes\\\"\", \"cluster_id_right\": \"3222506\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Nike Sportswear Air Max 90 Ultra Moire - Czarn...\", \"description_right\": \"null\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": 
"description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "shoes_small": {"config_name": "shoes_small", "sample_row": "{\"pair_id\": \"\\\"5479787#15837383\\\"\", \"label\": \"1\", \"id_left\": \"5479787\", \"category_left\": \"\\\"Shoes\\\"\", \"cluster_id_left\": \"2569194\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Nike Flex 2016 Run - Nero/Bianco/Grigio\\\\\\\"@it \\\\...\", \"description_left\": \"null\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"15837383\", \"category_right\": \"\\\"Shoes\\\"\", \"cluster_id_right\": \"2569194\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Nike sneaker flex 2016 rn\\\\\\\"@en-gb \\\\\\\"Tudo para ...\", \"description_right\": \"null\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", 
"cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. 
The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}}, "tags": ["task_categories:text-classification", "annotations_creators:weak supervision", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "strombergnlp/x-stance": {"dataset_name": "strombergnlp/x-stance", "description": "The x-stance dataset contains more than 150 political questions, and 67k comments written by candidates on those questions. The comments are partly German, partly French and Italian. The data have been extracted from the Swiss voting advice platform Smartvote.", "downloads": 69, "configs": {"de": {"config_name": "de", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question\": \"\\\"Eine Volksinitiative fordert, dass die Gesamtfl\\\\u...\", \"comment\": \"\\\"Eine fixe Gr\\\\u00f6sse verbieten, ist das falsche ...\", \"label\": \"0\"}", "columns": ["id", "question", "comment", "label"], "columns_mapping": {"id": "id", "question": "question", "comment": "comment", "label": "label"}, "dataset_description": "The x-stance dataset contains more than 150 political questions, and 67k comments written by candidates on those questions. The comments are partly German, partly French and Italian. 
The data have been extracted from the Swiss voting advice platform Smartvote.\n", "dataset_name": "strombergnlp/x-stance"}, "fr": {"config_name": "fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question\": \"\\\"Seriez-vous favorable \\\\u00e0 ce que l'euthanasie ...\", \"comment\": \"\\\"C'est un sujet d\\\\u00e9licat, tout d\\\\u00e9pend de ...\", \"label\": \"1\"}", "columns": ["id", "question", "comment", "label"], "columns_mapping": {"id": "id", "question": "question", "comment": "comment", "label": "label"}, "dataset_description": "The x-stance dataset contains more than 150 political questions, and 67k comments written by candidates on those questions. The comments are partly German, partly French and Italian. The data have been extracted from the Swiss voting advice platform Smartvote.\n", "dataset_name": "strombergnlp/x-stance"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:crowdsourced", "multilinguality:multilingual", "language:de", "language:fr", "stance-detection"], "is_gated": false}, "WorkInTheDark/FairytaleQA": {"dataset_name": "WorkInTheDark/FairytaleQA", "description": "FairytaleQA dataset, an open-source dataset focusing on comprehension of narratives, targeting students from kindergarten to eighth grade. The FairytaleQA dataset is annotated by education experts based on an evidence-based theoretical framework. 
It consists of 10,580 explicit and implicit questions derived from 278 children-friendly stories, covering seven types of narrative elements or relations.", "downloads": 310, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"story_name\": \"\\\"three-dogs\\\"\", \"story_section\": \"\\\"once upon a time there was a king who went forth ...\", \"question\": \"\\\"why was there great rejoicing in the city and thr...\", \"answer1\": \"\\\"the people wished their king all that was good .\\\"...\", \"answer2\": \"\\\"\\\"\", \"local-or-sum\": \"\\\"local\\\"\", \"attribute\": \"\\\"causal relationship\\\"\", \"ex-or-im\": \"\\\"explicit\\\"\", \"ex-or-im2\": \"\\\"\\\"\"}", "columns": ["story_name", "story_section", "question", "answer1", "answer2", "local-or-sum", "attribute", "ex-or-im", "ex-or-im2"], "columns_mapping": {"story_name": "story_name", "story_section": "story_section", "question": "question", "answer1": "answer1", "answer2": "answer2", "local-or-sum": "local-or-sum", "attribute": "attribute", "ex-or-im": "ex-or-im", "ex-or-im2": "ex-or-im2"}, "dataset_description": "FairytaleQA dataset, an open-source dataset focusing on comprehension of narratives, targeting students from kindergarten to eighth grade. The FairytaleQA dataset is annotated by education experts based on an evidence-based theoretical framework. It consists of 10,580 explicit and implicit questions derived from 278 children-friendly stories, covering seven types of narrative elements or relations.\n", "dataset_name": "WorkInTheDark/FairytaleQA"}}, "tags": ["task_categories:question-answering", "task_categories:text-generation", "language:en", "education", "children education"], "is_gated": false}, "strombergnlp/nlpcc-stance": {"dataset_name": "strombergnlp/nlpcc-stance", "description": "This is a stance prediction dataset in Chinese.\nThe data is that from a shared task, stance detection in Chinese microblogs, in NLPCC-ICCPOL 2016. 
It covers Task A, a mandatory supervised task which detects stance towards five targets of interest with given labeled data.", "downloads": 25, "configs": {"task_a": {"config_name": "task_a", "sample_row": "{\"id\": \"\\\"0\\\"\", \"target\": \"\\\"IphoneSE\\\"\", \"text\": \"\\\"3\\\\u670831\\\\u65e5\\\\uff0c\\\\u82f9\\\\u679ciPhone SE\\\\u6b63\\\\...\", \"stance\": \"2\"}", "columns": ["id", "target", "text", "stance"], "columns_mapping": {"id": "id", "target": "target", "text": "text", "stance": "stance"}, "dataset_description": "This is a stance prediction dataset in Chinese.\nThe data is that from a shared task, stance detection in Chinese microblogs, in NLPCC-ICCPOL 2016. It covers Task A, a mandatory supervised task which detects stance towards five targets of interest with given labeled data. \n", "dataset_name": "strombergnlp/nlpcc-stance"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-analysis", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:zh", "stance-detection"], "is_gated": false}, "GEM/FairytaleQA": {"dataset_name": "GEM/FairytaleQA", "description": "\\\r\nThe FairytaleQA dataset focusing on narrative comprehension of kindergarten to eighth-grade students. Generated by educational experts based on an evidence-based theoretical framework, FairytaleQA consists of 10,580 explicit and implicit questions derived from 278 children-friendly stories, covering seven types of narrative elements or relations. 
This is for the Question Generation Task of FairytaleQA.", "downloads": 29, "configs": {"default": {"config_name": "default", "sample_row": "{\"story_name\": \"\\\"three-dogs\\\"\", \"content\": \"\\\"once upon a time there was a king who went forth ...\", \"answer\": \"\\\"the people wished their king all that was good .\\\"...\", \"question\": \"\\\"why was there great rejoicing in the city and thr...\", \"gem_id\": \"\\\"GEM-FairytaleQA-train-0\\\"\", \"target\": \"\\\"why was there great rejoicing in the city and thr...\", \"references\": \"[]\", \"local_or_sum\": \"\\\"local\\\"\", \"attribute\": \"\\\"causal relationship\\\"\", \"ex_or_im\": \"\\\"explicit\\\"\"}", "columns": ["story_name", "content", "answer", "question", "gem_id", "target", "references", "local_or_sum", "attribute", "ex_or_im"], "columns_mapping": {"story_name": "story_name", "content": "content", "answer": "answer", "question": "question", "gem_id": "gem_id", "target": "target", "references": "references", "local_or_sum": "local_or_sum", "attribute": "attribute", "ex_or_im": "ex_or_im"}, "dataset_description": "The FairytaleQA dataset focusing on narrative comprehension of kindergarten to eighth-grade students. Generated by educational experts based on an evidence-based theoretical framework, FairytaleQA consists of 10,580 explicit and implicit questions derived from 278 children-friendly stories, covering seven types of narrative elements or relations. This is for the Question Generation Task of FairytaleQA.\n", "dataset_name": "GEM/FairytaleQA"}}, "tags": ["task_categories:other", "annotations_creators:expert-created", "multilinguality:unknown", "source_datasets:original", "language:en", "question-generation"], "is_gated": false}, "strombergnlp/ans-stance": {"dataset_name": "strombergnlp/ans-stance", "description": "The dataset is a collection of news titles in arabic along with paraphrased and corrupted titles. The stance prediction version is a 3-class classification task. 
Data contains three columns: s1, s2, stance.", "downloads": 20, "configs": {"stance": {"config_name": "stance", "sample_row": "{\"id\": \"\\\"0\\\"\", \"s1\": \"\\\"\\\\u0647\\\\u062c\\\\u0648\\\\u0645 \\\\u0635\\\\u0627\\\\u0631\\\\u0648...\", \"s2\": \"\\\"\\\\u0647\\\\u062f\\\\u0648\\\\u0621 \\\\u0627\\\\u0644\\\\u0627\\\\u0634...\", \"stance\": \"0\"}", "columns": ["id", "s1", "s2", "stance"], "columns_mapping": {"id": "id", "s1": "s1", "s2": "s2", "stance": "stance"}, "dataset_description": "The dataset is a collection of news titles in arabic along with paraphrased and corrupted titles. The stance prediction version is a 3-class classification task. Data contains three columns: s1, s2, stance.\n", "dataset_name": "strombergnlp/ans-stance"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ar", "stance-detection"], "is_gated": false}, "launch/gov_report": {"dataset_name": "launch/gov_report", "description": "GovReport long document summarization dataset.\n\nThere are three configs:\n - plain_text: plain text document-to-summary pairs\n - plain_text_with_recommendations: plain text doucment-summary pairs, with \"What GAO recommends\" included in the summary\n - structure: data with section structure", "downloads": 1045, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"GAO_GAO-06-1085\\\"\", \"document\": \"\\\"Background The structure of the armed forces is b...\", \"summary\": \"\\\"As the Department of Defense (DOD) has expanded i...\"}", "columns": ["id", "document", "summary"], "columns_mapping": {"id": "id", "document": "document", "summary": "summary"}, "dataset_description": "GovReport long document summarization dataset.\n\nThere are three configs:\n - plain_text: plain text document-to-summary pairs\n - plain_text_with_recommendations: plain text doucment-summary pairs, with \"What GAO 
recommends\" included in the summary\n - structure: data with section structure\n", "dataset_name": "launch/gov_report"}, "plain_text_with_recommendations": {"config_name": "plain_text_with_recommendations", "sample_row": "{\"id\": \"\\\"GAO_GAO-06-1085\\\"\", \"document\": \"\\\"Background The structure of the armed forces is b...\", \"summary\": \"\\\"As the Department of Defense (DOD) has expanded i...\"}", "columns": ["id", "document", "summary"], "columns_mapping": {"id": "id", "document": "document", "summary": "summary"}, "dataset_description": "GovReport long document summarization dataset.\n\nThere are three configs:\n - plain_text: plain text document-to-summary pairs\n - plain_text_with_recommendations: plain text doucment-summary pairs, with \"What GAO recommends\" included in the summary\n - structure: data with section structure\n", "dataset_name": "launch/gov_report"}, "structure": {"config_name": "structure", "sample_row": "{\"id\": \"\\\"GAO_GAO-06-1085\\\"\", \"document_sections.title\": \"[\\\"Background\\\", \\\"DOD Has Established Force Health P...\", \"document_sections.paragraphs\": \"[\\\"The structure of the armed forces is based on th...\", \"document_sections.depth\": \"[1, 1, 2, 3, 3, 2, 2, 1, 1, 2, 2, 3, 3, 3, 2, 1, 1...\", \"summary_sections.title\": \"[\\\"Why GAO Did This Study\\\", \\\"What GAO Found\\\"]\", \"summary_sections.paragraphs\": \"[\\\"As the Department of Defense (DOD) has expanded ...\"}", "columns": ["id", "document_sections_title", "document_sections_paragraphs", "document_sections_depth", "summary_sections_title", "summary_sections_paragraphs"], "columns_mapping": {"id": "id", "document_sections.title": "document_sections_title", "document_sections.paragraphs": "document_sections_paragraphs", "document_sections.depth": "document_sections_depth", "summary_sections.title": "summary_sections_title", "summary_sections.paragraphs": "summary_sections_paragraphs"}, "dataset_description": "GovReport long document 
summarization dataset.\n\nThere are three configs:\n - plain_text: plain text document-to-summary pairs\n - plain_text_with_recommendations: plain text doucment-summary pairs, with \"What GAO recommends\" included in the summary\n - structure: data with section structure\n", "dataset_name": "launch/gov_report"}}, "tags": ["task_categories:summarization", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ekinakyurek/ftrace": {"dataset_name": "ekinakyurek/ftrace", "description": " Factual Tracing Dataset that contains queries and abstracts, and their corresponding ground truth.", "downloads": 91, "configs": {"abstracts": {"config_name": "abstracts", "sample_row": "{\"inputs_pretokenized\": \"\\\"The Austroasiatic languages, in recent classifica...\", \"targets_pretokenized\": \"\\\" Bangladesh\\\"\", \"masked_uri\": \"\\\"Q902\\\"\", \"masked_type\": \"\\\"object\\\"\", \"facts\": \"\\\"P31,Q25295,Q33199;P47,Q902,Q668;P47,Q837,Q668;P47...\", \"id\": \"\\\"3\\\"\", \"example_uris\": \"\\\"Q33199-0-Q902-Q668-0;Q33199-0-Q668-Q902-1\\\"\", \"page_uri\": \"\\\"Q33199\\\"\"}", "columns": ["inputs_pretokenized", "targets_pretokenized", "masked_uri", "masked_type", "facts", "id", "example_uris", "page_uri"], "columns_mapping": {"inputs_pretokenized": "inputs_pretokenized", "targets_pretokenized": "targets_pretokenized", "masked_uri": "masked_uri", "masked_type": "masked_type", "facts": "facts", "id": "id", "example_uris": "example_uris", "page_uri": "page_uri"}, "dataset_description": " Factual Tracing Dataset that contains queries and abstracts, and their corresponding ground truth.\n Abstracts based on TREx dataset.\n", "dataset_name": "ekinakyurek/ftrace"}, "queries": {"config_name": "queries", "sample_row": "{\"inputs_pretokenized\": \"\\\"Member of the Scottish Parliament is a legal term...\", \"targets_pretokenized\": \"\\\" Scotland\\\"\", \"uuid\": 
\"\\\"0eb8ef92-c539-4498-b845-0c7f6d415b71\\\"\", \"obj_uri\": \"\\\"Q22\\\"\", \"sub_uri\": \"\\\"Q1711695\\\"\", \"predicate_id\": \"\\\"P1001\\\"\", \"sub_surface\": \"\\\"Member of the Scottish Parliament\\\"\", \"obj_surface\": \"\\\"Scotland\\\"\"}", "columns": ["inputs_pretokenized", "targets_pretokenized", "uuid", "obj_uri", "sub_uri", "predicate_id", "sub_surface", "obj_surface"], "columns_mapping": {"inputs_pretokenized": "inputs_pretokenized", "targets_pretokenized": "targets_pretokenized", "uuid": "uuid", "obj_uri": "obj_uri", "sub_uri": "sub_uri", "predicate_id": "predicate_id", "sub_surface": "sub_surface", "obj_surface": "obj_surface"}, "dataset_description": " Factual Tracing Dataset that contains queries and abstracts, and their corresponding ground truth.\n Queries based on LAMA dataset.\n", "dataset_name": "ekinakyurek/ftrace"}}, "tags": ["task_ids:masked-language-modeling", "multilinguality:monolingual", "source_datasets:TRex", "source_datasets:Lama", "language:en"], "is_gated": false}, "GroNLP/divemt": {"dataset_name": "GroNLP/divemt", "description": "DivEMT is the first publicly available post-editing study of Neural Machine Translation (NMT) over a typologically diverse set of target languages. Using a strictly controlled setup, 18 professional translators were instructed to translate or post-edit the same set of English documents into Arabic, Dutch, Italian, Turkish, Ukrainian, and Vietnamese. 
During the process, their edits, keystrokes, editing times, pauses, and perceived effort were logged, enabling an in-depth, cross-lingual evaluation of NMT quality and its post-editing process.", "downloads": 32, "configs": {"warmup": {"config_name": "warmup", "sample_row": "{\"unit_id\": \"\\\"flores101-warmup-tur-1-ht-1\\\"\", \"flores_id\": \"163\", \"item_id\": \"\\\"flores101-warmup-11\\\"\", \"subject_id\": \"\\\"tur_t1\\\"\", \"lang_id\": \"\\\"tur\\\"\", \"doc_id\": \"1\", \"task_type\": \"\\\"ht\\\"\", \"translation_type\": \"\\\"ht\\\"\", \"src_len_chr\": \"189\", \"mt_len_chr\": \"NaN\", \"tgt_len_chr\": \"192\", \"src_len_wrd\": \"29\", \"mt_len_wrd\": \"NaN\", \"tgt_len_wrd\": \"27\", \"edit_time\": \"88.624\", \"k_total\": \"660\", \"k_letter\": \"472\", \"k_digit\": \"0\", \"k_white\": \"68\", \"k_symbol\": \"14\", \"k_nav\": \"72\", \"k_erase\": \"34\", \"k_copy\": \"0\", \"k_cut\": \"0\", \"k_paste\": \"0\", \"k_do\": \"0\", \"n_pause_geq_300\": \"37\", \"len_pause_geq_300\": \"728697\", \"n_pause_geq_1000\": \"14\", \"len_pause_geq_1000\": \"716844\", \"event_time\": \"761283\", \"num_annotations\": \"1\", \"last_modification_time\": \"1642600367\", \"n_insert\": \"NaN\", \"n_delete\": \"NaN\", \"n_substitute\": \"NaN\", \"n_shift\": \"NaN\", \"tot_shifted_words\": \"NaN\", \"tot_edits\": \"NaN\", \"hter\": \"NaN\", \"cer\": \"NaN\", \"bleu\": \"NaN\", \"chrf\": \"NaN\", \"time_s\": \"761.283\", \"time_m\": \"12.688\", \"time_h\": \"0.2115\", \"time_per_char\": \"4.028\", \"time_per_word\": \"26.2511\", \"key_per_char\": \"3.4921\", \"words_per_hour\": \"137.1369\", \"words_per_minute\": \"2.2856\", \"per_subject_visit_order\": \"1\", \"src_text\": \"\\\"In France, voting has traditionally been a low-te...\", \"mt_text\": \"\\\"nan\\\"\", \"tgt_text\": \"\\\"Fransa'da oy verme deneyimi geleneksel olarak pek...\", \"aligned_edit\": \"\\\"nan\\\"\", \"src_tokens\": \"[\\\"In\\\", \\\"France\\\", \\\",\\\", \\\"voting\\\", \\\"has\\\", 
\\\"traditiona...\", \"src_annotations.lemma\": \"[\\\"in\\\", \\\"France\\\", \\\",\\\", \\\"voting\\\", \\\"have\\\", \\\"tradition...\", \"src_annotations.upos\": \"[\\\"ADP\\\", \\\"PROPN\\\", \\\"PUNCT\\\", \\\"NOUN\\\", \\\"AUX\\\", \\\"ADV\\\", \\\"A...\", \"src_annotations.feats\": \"[\\\"\\\", \\\"Number=Sing\\\", \\\"\\\", \\\"Number=Sing\\\", \\\"Mood=Ind|N...\", \"src_annotations.head\": \"[\\\"2\\\", \\\"12\\\", \\\"12\\\", \\\"12\\\", \\\"12\\\", \\\"12\\\", \\\"12\\\", \\\"12\\\", \\\"1...\", \"src_annotations.deprel\": \"[\\\"case\\\", \\\"obl\\\", \\\"punct\\\", \\\"nsubj\\\", \\\"aux\\\", \\\"advmod\\\",...\", \"src_annotations.start_char\": \"[0, 3, 9, 11, 18, 22, 36, 41, 43, 46, 47, 52, 62, ...\", \"src_annotations.end_char\": \"[2, 9, 10, 17, 21, 35, 40, 42, 46, 47, 51, 62, 63,...\", \"src_annotations.ner\": \"[\\\"O\\\", \\\"S-GPE\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", ...\", \"mt_tokens\": \"[]\", \"mt_annotations.lemma\": \"[]\", \"mt_annotations.upos\": \"[]\", \"mt_annotations.feats\": \"[]\", \"mt_annotations.head\": \"[]\", \"mt_annotations.deprel\": \"[]\", \"mt_annotations.start_char\": \"[]\", \"mt_annotations.end_char\": \"[]\", \"mt_annotations.ner\": \"[]\", \"tgt_tokens\": \"[\\\"Fransa'da\\\", \\\"oy\\\", \\\"verme\\\", \\\"deneyimi\\\", \\\"geleneks...\", \"tgt_annotations.lemma\": \"[\\\"Fransa\\\", \\\"oy\\\", \\\"ver\\\", \\\"deneyim\\\", \\\"geleneksel\\\", \\\"...\", \"tgt_annotations.upos\": \"[\\\"PROPN\\\", \\\"NOUN\\\", \\\"VERB\\\", \\\"NOUN\\\", \\\"ADJ\\\", \\\"ADP\\\", \\\"A...\", \"tgt_annotations.feats\": \"[\\\"Case=Loc|Number=Sing|Person=3\\\", \\\"Case=Nom|Number...\", \"tgt_annotations.head\": \"[\\\"2\\\", \\\"4\\\", \\\"2\\\", \\\"11\\\", \\\"9\\\", \\\"5\\\", \\\"8\\\", \\\"9\\\", \\\"11\\\", \\\"1...\", \"tgt_annotations.deprel\": \"[\\\"nmod\\\", \\\"nmod:poss\\\", \\\"compound\\\", \\\"nsubj\\\", \\\"amod\\\",...\", \"tgt_annotations.start_char\": \"[0, 10, 13, 
19, 28, 39, 46, 50, 61, 69, 73, 81, 83...\", \"tgt_annotations.end_char\": \"[9, 12, 18, 27, 38, 45, 49, 60, 68, 72, 81, 82, 85...\", \"tgt_annotations.ner\": \"[\\\"S-LOCATION\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", ...\", \"src_wmt22_qe\": \"[]\", \"mt_wmt22_qe\": \"[]\"}", "columns": ["unit_id", "flores_id", "item_id", "subject_id", "lang_id", "doc_id", "task_type", "translation_type", "src_len_chr", "mt_len_chr", "tgt_len_chr", "src_len_wrd", "mt_len_wrd", "tgt_len_wrd", "edit_time", "k_total", "k_letter", "k_digit", "k_white", "k_symbol", "k_nav", "k_erase", "k_copy", "k_cut", "k_paste", "k_do", "n_pause_geq_300", "len_pause_geq_300", "n_pause_geq_1000", "len_pause_geq_1000", "event_time", "num_annotations", "last_modification_time", "n_insert", "n_delete", "n_substitute", "n_shift", "tot_shifted_words", "tot_edits", "hter", "cer", "bleu", "chrf", "time_s", "time_m", "time_h", "time_per_char", "time_per_word", "key_per_char", "words_per_hour", "words_per_minute", "per_subject_visit_order", "src_text", "mt_text", "tgt_text", "aligned_edit", "src_tokens", "src_annotations_lemma", "src_annotations_upos", "src_annotations_feats", "src_annotations_head", "src_annotations_deprel", "src_annotations_start_char", "src_annotations_end_char", "src_annotations_ner", "mt_tokens", "mt_annotations_lemma", "mt_annotations_upos", "mt_annotations_feats", "mt_annotations_head", "mt_annotations_deprel", "mt_annotations_start_char", "mt_annotations_end_char", "mt_annotations_ner", "tgt_tokens", "tgt_annotations_lemma", "tgt_annotations_upos", "tgt_annotations_feats", "tgt_annotations_head", "tgt_annotations_deprel", "tgt_annotations_start_char", "tgt_annotations_end_char", "tgt_annotations_ner", "src_wmt22_qe", "mt_wmt22_qe"], "columns_mapping": {"unit_id": "unit_id", "flores_id": "flores_id", "item_id": "item_id", "subject_id": "subject_id", "lang_id": "lang_id", "doc_id": "doc_id", "task_type": "task_type", "translation_type": 
"translation_type", "src_len_chr": "src_len_chr", "mt_len_chr": "mt_len_chr", "tgt_len_chr": "tgt_len_chr", "src_len_wrd": "src_len_wrd", "mt_len_wrd": "mt_len_wrd", "tgt_len_wrd": "tgt_len_wrd", "edit_time": "edit_time", "k_total": "k_total", "k_letter": "k_letter", "k_digit": "k_digit", "k_white": "k_white", "k_symbol": "k_symbol", "k_nav": "k_nav", "k_erase": "k_erase", "k_copy": "k_copy", "k_cut": "k_cut", "k_paste": "k_paste", "k_do": "k_do", "n_pause_geq_300": "n_pause_geq_300", "len_pause_geq_300": "len_pause_geq_300", "n_pause_geq_1000": "n_pause_geq_1000", "len_pause_geq_1000": "len_pause_geq_1000", "event_time": "event_time", "num_annotations": "num_annotations", "last_modification_time": "last_modification_time", "n_insert": "n_insert", "n_delete": "n_delete", "n_substitute": "n_substitute", "n_shift": "n_shift", "tot_shifted_words": "tot_shifted_words", "tot_edits": "tot_edits", "hter": "hter", "cer": "cer", "bleu": "bleu", "chrf": "chrf", "time_s": "time_s", "time_m": "time_m", "time_h": "time_h", "time_per_char": "time_per_char", "time_per_word": "time_per_word", "key_per_char": "key_per_char", "words_per_hour": "words_per_hour", "words_per_minute": "words_per_minute", "per_subject_visit_order": "per_subject_visit_order", "src_text": "src_text", "mt_text": "mt_text", "tgt_text": "tgt_text", "aligned_edit": "aligned_edit", "src_tokens": "src_tokens", "src_annotations.lemma": "src_annotations_lemma", "src_annotations.upos": "src_annotations_upos", "src_annotations.feats": "src_annotations_feats", "src_annotations.head": "src_annotations_head", "src_annotations.deprel": "src_annotations_deprel", "src_annotations.start_char": "src_annotations_start_char", "src_annotations.end_char": "src_annotations_end_char", "src_annotations.ner": "src_annotations_ner", "mt_tokens": "mt_tokens", "mt_annotations.lemma": "mt_annotations_lemma", "mt_annotations.upos": "mt_annotations_upos", "mt_annotations.feats": "mt_annotations_feats", "mt_annotations.head": 
"mt_annotations_head", "mt_annotations.deprel": "mt_annotations_deprel", "mt_annotations.start_char": "mt_annotations_start_char", "mt_annotations.end_char": "mt_annotations_end_char", "mt_annotations.ner": "mt_annotations_ner", "tgt_tokens": "tgt_tokens", "tgt_annotations.lemma": "tgt_annotations_lemma", "tgt_annotations.upos": "tgt_annotations_upos", "tgt_annotations.feats": "tgt_annotations_feats", "tgt_annotations.head": "tgt_annotations_head", "tgt_annotations.deprel": "tgt_annotations_deprel", "tgt_annotations.start_char": "tgt_annotations_start_char", "tgt_annotations.end_char": "tgt_annotations_end_char", "tgt_annotations.ner": "tgt_annotations_ner", "src_wmt22_qe": "src_wmt22_qe", "mt_wmt22_qe": "mt_wmt22_qe"}, "dataset_description": "DivEMT is the first publicly available post-editing study of Neural Machine Translation (NMT) over a typologically diverse set of target languages. Using a strictly controlled setup, 18 professional translators were instructed to translate or post-edit the same set of English documents into Arabic, Dutch, Italian, Turkish, Ukrainian, and Vietnamese. 
During the process, their edits, keystrokes, editing times, pauses, and perceived effort were logged, enabling an in-depth, cross-lingual evaluation of NMT quality and its post-editing process.\n", "dataset_name": "GroNLP/divemt"}, "main": {"config_name": "main", "sample_row": "{\"unit_id\": \"\\\"flores101-main-tur-1-ht-1\\\"\", \"flores_id\": \"205\", \"item_id\": \"\\\"flores101-main-11\\\"\", \"subject_id\": \"\\\"tur_t1\\\"\", \"lang_id\": \"\\\"tur\\\"\", \"doc_id\": \"1\", \"task_type\": \"\\\"ht\\\"\", \"translation_type\": \"\\\"ht\\\"\", \"src_len_chr\": \"155\", \"mt_len_chr\": \"NaN\", \"tgt_len_chr\": \"147\", \"src_len_wrd\": \"25\", \"mt_len_wrd\": \"NaN\", \"tgt_len_wrd\": \"18\", \"edit_time\": \"100.306\", \"k_total\": \"260\", \"k_letter\": \"181\", \"k_digit\": \"4\", \"k_white\": \"26\", \"k_symbol\": \"5\", \"k_nav\": \"20\", \"k_erase\": \"24\", \"k_copy\": \"0\", \"k_cut\": \"0\", \"k_paste\": \"0\", \"k_do\": \"0\", \"n_pause_geq_300\": \"34\", \"len_pause_geq_300\": \"66281\", \"n_pause_geq_1000\": \"10\", \"len_pause_geq_1000\": \"52888\", \"event_time\": \"100306\", \"num_annotations\": \"1\", \"last_modification_time\": \"1643314612\", \"n_insert\": \"NaN\", \"n_delete\": \"NaN\", \"n_substitute\": \"NaN\", \"n_shift\": \"NaN\", \"tot_shifted_words\": \"NaN\", \"tot_edits\": \"NaN\", \"hter\": \"NaN\", \"cer\": \"NaN\", \"bleu\": \"NaN\", \"chrf\": \"NaN\", \"time_s\": \"100.306\", \"time_m\": \"1.6718\", \"time_h\": \"0.0279\", \"time_per_char\": \"0.6471\", \"time_per_word\": \"4.0122\", \"key_per_char\": \"1.6774\", \"words_per_hour\": \"897.2544\", \"words_per_minute\": \"14.9542\", \"per_subject_visit_order\": \"1\", \"src_text\": \"\\\"UN peacekeepers, whom arrived in Haiti after the ...\", \"mt_text\": \"\\\"nan\\\"\", \"tgt_text\": \"\\\"2010 depreminden sonra Haiti'ye giden BM arabuluc...\", \"aligned_edit\": \"\\\"nan\\\"\", \"src_tokens\": \"[\\\"UN\\\", \\\"peacekeepers\\\", \\\",\\\", \\\"whom\\\", \\\"arrived\\\", 
\\\"in...\", \"src_annotations.lemma\": \"[\\\"UN\\\", \\\"peacekeeper\\\", \\\",\\\", \\\"whom\\\", \\\"arrive\\\", \\\"in\\\",...\", \"src_annotations.upos\": \"[\\\"PROPN\\\", \\\"NOUN\\\", \\\"PUNCT\\\", \\\"PRON\\\", \\\"VERB\\\", \\\"ADP\\\", ...\", \"src_annotations.feats\": \"[\\\"Number=Sing\\\", \\\"Number=Plur\\\", \\\"\\\", \\\"PronType=Rel\\\",...\", \"src_annotations.head\": \"[\\\"2\\\", \\\"15\\\", \\\"5\\\", \\\"5\\\", \\\"2\\\", \\\"7\\\", \\\"5\\\", \\\"11\\\", \\\"11\\\", \\\"...\", \"src_annotations.deprel\": \"[\\\"compound\\\", \\\"nsubj:pass\\\", \\\"punct\\\", \\\"nsubj\\\", \\\"acl:...\", \"src_annotations.start_char\": \"[0, 3, 15, 17, 22, 30, 33, 39, 45, 49, 54, 64, 66,...\", \"src_annotations.end_char\": \"[2, 15, 16, 21, 29, 32, 38, 44, 48, 53, 64, 65, 69...\", \"src_annotations.ner\": \"[\\\"S-ORG\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"S-GPE\\\", \\\"O\\\", \\\"...\", \"mt_tokens\": \"[]\", \"mt_annotations.lemma\": \"[]\", \"mt_annotations.upos\": \"[]\", \"mt_annotations.feats\": \"[]\", \"mt_annotations.head\": \"[]\", \"mt_annotations.deprel\": \"[]\", \"mt_annotations.start_char\": \"[]\", \"mt_annotations.end_char\": \"[]\", \"mt_annotations.ner\": \"[]\", \"tgt_tokens\": \"[\\\"2010\\\", \\\"depreminden\\\", \\\"sonra\\\", \\\"Haiti'ye\\\", \\\"gide...\", \"tgt_annotations.lemma\": \"[\\\"2010\\\", \\\"deprem\\\", \\\"sonra\\\", \\\"Haiti\\\", \\\"git\\\", \\\"Bm\\\", ...\", \"tgt_annotations.upos\": \"[\\\"NUM\\\", \\\"NOUN\\\", \\\"ADP\\\", \\\"PROPN\\\", \\\"VERB\\\", \\\"NOUN\\\", \\\"N...\", \"tgt_annotations.feats\": \"[\\\"Case=Nom|NumType=Card|Number=Sing|Person=3\\\", \\\"Ca...\", \"tgt_annotations.head\": \"[\\\"5\\\", \\\"1\\\", \\\"2\\\", \\\"5\\\", \\\"7\\\", \\\"7\\\", \\\"10\\\", \\\"9\\\", \\\"10\\\", \\\"1...\", \"tgt_annotations.deprel\": \"[\\\"nummod\\\", \\\"flat\\\", \\\"case\\\", \\\"obl\\\", \\\"acl\\\", \\\"nmod:pos...\", \"tgt_annotations.start_char\": \"[0, 5, 17, 23, 32, 38, 
41, 58, 67, 75, 85, 94, 102...\", \"tgt_annotations.end_char\": \"[4, 16, 22, 31, 37, 40, 57, 66, 74, 84, 93, 101, 1...\", \"tgt_annotations.ner\": \"[\\\"S-TIME\\\", \\\"O\\\", \\\"O\\\", \\\"S-LOCATION\\\", \\\"O\\\", \\\"S-ORGANIZ...\", \"src_wmt22_qe\": \"[]\", \"mt_wmt22_qe\": \"[]\"}", "columns": ["unit_id", "flores_id", "item_id", "subject_id", "lang_id", "doc_id", "task_type", "translation_type", "src_len_chr", "mt_len_chr", "tgt_len_chr", "src_len_wrd", "mt_len_wrd", "tgt_len_wrd", "edit_time", "k_total", "k_letter", "k_digit", "k_white", "k_symbol", "k_nav", "k_erase", "k_copy", "k_cut", "k_paste", "k_do", "n_pause_geq_300", "len_pause_geq_300", "n_pause_geq_1000", "len_pause_geq_1000", "event_time", "num_annotations", "last_modification_time", "n_insert", "n_delete", "n_substitute", "n_shift", "tot_shifted_words", "tot_edits", "hter", "cer", "bleu", "chrf", "time_s", "time_m", "time_h", "time_per_char", "time_per_word", "key_per_char", "words_per_hour", "words_per_minute", "per_subject_visit_order", "src_text", "mt_text", "tgt_text", "aligned_edit", "src_tokens", "src_annotations_lemma", "src_annotations_upos", "src_annotations_feats", "src_annotations_head", "src_annotations_deprel", "src_annotations_start_char", "src_annotations_end_char", "src_annotations_ner", "mt_tokens", "mt_annotations_lemma", "mt_annotations_upos", "mt_annotations_feats", "mt_annotations_head", "mt_annotations_deprel", "mt_annotations_start_char", "mt_annotations_end_char", "mt_annotations_ner", "tgt_tokens", "tgt_annotations_lemma", "tgt_annotations_upos", "tgt_annotations_feats", "tgt_annotations_head", "tgt_annotations_deprel", "tgt_annotations_start_char", "tgt_annotations_end_char", "tgt_annotations_ner", "src_wmt22_qe", "mt_wmt22_qe"], "columns_mapping": {"unit_id": "unit_id", "flores_id": "flores_id", "item_id": "item_id", "subject_id": "subject_id", "lang_id": "lang_id", "doc_id": "doc_id", "task_type": "task_type", "translation_type": "translation_type", "src_len_chr": 
"src_len_chr", "mt_len_chr": "mt_len_chr", "tgt_len_chr": "tgt_len_chr", "src_len_wrd": "src_len_wrd", "mt_len_wrd": "mt_len_wrd", "tgt_len_wrd": "tgt_len_wrd", "edit_time": "edit_time", "k_total": "k_total", "k_letter": "k_letter", "k_digit": "k_digit", "k_white": "k_white", "k_symbol": "k_symbol", "k_nav": "k_nav", "k_erase": "k_erase", "k_copy": "k_copy", "k_cut": "k_cut", "k_paste": "k_paste", "k_do": "k_do", "n_pause_geq_300": "n_pause_geq_300", "len_pause_geq_300": "len_pause_geq_300", "n_pause_geq_1000": "n_pause_geq_1000", "len_pause_geq_1000": "len_pause_geq_1000", "event_time": "event_time", "num_annotations": "num_annotations", "last_modification_time": "last_modification_time", "n_insert": "n_insert", "n_delete": "n_delete", "n_substitute": "n_substitute", "n_shift": "n_shift", "tot_shifted_words": "tot_shifted_words", "tot_edits": "tot_edits", "hter": "hter", "cer": "cer", "bleu": "bleu", "chrf": "chrf", "time_s": "time_s", "time_m": "time_m", "time_h": "time_h", "time_per_char": "time_per_char", "time_per_word": "time_per_word", "key_per_char": "key_per_char", "words_per_hour": "words_per_hour", "words_per_minute": "words_per_minute", "per_subject_visit_order": "per_subject_visit_order", "src_text": "src_text", "mt_text": "mt_text", "tgt_text": "tgt_text", "aligned_edit": "aligned_edit", "src_tokens": "src_tokens", "src_annotations.lemma": "src_annotations_lemma", "src_annotations.upos": "src_annotations_upos", "src_annotations.feats": "src_annotations_feats", "src_annotations.head": "src_annotations_head", "src_annotations.deprel": "src_annotations_deprel", "src_annotations.start_char": "src_annotations_start_char", "src_annotations.end_char": "src_annotations_end_char", "src_annotations.ner": "src_annotations_ner", "mt_tokens": "mt_tokens", "mt_annotations.lemma": "mt_annotations_lemma", "mt_annotations.upos": "mt_annotations_upos", "mt_annotations.feats": "mt_annotations_feats", "mt_annotations.head": "mt_annotations_head", "mt_annotations.deprel": 
"mt_annotations_deprel", "mt_annotations.start_char": "mt_annotations_start_char", "mt_annotations.end_char": "mt_annotations_end_char", "mt_annotations.ner": "mt_annotations_ner", "tgt_tokens": "tgt_tokens", "tgt_annotations.lemma": "tgt_annotations_lemma", "tgt_annotations.upos": "tgt_annotations_upos", "tgt_annotations.feats": "tgt_annotations_feats", "tgt_annotations.head": "tgt_annotations_head", "tgt_annotations.deprel": "tgt_annotations_deprel", "tgt_annotations.start_char": "tgt_annotations_start_char", "tgt_annotations.end_char": "tgt_annotations_end_char", "tgt_annotations.ner": "tgt_annotations_ner", "src_wmt22_qe": "src_wmt22_qe", "mt_wmt22_qe": "mt_wmt22_qe"}, "dataset_description": "DivEMT is the first publicly available post-editing study of Neural Machine Translation (NMT) over a typologically diverse set of target languages. Using a strictly controlled setup, 18 professional translators were instructed to translate or post-edit the same set of English documents into Arabic, Dutch, Italian, Turkish, Ukrainian, and Vietnamese. During the process, their edits, keystrokes, editing times, pauses, and perceived effort were logged, enabling an in-depth, cross-lingual evaluation of NMT quality and its post-editing process.\n", "dataset_name": "GroNLP/divemt"}}, "tags": ["task_categories:translation", "annotations_creators:machine-generated", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:en", "language:it", "language:vi", "language:nl", "language:uk", "language:tr", "language:ar"], "is_gated": false}, "mteb/amazon_reviews_multi": {"dataset_name": "mteb/amazon_reviews_multi", "description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. 
Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.", "downloads": 2978, "configs": {"all_languages": {"config_name": "all_languages", "sample_row": "{\"id\": \"\\\"de_0203609\\\"\", \"text\": \"\\\"Leider nach 1 Jahr kaputt\\\\n\\\\nArmband ist leider n...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) 
The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "de": {"config_name": "de", "sample_row": "{\"id\": \"\\\"de_0203609\\\"\", \"text\": \"\\\"Leider nach 1 Jahr kaputt\\\\n\\\\nArmband ist leider n...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. 
The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "en": {"config_name": "en", "sample_row": "{\"id\": \"\\\"en_0964290\\\"\", \"text\": \"\\\"I'll spend twice the amount of time boxing up the...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. 
All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "es": {"config_name": "es", "sample_row": "{\"id\": \"\\\"es_0491108\\\"\", \"text\": \"\\\"television Nevir\\\\n\\\\nNada bueno se me fue ka panta...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. 
reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "fr": {"config_name": "fr", "sample_row": "{\"id\": \"\\\"fr_0424335\\\"\", \"text\": \"\\\"Brumisateur \\\\u00e0 pompe\\\\n\\\\nA d\\\\u00e9conseiller -...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. 
(2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "ja": {"config_name": "ja", "sample_row": "{\"id\": \"\\\"ja_0388536\\\"\", \"text\": \"\\\"\\\\u672c\\\\u9769\\\\u3067\\\\u3082\\\\u9632\\\\u6c34\\\\u3067\\\\u3082\\\\...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. 
(2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "zh": {"config_name": "zh", "sample_row": "{\"id\": \"\\\"zh_0626061\\\"\", \"text\": \"\\\"\\\\u6b64\\\\u4e66\\\\u4e0d\\\\u662f\\\\u672c\\\\u4eba\\\\u8d2d\\\\u4e70\\\\...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. 
(2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}}, "tags": ["language:de", "language:en", "language:es", "language:fr", "language:ja", "language:zh"], "is_gated": false}, "silver/lccc": {"dataset_name": "silver/lccc", "description": "LCCC: Large-scale Cleaned Chinese Conversation corpus (LCCC) is a large corpus of Chinese conversations.\nA rigorous data cleaning pipeline is designed to ensure the quality of the corpus.\nThis pipeline involves a set of rules and several classifier-based filters.\nNoises such as offensive or sensitive words, special symbols, emojis,\ngrammatically incorrect sentences, and incoherent conversations are filtered.", "downloads": 14, "configs": {"large": {"config_name": "large", "sample_row": "{\"dialog\": \"[\\\"\\\\u706b\\\\u9505 \\\\u6211 \\\\u5728 \\\\u91cd\\\\u5e86 \\\\u6210\\\\u...\"}", "columns": ["dialog"], "columns_mapping": {"dialog": "dialog"}, "dataset_description": "LCCC: Large-scale Cleaned Chinese Conversation corpus (LCCC) is a large corpus of Chinese conversations.\nA rigorous data cleaning pipeline is designed to ensure the quality of the corpus.\nThis pipeline involves a set of rules and several classifier-based filters.\nNoises such as offensive or sensitive words, special symbols, emojis,\ngrammatically incorrect sentences, and incoherent conversations are filtered.\n", "dataset_name": "silver/lccc"}, "base": {"config_name": "base", "sample_row": "{\"dialog\": \"[\\\"\\\\u4f60 \\\\u53bb \\\\u90a3\\\\u513f \\\\u7adf\\\\u7136 \\\\u4e0d\\\\u...\"}", "columns": ["dialog"], "columns_mapping": {"dialog": "dialog"}, "dataset_description": "LCCC: Large-scale Cleaned Chinese Conversation corpus (LCCC) is a large corpus of Chinese conversations.\nA rigorous data cleaning pipeline is designed to ensure the quality of the corpus.\nThis pipeline involves a set of rules and several classifier-based 
filters.\nNoises such as offensive or sensitive words, special symbols, emojis,\ngrammatically incorrect sentences, and incoherent conversations are filtered.\n", "dataset_name": "silver/lccc"}}, "tags": ["task_categories:conversational", "task_ids:dialogue-generation", "annotations_creators:other", "multilinguality:monolingual", "source_datasets:original", "language:zh", "dialogue-response-retrieval"], "is_gated": false}, "enwik8": {"dataset_name": "enwik8", "description": "The dataset is based on the Hutter Prize (http://prize.hutter1.net) and contains the first 10^8 bytes of English Wikipedia in 2006 in XML", "downloads": 6262, "configs": {"enwik8": {"config_name": "enwik8", "sample_row": "{\"text\": \"\\\" It is a one-person job , no problem.\\\"...\", \"paragraph_answer\": \"\\\"This cabinet is very easy to assemble. It says &#...\", \"paragraph_sentence\": \"\\\"This cabinet is very easy to assemble. It says &#...\", \"paragraph_id\": \"\\\"5dd4d824cc027a086d65fde6\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_squadshifts"}, "new_wiki": {"config_name": "new_wiki", "sample_row": "{\"answer\": \"\\\"Edison Electric Illuminating Company\\\"\", \"paragraph_question\": \"\\\"question: Consolidated Edison can trace it's root...\", \"question\": \"\\\"Consolidated Edison can trace it's roots back to ...\", \"sentence\": \"\\\"Con Edison's electric business traces its 
roots b...\", \"paragraph\": \"\\\"Gas and electric service is provided by Consolida...\", \"sentence_answer\": \"\\\"Con Edison's electric business traces its roots b...\", \"paragraph_answer\": \"\\\"Gas and electric service is provided by Consolida...\", \"paragraph_sentence\": \"\\\"Gas and electric service is provided by Consolida...\", \"paragraph_id\": \"\\\"5d66f6322b22cd4dfcfbe7d9\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_squadshifts"}, "nyt": {"config_name": "nyt", "sample_row": "{\"answer\": \"\\\"letters\\\"\", \"paragraph_question\": \"\\\"question: Ms. Clyne used facsimiles of what posse...\", \"question\": \"\\\"Ms. Clyne used facsimiles of what possession of E...\", \"sentence\": \"\\\"This time, Ms. Clyne used facsimiles of letters b...\", \"paragraph\": \"\\\"Ms. Clyne is at work on a chamber opera about the...\", \"sentence_answer\": \"\\\"This time, Ms. Clyne used facsimiles of lett...\", \"paragraph_answer\": \"\\\"Ms. Clyne is at work on a chamber opera about the...\", \"paragraph_sentence\": \"\\\"Ms. 
Clyne is at work on a chamber opera about the...\", \"paragraph_id\": \"\\\"5d704c4ac8e4820a9b66e9f7\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_squadshifts"}, "reddit": {"config_name": "reddit", "sample_row": "{\"answer\": \"\\\"pokegenning/romhacking\\\"\", \"paragraph_question\": \"\\\"question: What is the author's main reason for wa...\", \"question\": \"\\\"What is the author's main reason for wanting to h...\", \"sentence\": \"\\\"My main reason for wanting to hax is pokegenning/...\", \"paragraph\": \"\\\"Dis/advantages of 10.3 over 9.5? Just haxed my O3...\", \"sentence_answer\": \"\\\"My main reason for wanting to hax is pokegen...\", \"paragraph_answer\": \"\\\"Dis/advantages of 10.3 over 9.5? Just haxed my O3...\", \"paragraph_sentence\": \"\\\"Dis/advantages of 10.3 over 9.5? 
Just haxed my O3...\", \"paragraph_id\": \"\\\"5d9c25298ae5305bc982eff7\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_squadshifts"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:subjqa", "language:en", "question-generation"], "is_gated": false}, "lmqg/qg_esquad": {"dataset_name": "lmqg/qg_esquad", "description": "[SQuAD-es](https://huggingface.co/datasets/squad_es) dataset for question generation (QG) task.", "downloads": 101, "configs": {"qg_esquad": {"config_name": "qg_esquad", "sample_row": "{\"answer\": \"\\\"8.491.079\\\"\", \"paragraph_question\": \"\\\"question: \\\\u00bfCu\\\\u00e1l es la poblaci\\\\u00f3n de...\", \"question\": \"\\\"\\\\u00bfCu\\\\u00e1l es la poblaci\\\\u00f3n de Nueva Yor...\", \"sentence\": \"\\\"Con una poblaci\\\\u00f3n censada estimada en 2014 d...\", \"paragraph\": \"\\\"Situada en uno de los mayores puertos naturales d...\", \"sentence_answer\": \"\\\"Con una poblaci\\\\u00f3n censada estimada en 2014 d...\", \"paragraph_answer\": \"\\\"Situada en uno de los mayores puertos naturales d...\", \"paragraph_sentence\": \"\\\"Situada en uno de los mayores puertos naturales d...\", \"paragraph_id\": \"\\\"56cf9d81234ae51400d9be1e\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", 
"paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD-es](https://huggingface.co/datasets/squad_es) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_esquad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:squad_es", "language:es", "question-generation"], "is_gated": false}, "lmqg/qg_koquad": {"dataset_name": "lmqg/qg_koquad", "description": "[KorQuAD](https://huggingface.co/datasets/squad_kor_v1) dataset for question generation (QG) task.", "downloads": 97, "configs": {"qg_koquad": {"config_name": "qg_koquad", "sample_row": "{\"answer\": \"\\\"\\\\uace0\\\\ub300 \\\\uc774\\\\uc2a4\\\\ub77c\\\\uc5d8\\\"\", \"paragraph_question\": \"\\\"question: \\\\uc5ed\\\\uc0ac\\\\uc5d0\\\\uc11c \\\\uc784\\\\uae08\\\\u...\", \"question\": \"\\\"\\\\uc5ed\\\\uc0ac\\\\uc5d0\\\\uc11c \\\\uc784\\\\uae08\\\\uc758 \\\\uc5b...\", \"sentence\": \"\\\"\\\\uace0\\\\ub300 \\\\uc774\\\\uc2a4\\\\ub77c\\\\uc5d8 \\\\uc758 \\\\uc5...\", \"paragraph\": \"\\\"\\\\ucc9c\\\\uc0c1\\\\uc758 \\\\ubaa8\\\\ud6c4\\\\uc758 \\\\uc131\\\\uacb...\", \"sentence_answer\": \"\\\" \\\\uace0\\\\ub300 \\\\uc774\\\\uc2a4\\\\ub77c\\\\uc5d8 \\\\...\", \"paragraph_answer\": \"\\\"\\\\ucc9c\\\\uc0c1\\\\uc758 \\\\ubaa8\\\\ud6c4\\\\uc758 \\\\uc131\\\\uacb...\", \"paragraph_sentence\": \"\\\"\\\\ucc9c\\\\uc0c1\\\\uc758 \\\\ubaa8\\\\ud6c4\\\\uc758 \\\\uc131\\\\uacb...\", \"paragraph_id\": \"\\\"6343803-2-2\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": 
"answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[KorQuAD](https://huggingface.co/datasets/squad_kor_v1) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_koquad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:squad_es", "language:ko", "question-generation"], "is_gated": false}, "lmqg/qg_ruquad": {"dataset_name": "lmqg/qg_ruquad", "description": "[SberSQuAD](https://huggingface.co/datasets/sberquad) dataset for question generation (QG) task.", "downloads": 92, "configs": {"qg_ruquad": {"config_name": "qg_ruquad", "sample_row": "{\"answer\": \"\\\"1,1 \\\\u043c/\\\\u0441\\\"\", \"paragraph_question\": \"\\\"question: \\\\u0447\\\\u0435\\\\u043c \\\\u0441\\\\u043e\\\\u043e\\\\u...\", \"question\": \"\\\"\\\\u0447\\\\u0435\\\\u043c \\\\u0441\\\\u043e\\\\u043e\\\\u0442\\\\u0432...\", \"sentence\": \"\\\"\\\\u0412 1975 \\\\u0433\\\\u043e\\\\u0434\\\\u0443 XV \\\\u0413\\\\u0...\", \"paragraph\": \"\\\"\\\\u041d\\\\u0430\\\\u0438\\\\u0432\\\\u044b\\\\u0441\\\\u0448\\\\u0430\\\\...\", \"sentence_answer\": \"\\\"\\\\u0412 1975 \\\\u0433\\\\u043e\\\\u0434\\\\u0443 XV \\\\u0413\\\\u0...\", \"paragraph_answer\": \"\\\"\\\\u041d\\\\u0430\\\\u0438\\\\u0432\\\\u044b\\\\u0441\\\\u0448\\\\u0430\\\\...\", \"paragraph_sentence\": \"\\\"\\\\u041d\\\\u0430\\\\u0438\\\\u0432\\\\u044b\\\\u0441\\\\u0448\\\\u0430\\\\...\", \"paragraph_id\": \"\\\"2978\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": 
"sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SberSQuAD](https://huggingface.co/datasets/sberquad) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_ruquad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:deepset/germanquad", "language:ru", "question-generation"], "is_gated": false}, "acronym_identification": {"dataset_name": "acronym_identification", "description": "Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.", "downloads": 1873, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"TR-0\\\"\", \"tokens\": \"[\\\"What\\\", \\\"is\\\", \\\"here\\\", \\\"called\\\", \\\"controlled\\\", \\\"na...\", \"labels\": \"[4, 4, 4, 4, 0, 2, 2, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4...\"}", "columns": ["id", "tokens", "labels"], "columns_mapping": {"id": "id", "tokens": "tokens", "labels": "labels"}, "dataset_description": "Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.\n", "dataset_name": "acronym_identification"}}, "tags": ["task_categories:token-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "acronym-identification"], "is_gated": false}, "ade_corpus_v2": {"dataset_name": "ade_corpus_v2", "description": " ADE-Corpus-V2 Dataset: Adverse Drug Reaction Data.\n This is a dataset for Classification if a sentence is ADE-related (True) or not (False) and Relation Extraction between Adverse Drug Event and Drug.\n DRUG-AE.rel provides relations between drugs and adverse effects.\n DRUG-DOSE.rel provides relations between drugs and dosages.\n ADE-NEG.txt provides all sentences in the ADE corpus that DO 
NOT contain any drug-related adverse effects.", "downloads": 2423, "configs": {"Ade_corpus_v2_classification": {"config_name": "Ade_corpus_v2_classification", "sample_row": "{\"text\": \"\\\"Intravenous azithromycin-induced ototoxicity.\\\"\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": " ADE-Corpus-V2 Dataset: Adverse Drug Reaction Data.\n This is a dataset for Classification if a sentence is ADE-related (True) or not (False) and Relation Extraction between Adverse Drug Event and Drug.\n DRUG-AE.rel provides relations between drugs and adverse effects.\n DRUG-DOSE.rel provides relations between drugs and dosages.\n ADE-NEG.txt provides all sentences in the ADE corpus that DO NOT contain any drug-related adverse effects.\n", "dataset_name": "ade_corpus_v2"}, "Ade_corpus_v2_drug_ade_relation": {"config_name": "Ade_corpus_v2_drug_ade_relation", "sample_row": "{\"text\": \"\\\"Intravenous azithromycin-induced ototoxicity.\\\"\", \"drug\": \"\\\"azithromycin\\\"\", \"effect\": \"\\\"ototoxicity\\\"\", \"indexes.drug.start_char\": \"[12]\", \"indexes.drug.end_char\": \"[24]\", \"indexes.effect.start_char\": \"[33]\", \"indexes.effect.end_char\": \"[44]\"}", "columns": ["text", "drug", "effect", "indexes_drug_start_char", "indexes_drug_end_char", "indexes_effect_start_char", "indexes_effect_end_char"], "columns_mapping": {"text": "text", "drug": "drug", "effect": "effect", "indexes.drug.start_char": "indexes_drug_start_char", "indexes.drug.end_char": "indexes_drug_end_char", "indexes.effect.start_char": "indexes_effect_start_char", "indexes.effect.end_char": "indexes_effect_end_char"}, "dataset_description": " ADE-Corpus-V2 Dataset: Adverse Drug Reaction Data.\n This is a dataset for Classification if a sentence is ADE-related (True) or not (False) and Relation Extraction between Adverse Drug Event and Drug.\n DRUG-AE.rel provides relations between drugs and adverse effects.\n 
DRUG-DOSE.rel provides relations between drugs and dosages.\n ADE-NEG.txt provides all sentences in the ADE corpus that DO NOT contain any drug-related adverse effects.\n", "dataset_name": "ade_corpus_v2"}, "Ade_corpus_v2_drug_dosage_relation": {"config_name": "Ade_corpus_v2_drug_dosage_relation", "sample_row": "{\"text\": \"\\\"An episode of subacute encephalopathy after the i...\", \"drug\": \"\\\"methotrexate\\\"\", \"dosage\": \"\\\"1500 mg/m2\\\"\", \"indexes.drug.start_char\": \"[79]\", \"indexes.drug.end_char\": \"[91]\", \"indexes.dosage.start_char\": \"[93]\", \"indexes.dosage.end_char\": \"[103]\"}", "columns": ["text", "drug", "dosage", "indexes_drug_start_char", "indexes_drug_end_char", "indexes_dosage_start_char", "indexes_dosage_end_char"], "columns_mapping": {"text": "text", "drug": "drug", "dosage": "dosage", "indexes.drug.start_char": "indexes_drug_start_char", "indexes.drug.end_char": "indexes_drug_end_char", "indexes.dosage.start_char": "indexes_dosage_start_char", "indexes.dosage.end_char": "indexes_dosage_end_char"}, "dataset_description": " ADE-Corpus-V2 Dataset: Adverse Drug Reaction Data.\n This is a dataset for Classification if a sentence is ADE-related (True) or not (False) and Relation Extraction between Adverse Drug Event and Drug.\n DRUG-AE.rel provides relations between drugs and adverse effects.\n DRUG-DOSE.rel provides relations between drugs and dosages.\n ADE-NEG.txt provides all sentences in the ADE corpus that DO NOT contain any drug-related adverse effects.\n", "dataset_name": "ade_corpus_v2"}}, "tags": ["task_categories:text-classification", "task_categories:token-classification", "task_ids:coreference-resolution", "task_ids:fact-checking", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "adversarial_qa": {"dataset_name": "adversarial_qa", "description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by 
crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.", "downloads": 5708, "configs": {"adversarialQA": {"config_name": "adversarialQA", "sample_row": "{\"id\": \"\\\"7ba1e8f4261d3170fcf42e84a81dd749116fae95\\\"\", \"title\": \"\\\"Brain\\\"\", \"context\": \"\\\"Another approach to brain function is to examine ...\", \"question\": \"\\\"What sare the benifts of the blood brain barrir?\\\"...\", \"answers.text\": \"[\\\"isolated from the bloodstream\\\"]\", \"answers.answer_start\": \"[195]\", \"metadata.split\": \"\\\"train\\\"\", \"metadata.model_in_the_loop\": \"\\\"Combined\\\"\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start", "metadata_split", "metadata_model_in_the_loop"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "metadata.split": "metadata_split", "metadata.model_in_the_loop": "metadata_model_in_the_loop"}, "dataset_description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 
validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.\n", "dataset_name": "adversarial_qa"}, "dbidaf": {"config_name": "dbidaf", "sample_row": "{\"id\": \"\\\"821607441c173838196c4d1500c2ab21a044e6b0\\\"\", \"title\": \"\\\"Yale_University\\\"\", \"context\": \"\\\"Slack (2003) compares three groups that conducted...\", \"question\": \"\\\"what year were the research groups compared\\\"\", \"answers.text\": \"[\\\"2003\\\"]\", \"answers.answer_start\": \"[7]\", \"metadata.split\": \"\\\"train\\\"\", \"metadata.model_in_the_loop\": \"\\\"BiDAF\\\"\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start", "metadata_split", "metadata_model_in_the_loop"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "metadata.split": "metadata_split", "metadata.model_in_the_loop": "metadata_model_in_the_loop"}, "dataset_description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.\n", "dataset_name": "adversarial_qa"}, "dbert": {"config_name": "dbert", "sample_row": "{\"id\": 
\"\\\"dab017ed8a1c27c6afa2d8618abc3a477a4edffc\\\"\", \"title\": \"\\\"Empiricism\\\"\", \"context\": \"\\\"A generation later, the Irish Anglican bishop, Ge...\", \"question\": \"\\\"what concept is mentioned last?\\\"\", \"answers.text\": \"[\\\"subjective idealism\\\"]\", \"answers.answer_start\": \"[742]\", \"metadata.split\": \"\\\"train\\\"\", \"metadata.model_in_the_loop\": \"\\\"BERT-Large\\\"\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start", "metadata_split", "metadata_model_in_the_loop"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "metadata.split": "metadata_split", "metadata.model_in_the_loop": "metadata_model_in_the_loop"}, "dataset_description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.\n", "dataset_name": "adversarial_qa"}, "droberta": {"config_name": "droberta", "sample_row": "{\"id\": \"\\\"12cf36866b656dc4f254081fe6796ea1be2f6d43\\\"\", \"title\": \"\\\"Napoleon\\\"\", \"context\": \"\\\"When he became First Consul and later Emperor, Na...\", \"question\": \"\\\"What jewelry like accessories did he wear?\\\"\", \"answers.text\": \"[\\\"L\\\\u00e9gion d'honneur star, medal and ribbon, an...\", \"answers.answer_start\": \"[462]\", \"metadata.split\": 
\"\\\"train\\\"\", \"metadata.model_in_the_loop\": \"\\\"RoBERTa-Large\\\"\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start", "metadata_split", "metadata_model_in_the_loop"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "metadata.split": "metadata_split", "metadata.model_in_the_loop": "metadata_model_in_the_loop"}, "dataset_description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.\n", "dataset_name": "adversarial_qa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "aeslc": {"dataset_name": "aeslc", "description": "A collection of email messages of employees in the Enron Corporation.\n\nThere are two features:\n - email_body: email body text.\n - subject_line: email subject text.", "downloads": 1738, "configs": {"default": {"config_name": "default", "sample_row": "{\"email_body\": \"\\\"Greg/Phillip, Attached is the Grande Communicati...\", \"subject_line\": \"\\\"Service Agreement\\\"\"}", "columns": ["email_body", "subject_line"], "columns_mapping": {"email_body": "email_body", 
"subject_line": "subject_line"}, "dataset_description": "\nA collection of email messages of employees in the Enron Corporation.\n\nThere are two features:\n - email_body: email body text.\n - subject_line: email subject text.\n", "dataset_name": "aeslc"}}, "tags": ["task_categories:summarization", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "aspect-based-summarization", "conversations-summarization", "multi-document-summarization", "email-headline-generation"], "is_gated": false}, "afrikaans_ner_corpus": {"dataset_name": "afrikaans_ner_corpus", "description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.", "downloads": 505, "configs": {"afrikaans_ner_corpus": {"config_name": "afrikaans_ner_corpus", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Vertaling\\\", \\\"van\\\", \\\"die\\\", \\\"inligting\\\", \\\"in\\\", \\\"di...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.\n", "dataset_name": "afrikaans_ner_corpus"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:af"], "is_gated": false}, "ag_news": {"dataset_name": "ag_news", "description": "AG is a collection of more than 1 million news articles. News articles have been\ngathered from more than 2000 news sources by ComeToMyHead in more than 1 year of\nactivity. ComeToMyHead is an academic news search engine which has been running\nsince July, 2004. 
The dataset is provided by the academic comunity for research\npurposes in data mining (clustering, classification, etc), information retrieval\n(ranking, search, etc), xml, data compression, data streaming, and any other\nnon-commercial activity. For more information, please refer to the link\nhttp://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .\n\nThe AG's news topic classification dataset is constructed by Xiang Zhang\n(xiang.zhang@nyu.edu) from the dataset above. It is used as a text\nclassification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann\nLeCun. Character-level Convolutional Networks for Text Classification. Advances\nin Neural Information Processing Systems 28 (NIPS 2015).", "downloads": 29760, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"Wall St. Bears Claw Back Into the Black (Reuters)...\", \"label\": \"2\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "AG is a collection of more than 1 million news articles. News articles have been\ngathered from more than 2000 news sources by ComeToMyHead in more than 1 year of\nactivity. ComeToMyHead is an academic news search engine which has been running\nsince July, 2004. The dataset is provided by the academic comunity for research\npurposes in data mining (clustering, classification, etc), information retrieval\n(ranking, search, etc), xml, data compression, data streaming, and any other\nnon-commercial activity. For more information, please refer to the link\nhttp://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .\n\nThe AG's news topic classification dataset is constructed by Xiang Zhang\n(xiang.zhang@nyu.edu) from the dataset above. It is used as a text\nclassification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann\nLeCun. Character-level Convolutional Networks for Text Classification. 
Advances\nin Neural Information Processing Systems 28 (NIPS 2015).\n", "dataset_name": "ag_news"}}, "tags": ["task_categories:text-classification", "task_ids:topic-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ai2_arc": {"dataset_name": "ai2_arc", "description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.", "downloads": 272398, "configs": {"ARC-Challenge": {"config_name": "ARC-Challenge", "sample_row": "{\"id\": \"\\\"Mercury_SC_415702\\\"\", \"question\": \"\\\"George wants to warm his hands quickly by rubbing...\", \"choices.text\": \"[\\\"dry palms\\\", \\\"wet palms\\\", \\\"palms covered with oil...\", \"choices.label\": \"[\\\"A\\\", \\\"B\\\", \\\"C\\\", \\\"D\\\"]\", \"answerKey\": \"\\\"A\\\"\"}", "columns": ["id", "question", "choices_text", "choices_label", "answerKey"], "columns_mapping": {"id": "id", "question": "question", "choices.text": "choices_text", "choices.label": "choices_label", "answerKey": "answerKey"}, "dataset_description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. 
We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.\n", "dataset_name": "ai2_arc"}, "ARC-Easy": {"config_name": "ARC-Easy", "sample_row": "{\"id\": \"\\\"Mercury_7220990\\\"\", \"question\": \"\\\"Which factor will most likely cause a person to d...\", \"choices.text\": \"[\\\"a leg muscle relaxing after exercise\\\", \\\"a bacter...\", \"choices.label\": \"[\\\"A\\\", \\\"B\\\", \\\"C\\\", \\\"D\\\"]\", \"answerKey\": \"\\\"B\\\"\"}", "columns": ["id", "question", "choices_text", "choices_label", "answerKey"], "columns_mapping": {"id": "id", "question": "question", "choices.text": "choices_text", "choices.label": "choices_label", "answerKey": "answerKey"}, "dataset_description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.\n", "dataset_name": "ai2_arc"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "task_ids:multiple-choice-qa", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ajgt_twitter_ar": {"dataset_name": "ajgt_twitter_ar", "description": "Arabic Jordanian General Tweets (AJGT) Corpus consisted of 1,800 tweets annotated as positive and negative. 
Modern Standard Arabic (MSA) or Jordanian dialect.", "downloads": 541, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\" \\\\u0627\\\\u0631\\\\u0628\\\\u062f \\\\u0641\\\\u064a\\\\u0647\\\\u062...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Arabic Jordanian General Tweets (AJGT) Corpus consisted of 1,800 tweets annotated as positive and negative. Modern Standard Arabic (MSA) or Jordanian dialect.\n", "dataset_name": "ajgt_twitter_ar"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "allegro_reviews": {"dataset_name": "allegro_reviews", "description": "Allegro Reviews is a sentiment analysis dataset, consisting of 11,588 product reviews written in Polish and extracted\nfrom Allegro.pl - a popular e-commerce marketplace. Each review contains at least 50 words and has a rating on a scale\nfrom one (negative review) to five (positive review).\n\nWe recommend using the provided train/dev/test split. The ratings for the test set reviews are kept hidden.\nYou can evaluate your model using the online evaluation tool available on klejbenchmark.com.", "downloads": 461, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"Jako do ceny dobra. Przyssawka mog\\\\u0142aby by\\\\u0...\", \"rating\": \"3.0\"}", "columns": ["text", "rating"], "columns_mapping": {"text": "text", "rating": "rating"}, "dataset_description": "Allegro Reviews is a sentiment analysis dataset, consisting of 11,588 product reviews written in Polish and extracted\nfrom Allegro.pl - a popular e-commerce marketplace. Each review contains at least 50 words and has a rating on a scale\nfrom one (negative review) to five (positive review).\n\nWe recommend using the provided train/dev/test split. 
The ratings for the test set reviews are kept hidden.\nYou can evaluate your model using the online evaluation tool available on klejbenchmark.com.\n", "dataset_name": "allegro_reviews"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-scoring", "task_ids:text-scoring", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:pl"], "is_gated": false}, "amazon_polarity": {"dataset_name": "amazon_polarity", "description": "The Amazon reviews dataset consists of reviews from amazon.\nThe data span a period of 18 years, including ~35 million reviews up to March 2013.\nReviews include product and user information, ratings, and a plaintext review.", "downloads": 11831, "configs": {"amazon_polarity": {"config_name": "amazon_polarity", "sample_row": "{\"label\": \"1\", \"title\": \"\\\"Stuning even for the non-gamer\\\"\", \"content\": \"\\\"This sound track was beautiful! It paints the sen...\"}", "columns": ["label", "title", "content"], "columns_mapping": {"label": "label", "title": "title", "content": "content"}, "dataset_description": "The Amazon reviews dataset consists of reviews from amazon.\nThe data span a period of 18 years, including ~35 million reviews up to March 2013.\nReviews include product and user information, ratings, and a plaintext review.\n", "dataset_name": "amazon_polarity"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ambig_qa": {"dataset_name": "ambig_qa", "description": "AmbigNQ, a dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark. We find that over half of the questions in NQ-open are ambiguous. The types of ambiguity are diverse and sometimes subtle, many of which are only apparent after examining evidence provided by a very large text corpus. 
AMBIGNQ, a dataset with\n14,042 annotations on NQ-OPEN questions containing diverse types of ambiguity.\nWe provide two distributions of our new dataset AmbigNQ: a full version with all annotation metadata and a light version with only inputs and outputs.", "downloads": 832, "configs": {"light": {"config_name": "light", "sample_row": "{\"id\": \"\\\"-4469503464110108318\\\"\", \"question\": \"\\\"When did the simpsons first air on television?\\\"...\", \"annotations.type\": \"[\\\"multipleQAs\\\"]\", \"annotations.answer\": \"[[]]\", \"annotations.qaPairs\": \"[{\\\"question\\\": [\\\"When did the Simpsons first air on...\"}", "columns": ["id", "question", "annotations_type", "annotations_answer", "annotations_qaPairs"], "columns_mapping": {"id": "id", "question": "question", "annotations.type": "annotations_type", "annotations.answer": "annotations_answer", "annotations.qaPairs": "annotations_qaPairs"}, "dataset_description": "AmbigNQ, a dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark. We find that over half of the questions in NQ-open are ambiguous. The types of ambiguity are diverse and sometimes subtle, many of which are only apparent after examining evidence provided by a very large text corpus. 
AMBIGNQ, a dataset with\n14,042 annotations on NQ-OPEN questions containing diverse types of ambiguity.\nWe provide two distributions of our new dataset AmbigNQ: a full version with all annotation metadata and a light version with only inputs and outputs.\n", "dataset_name": "ambig_qa"}, "full": {"config_name": "full", "sample_row": "{\"id\": \"\\\"-4469503464110108318\\\"\", \"question\": \"\\\"When did the simpsons first air on television?\\\"...\", \"annotations.type\": \"[\\\"multipleQAs\\\"]\", \"annotations.answer\": \"[[]]\", \"annotations.qaPairs\": \"[{\\\"question\\\": [\\\"When did the Simpsons first air on...\", \"viewed_doc_titles\": \"[\\\"The Simpsons\\\"]\", \"used_queries.query\": \"[\\\"When did the simpsons first air on television?\\\"]...\", \"used_queries.results\": \"[{\\\"title\\\": [\\\"History of The Simpsons\\\", \\\"The Simpso...\", \"nq_answer\": \"[\\\"December 17 , 1989\\\"]\", \"nq_doc_title\": \"\\\"The Simpsons\\\"\"}", "columns": ["id", "question", "annotations_type", "annotations_answer", "annotations_qaPairs", "viewed_doc_titles", "used_queries_query", "used_queries_results", "nq_answer", "nq_doc_title"], "columns_mapping": {"id": "id", "question": "question", "annotations.type": "annotations_type", "annotations.answer": "annotations_answer", "annotations.qaPairs": "annotations_qaPairs", "viewed_doc_titles": "viewed_doc_titles", "used_queries.query": "used_queries_query", "used_queries.results": "used_queries_results", "nq_answer": "nq_answer", "nq_doc_title": "nq_doc_title"}, "dataset_description": "AmbigNQ, a dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark. We find that over half of the questions in NQ-open are ambiguous. The types of ambiguity are diverse and sometimes subtle, many of which are only apparent after examining evidence provided by a very large text corpus. 
AMBIGNQ, a dataset with\n14,042 annotations on NQ-OPEN questions containing diverse types of ambiguity.\nWe provide two distributions of our new dataset AmbigNQ: a full version with all annotation metadata and a light version with only inputs and outputs.\n", "dataset_name": "ambig_qa"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|natural_questions", "source_datasets:original", "language:en"], "is_gated": false}, "amttl": {"dataset_name": "amttl", "description": "Chinese word segmentation (CWS) trained from open source corpus faces dramatic performance drop\nwhen dealing with domain text, especially for a domain with lots of special terms and diverse\nwriting styles, such as the biomedical domain. However, building domain-specific CWS requires\nextremely high annotation cost. In this paper, we propose an approach by exploiting domain-invariant\nknowledge from high resource to low resource domains. 
Extensive experiments show that our mode\nachieves consistently higher accuracy than the single-task CWS and other transfer learning\nbaselines, especially when there is a large disparity between source and target domains.\n\nThis dataset is the accompanied medical Chinese word segmentation (CWS) dataset.\nThe tags are in BIES scheme.\n\nFor more details see https://www.aclweb.org/anthology/C18-1307/", "downloads": 358, "configs": {"amttl": {"config_name": "amttl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u5341\\\", \\\"\\\\u5e74\\\", \\\"\\\\u524d\\\", \\\"\\\\u5f97\\\", \\\"\\\\u7684\\\",...\", \"tags\": \"[0, 2, 3, 3, 3, 0, 2, 3, 0, 2, 0, 2, 0, 2, 0, 2, 3...\"}", "columns": ["id", "tokens", "tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "tags": "tags"}, "dataset_description": "Chinese word segmentation (CWS) trained from open source corpus faces dramatic performance drop\nwhen dealing with domain text, especially for a domain with lots of special terms and diverse\nwriting styles, such as the biomedical domain. However, building domain-specific CWS requires\nextremely high annotation cost. In this paper, we propose an approach by exploiting domain-invariant\nknowledge from high resource to low resource domains. 
Extensive experiments show that our mode\nachieves consistently higher accuracy than the single-task CWS and other transfer learning\nbaselines, especially when there is a large disparity between source and target domains.\n\nThis dataset is the accompanied medical Chinese word segmentation (CWS) dataset.\nThe tags are in BIES scheme.\n\nFor more details see https://www.aclweb.org/anthology/C18-1307/\n", "dataset_name": "amttl"}}, "tags": ["task_categories:token-classification", "task_ids:parsing", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:zh"], "is_gated": false}, "app_reviews": {"dataset_name": "app_reviews", "description": "It is a large dataset of Android applications belonging to 23 differentapps categories, which provides an overview of the types of feedback users report on the apps and documents the evolution of the related code metrics. The dataset contains about 395 applications of the F-Droid repository, including around 600 versions, 280,000 user reviews (extracted with specific text mining approaches)", "downloads": 3350, "configs": {"default": {"config_name": "default", "sample_row": "{\"package_name\": \"\\\"com.mantz_it.rfanalyzer\\\"\", \"review\": \"\\\"Great app! The new version now works on my Bravia...\", \"date\": \"\\\"October 12 2016\\\"\", \"star\": \"4\"}", "columns": ["package_name", "review", "date", "star"], "columns_mapping": {"package_name": "package_name", "review": "review", "date": "date", "star": "star"}, "dataset_description": "It is a large dataset of Android applications belonging to 23 differentapps categories, which provides an overview of the types of feedback users report on the apps and documents the evolution of the related code metrics. 
The dataset contains about 395 applications of the F-Droid repository, including around 600 versions, 280,000 user reviews (extracted with specific text mining approaches)\n", "dataset_name": "app_reviews"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:sentiment-scoring", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "bigIR/ar_cov19": {"dataset_name": "bigIR/ar_cov19", "description": "ArCOV-19 is an Arabic COVID-19 Twitter dataset that covers the period from 27th of January till 30th of April 2020. ArCOV-19 is designed to enable research under several domains including natural language processing, information retrieval, and social computing, among others", "downloads": 345, "configs": {"ar_cov19": {"config_name": "ar_cov19", "sample_row": "{\"tweetID\": \"\\\"1221583597573824515\\\"\"}", "columns": ["tweetID"], "columns_mapping": {"tweetID": "tweetID"}, "dataset_description": "ArCOV-19 is an Arabic COVID-19 Twitter dataset that covers the period from 27th of January till 30th of April 2020. 
ArCOV-19 is designed to enable research under several domains including natural language processing, information retrieval, and social computing, among others\n", "dataset_name": "bigIR/ar_cov19"}}, "tags": ["task_categories:other", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:ar", "data-mining"], "is_gated": false}, "ar_res_reviews": {"dataset_name": "ar_res_reviews", "description": "Dataset of 8364 restaurant reviews scrapped from qaym.com in Arabic for sentiment analysis", "downloads": 377, "configs": {"default": {"config_name": "default", "sample_row": "{\"polarity\": \"0\", \"text\": \"\\\"\\\\u0627\\\\u0648\\\\u0644\\\\u0627: \\\\u0627\\\\u0644\\\\u0645\\\\u064...\", \"restaurant_id\": \"\\\"296\\\"\", \"user_id\": \"\\\"423\\\"\"}", "columns": ["polarity", "text", "restaurant_id", "user_id"], "columns_mapping": {"polarity": "polarity", "text": "text", "restaurant_id": "restaurant_id", "user_id": "user_id"}, "dataset_description": "Dataset of 8364 restaurant reviews scrapped from qaym.com in Arabic for sentiment analysis\n", "dataset_name": "ar_res_reviews"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "ar_sarcasm": {"dataset_name": "ar_sarcasm", "description": "ArSarcasm is a new Arabic sarcasm detection dataset.\nThe dataset was created using previously available Arabic sentiment analysis datasets (SemEval 2017 and ASTD)\n and adds sarcasm and dialect labels to them. 
The dataset contains 10,547 tweets, 1,682 (16%) of which are sarcastic.", "downloads": 417, "configs": {"default": {"config_name": "default", "sample_row": "{\"dialect\": \"1\", \"sarcasm\": \"0\", \"sentiment\": \"0\", \"original_sentiment\": \"0\", \"tweet\": \"\\\"\\\\u0646\\\\u0635\\\\u064a\\\\u062d\\\\u0647 \\\\u0645\\\\u0627 \\\\u063...\", \"source\": \"\\\"semeval\\\"\"}", "columns": ["dialect", "sarcasm", "sentiment", "original_sentiment", "tweet", "source"], "columns_mapping": {"dialect": "dialect", "sarcasm": "sarcasm", "sentiment": "sentiment", "original_sentiment": "original_sentiment", "tweet": "tweet", "source": "source"}, "dataset_description": "ArSarcasm is a new Arabic sarcasm detection dataset.\nThe dataset was created using previously available Arabic sentiment analysis datasets (SemEval 2017 and ASTD)\n and adds sarcasm and dialect labels to them. The dataset contains 10,547 tweets, 1,682 (16%) of which are sarcastic.\n", "dataset_name": "ar_sarcasm"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:extended|other-semeval_2017", "source_datasets:extended|other-astd", "language:ar", "sarcasm-detection"], "is_gated": false}, "arabic_pos_dialect": {"dataset_name": "arabic_pos_dialect", "description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). 
Each dataset consists of a set of 350 manually segmented and POS tagged tweets.", "downloads": 878, "configs": {"egy": {"config_name": "egy", "sample_row": "{\"fold\": \"4\", \"subfold\": \"\\\"A\\\"\", \"words\": \"[\\\"\\\\u0644\\\\u064a\\\\u0647\\\", \\\"\\\\u0644\\\\u0645\\\\u0627\\\", \\\"\\\\u06...\", \"segments\": \"[\\\"\\\\u0644\\\\u064a\\\\u0647\\\", \\\"\\\\u0644\\\\u0645\\\\u0627\\\", \\\"\\\\u06...\", \"pos_tags\": \"[\\\"PART\\\", \\\"PART\\\", \\\"V\\\", \\\"NOUN\\\", \\\"PREP\\\", \\\"NOUN+PRON\\\",...\"}", "columns": ["fold", "subfold", "words", "segments", "pos_tags"], "columns_mapping": {"fold": "fold", "subfold": "subfold", "words": "words", "segments": "segments", "pos_tags": "pos_tags"}, "dataset_description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). Each dataset consists of a set of 350 manually segmented and POS tagged tweets.\n", "dataset_name": "arabic_pos_dialect"}, "lev": {"config_name": "lev", "sample_row": "{\"fold\": \"2\", \"subfold\": \"\\\"B\\\"\", \"words\": \"[\\\"\\\\u0642\\\\u0627\\\\u0639\\\\u062f\\\", \\\"\\\\u0639\\\\u0645\\\", \\\"\\\\u06...\", \"segments\": \"[\\\"\\\\u0642\\\\u0627\\\\u0639\\\\u062f\\\", \\\"\\\\u0639\\\\u0645\\\", \\\"\\\\u06...\", \"pos_tags\": \"[\\\"ADJ\\\", \\\"PART\\\", \\\"V\\\", \\\"NOUN+PRON\\\", \\\"PREP+DET+ADJ\\\", ...\"}", "columns": ["fold", "subfold", "words", "segments", "pos_tags"], "columns_mapping": {"fold": "fold", "subfold": "subfold", "words": "words", "segments": "segments", "pos_tags": "pos_tags"}, "dataset_description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). 
Each dataset consists of a set of 350 manually segmented and POS tagged tweets.\n", "dataset_name": "arabic_pos_dialect"}, "glf": {"config_name": "glf", "sample_row": "{\"fold\": \"2\", \"subfold\": \"\\\"B\\\"\", \"words\": \"[\\\"@tagimlm77\\\", \\\"@444Tf\\\", \\\"\\\\u0647\\\\u0648\\\", \\\"\\\\u062b\\\\u...\", \"segments\": \"[\\\"@tagimlm77\\\", \\\"@444Tf\\\", \\\"\\\\u0647\\\\u0648\\\", \\\"\\\\u062b\\\\u...\", \"pos_tags\": \"[\\\"MENTION\\\", \\\"MENTION\\\", \\\"PRON\\\", \\\"NOUN\\\", \\\"ADJ\\\", \\\"NOU...\"}", "columns": ["fold", "subfold", "words", "segments", "pos_tags"], "columns_mapping": {"fold": "fold", "subfold": "subfold", "words": "words", "segments": "segments", "pos_tags": "pos_tags"}, "dataset_description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). Each dataset consists of a set of 350 manually segmented and POS tagged tweets.\n", "dataset_name": "arabic_pos_dialect"}, "mgr": {"config_name": "mgr", "sample_row": "{\"fold\": \"2\", \"subfold\": \"\\\"B\\\"\", \"words\": \"[\\\"0.7\\\", \\\"\\\\u062f\\\\u064a\\\\u0627\\\\u0644\\\", \\\"\\\\u0627\\\\u0644\\\\...\", \"segments\": \"[\\\"0.7\\\", \\\"\\\\u062f\\\\u064a\\\\u0627\\\\u0644\\\", \\\"\\\\u0627\\\\u0644+...\", \"pos_tags\": \"[\\\"NUM\\\", \\\"PREP\\\", \\\"DET+NOUN+NSUFF\\\", \\\"PART\\\", \\\"V+PRON\\\"...\"}", "columns": ["fold", "subfold", "words", "segments", "pos_tags"], "columns_mapping": {"fold": "fold", "subfold": "subfold", "words": "words", "segments": "segments", "pos_tags": "pos_tags"}, "dataset_description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). 
Each dataset consists of a set of 350 manually segmented and POS tagged tweets.\n", "dataset_name": "arabic_pos_dialect"}}, "tags": ["task_categories:token-classification", "task_ids:part-of-speech", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:extended", "language:ar"], "is_gated": false}, "arcd": {"dataset_name": "arcd", "description": " Arabic Reading Comprehension Dataset (ARCD) composed of 1,395 questions posed by crowdworkers on Wikipedia articles.", "downloads": 497, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"969331847966\\\"\", \"title\": \"\\\"\\\\u062c\\\\u0645\\\\u0627\\\\u0644 \\\\u062e\\\\u0627\\\\u0634\\\\u0642...\", \"context\": \"\\\"\\\\u062c\\\\u0645\\\\u0627\\\\u0644 \\\\u0623\\\\u062d\\\\u0645\\\\u062f...\", \"question\": \"\\\"- \\\\u0645\\\\u0646 \\\\u0647\\\\u0648 \\\\u062c\\\\u0645\\\\u0627\\\\u0...\", \"answers.text\": \"[\\\"\\\\u0635\\\\u062d\\\\u0641\\\\u064a \\\\u0648\\\\u0625\\\\u0639\\\\u064...\", \"answers.answer_start\": \"[73]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": " Arabic Reading Comprehension Dataset (ARCD) composed of 1,395 questions posed by crowdworkers on Wikipedia articles.\n", "dataset_name": "arcd"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "art": {"dataset_name": "art", "description": "the Abductive Natural Language Inference Dataset from AI2", "downloads": 541, "configs": {"anli": {"config_name": "anli", "sample_row": "{\"observation_1\": \"\\\"Chad went to get the wheel alignment measured on ...\", \"observation_2\": 
\"\\\"The mechanic provided a working alignment with ne...\", \"hypothesis_1\": \"\\\"Chad was waiting for his car to be washed.\\\"\", \"hypothesis_2\": \"\\\"Chad was waiting for his car to be finished.\\\"\", \"label\": \"2\"}", "columns": ["observation_1", "observation_2", "hypothesis_1", "hypothesis_2", "label"], "columns_mapping": {"observation_1": "observation_1", "observation_2": "observation_2", "hypothesis_1": "hypothesis_1", "hypothesis_2": "hypothesis_2", "label": "label"}, "dataset_description": "the Abductive Natural Language Inference Dataset from AI2\n", "dataset_name": "art"}}, "tags": ["task_categories:multiple-choice", "task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "abductive-natural-language-inference"], "is_gated": false}, "ascent_kb": {"dataset_name": "ascent_kb", "description": "This dataset contains 8.9M commonsense assertions extracted by the Ascent pipeline (https://ascent.mpi-inf.mpg.de/).", "downloads": 513, "configs": {"canonical": {"config_name": "canonical", "sample_row": "{\"arg1\": \"\\\"aa\\\"\", \"rel\": \"\\\"/r/HasProperty\\\"\", \"arg2\": \"\\\"immunohistochemistry staining\\\"\", \"support\": \"1\", \"facets\": \"[]\", \"source_sentences\": \"[{\\\"text\\\": \\\"AA can be identified by immunohistochem...\"}", "columns": ["arg1", "rel", "arg2", "support", "facets", "source_sentences"], "columns_mapping": {"arg1": "arg1", "rel": "rel", "arg2": "arg2", "support": "support", "facets": "facets", "source_sentences": "source_sentences"}, "dataset_description": "This dataset contains 8.9M commonsense assertions extracted by the Ascent pipeline (https://ascent.mpi-inf.mpg.de/).\n", "dataset_name": "ascent_kb"}, "open": {"config_name": "open", "sample_row": "{\"subject\": \"\\\"aa\\\"\", \"predicate\": \"\\\"be identified by\\\"\", \"object\": \"\\\"immunohistochemistry staining\\\"\", 
\"support\": \"1\", \"facets\": \"[]\", \"source_sentences\": \"[{\\\"text\\\": \\\"AA can be identified by immunohistochem...\"}", "columns": ["subject", "predicate", "object", "support", "facets", "source_sentences"], "columns_mapping": {"subject": "subject", "predicate": "predicate", "object": "object", "support": "support", "facets": "facets", "source_sentences": "source_sentences"}, "dataset_description": "This dataset contains 8.9M commonsense assertions extracted by the Ascent pipeline (https://ascent.mpi-inf.mpg.de/).\n", "dataset_name": "ascent_kb"}}, "tags": ["task_categories:other", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "knowledge-base"], "is_gated": false}, "aslg_pc12": {"dataset_name": "aslg_pc12", "description": "A large synthetic collection of parallel English and ASL-Gloss texts.\nThere are two string features: text, and gloss.", "downloads": 457, "configs": {"default": {"config_name": "default", "sample_row": "{\"gloss\": \"\\\"\\\\ufeffMEMBERSHIP PARLIAMENT SEE MINUTE\\\\n\\\"\", \"text\": \"\\\"\\\\ufeffmembership of parliament see minutes\\\\n\\\"\"}", "columns": ["gloss", "text"], "columns_mapping": {"gloss": "gloss", "text": "text"}, "dataset_description": "A large synthetic collection of parallel English and ASL-Gloss texts.\nThere are two string features: text, and gloss.\n", "dataset_name": "aslg_pc12"}}, "tags": ["task_categories:translation", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:ase", "language:en"], "is_gated": false}, "asnq": {"dataset_name": "asnq", "description": "ASNQ is a dataset for answer sentence selection derived from\nGoogle's Natural Questions (NQ) dataset (Kwiatkowski et al. 
2019).\n\nEach example contains a question, candidate sentence, label indicating whether or not\nthe sentence answers the question, and two additional features --\nsentence_in_long_answer and short_answer_in_sentence indicating whether ot not the\ncandidate sentence is contained in the long_answer and if the short_answer is in the candidate sentence.\n\nFor more details please see\nhttps://arxiv.org/pdf/1911.04118.pdf\n\nand\n\nhttps://research.google/pubs/pub47761/", "downloads": 409, "configs": {"default": {"config_name": "default", "sample_row": "{\"question\": \"\\\"what is the use of fn key in mac\\\"\", \"sentence\": \"\\\"It is typically found on laptops due to their key...\", \"label\": \"0\", \"sentence_in_long_answer\": \"false\", \"short_answer_in_sentence\": \"false\"}", "columns": ["question", "sentence", "label", "sentence_in_long_answer", "short_answer_in_sentence"], "columns_mapping": {"question": "question", "sentence": "sentence", "label": "label", "sentence_in_long_answer": "sentence_in_long_answer", "short_answer_in_sentence": "short_answer_in_sentence"}, "dataset_description": "ASNQ is a dataset for answer sentence selection derived from\nGoogle's Natural Questions (NQ) dataset (Kwiatkowski et al. 
2019).\n\nEach example contains a question, candidate sentence, label indicating whether or not\nthe sentence answers the question, and two additional features --\nsentence_in_long_answer and short_answer_in_sentence indicating whether ot not the\ncandidate sentence is contained in the long_answer and if the short_answer is in the candidate sentence.\n\nFor more details please see\nhttps://arxiv.org/pdf/1911.04118.pdf\n\nand\n\nhttps://research.google/pubs/pub47761/\n", "dataset_name": "asnq"}}, "tags": ["task_categories:multiple-choice", "task_ids:multiple-choice-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|natural_questions", "language:en"], "is_gated": false}, "assin": {"dataset_name": "assin", "description": "The ASSIN (Avalia\u00e7\u00e3o de Similaridade Sem\u00e2ntica e INfer\u00eancia textual) corpus is a corpus annotated with pairs of sentences written in\nPortuguese that is suitable for the exploration of textual entailment and paraphrasing classifiers. The corpus contains pairs of sentences\nextracted from news articles written in European Portuguese (EP) and Brazilian Portuguese (BP), obtained from Google News Portugal\nand Brazil, respectively. To create the corpus, the authors started by collecting a set of news articles describing the\nsame event (one news article from Google News Portugal and another from Google News Brazil) from Google News.\nThen, they employed Latent Dirichlet Allocation (LDA) models to retrieve pairs of similar sentences between sets of news\narticles that were grouped together around the same topic. 
For that, two LDA models were trained (for EP and for BP)\non external and large-scale collections of unannotated news articles from Portuguese and Brazilian news providers, respectively.\nThen, the authors defined a lower and upper threshold for the sentence similarity score of the retrieved pairs of sentences,\ntaking into account that high similarity scores correspond to sentences that contain almost the same content (paraphrase candidates),\nand low similarity scores correspond to sentences that are very different in content from each other (no-relation candidates).\nFrom the collection of pairs of sentences obtained at this stage, the authors performed some manual grammatical corrections\nand discarded some of the pairs wrongly retrieved. Furthermore, from a preliminary analysis made to the retrieved sentence pairs\nthe authors noticed that the number of contradictions retrieved during the previous stage was very low. Additionally, they also\nnoticed that event though paraphrases are not very frequent, they occur with some frequency in news articles. Consequently,\nin contrast with the majority of the currently available corpora for other languages, which consider as labels \u201cneutral\u201d, \u201centailment\u201d\nand \u201ccontradiction\u201d for the task of RTE, the authors of the ASSIN corpus decided to use as labels \u201cnone\u201d, \u201centailment\u201d and \u201cparaphrase\u201d.\nFinally, the manual annotation of pairs of sentences was performed by human annotators. At least four annotators were randomly\nselected to annotate each pair of sentences, which is done in two steps: (i) assigning a semantic similarity label (a score between 1 and 5,\nfrom unrelated to very similar); and (ii) providing an entailment label (one sentence entails the other, sentences are paraphrases,\nor no relation). 
Sentence pairs where at least three annotators do not agree on the entailment label were considered controversial\nand thus discarded from the gold standard annotations. The full dataset has 10,000 sentence pairs, half of which in Brazilian Portuguese\nand half in European Portuguese. Either language variant has 2,500 pairs for training, 500 for validation and 2,000 for testing.", "downloads": 869, "configs": {"full": {"config_name": "full", "sample_row": "{\"sentence_pair_id\": \"1\", \"premise\": \"\\\"A gente faz o aporte financeiro, \\\\u00e9 como se a...\", \"hypothesis\": \"\\\"Fernando Moraes afirma que n\\\\u00e3o tem v\\\\u00ednc...\", \"relatedness_score\": \"2.0\", \"entailment_judgment\": \"0\"}", "columns": ["sentence_pair_id", "premise", "hypothesis", "relatedness_score", "entailment_judgment"], "columns_mapping": {"sentence_pair_id": "sentence_pair_id", "premise": "premise", "hypothesis": "hypothesis", "relatedness_score": "relatedness_score", "entailment_judgment": "entailment_judgment"}, "dataset_description": "\nThe ASSIN (Avalia\u00e7\u00e3o de Similaridade Sem\u00e2ntica e INfer\u00eancia textual) corpus is a corpus annotated with pairs of sentences written in\nPortuguese that is suitable for the exploration of textual entailment and paraphrasing classifiers. The corpus contains pairs of sentences\nextracted from news articles written in European Portuguese (EP) and Brazilian Portuguese (BP), obtained from Google News Portugal\nand Brazil, respectively. To create the corpus, the authors started by collecting a set of news articles describing the\nsame event (one news article from Google News Portugal and another from Google News Brazil) from Google News.\nThen, they employed Latent Dirichlet Allocation (LDA) models to retrieve pairs of similar sentences between sets of news\narticles that were grouped together around the same topic. 
For that, two LDA models were trained (for EP and for BP)\non external and large-scale collections of unannotated news articles from Portuguese and Brazilian news providers, respectively.\nThen, the authors defined a lower and upper threshold for the sentence similarity score of the retrieved pairs of sentences,\ntaking into account that high similarity scores correspond to sentences that contain almost the same content (paraphrase candidates),\nand low similarity scores correspond to sentences that are very different in content from each other (no-relation candidates).\nFrom the collection of pairs of sentences obtained at this stage, the authors performed some manual grammatical corrections\nand discarded some of the pairs wrongly retrieved. Furthermore, from a preliminary analysis made to the retrieved sentence pairs\nthe authors noticed that the number of contradictions retrieved during the previous stage was very low. Additionally, they also\nnoticed that event though paraphrases are not very frequent, they occur with some frequency in news articles. Consequently,\nin contrast with the majority of the currently available corpora for other languages, which consider as labels \u201cneutral\u201d, \u201centailment\u201d\nand \u201ccontradiction\u201d for the task of RTE, the authors of the ASSIN corpus decided to use as labels \u201cnone\u201d, \u201centailment\u201d and \u201cparaphrase\u201d.\nFinally, the manual annotation of pairs of sentences was performed by human annotators. At least four annotators were randomly\nselected to annotate each pair of sentences, which is done in two steps: (i) assigning a semantic similarity label (a score between 1 and 5,\nfrom unrelated to very similar); and (ii) providing an entailment label (one sentence entails the other, sentences are paraphrases,\nor no relation). 
Sentence pairs where at least three annotators do not agree on the entailment label were considered controversial\nand thus discarded from the gold standard annotations. The full dataset has 10,000 sentence pairs, half of which in Brazilian Portuguese\nand half in European Portuguese. Either language variant has 2,500 pairs for training, 500 for validation and 2,000 for testing.\n", "dataset_name": "assin"}, "ptpt": {"config_name": "ptpt", "sample_row": "{\"sentence_pair_id\": \"1\", \"premise\": \"\\\"Relembre-se que o atleta estava afastado dos relv...\", \"hypothesis\": \"\\\"Andr\\\\u00e9 Gomes entra em campo quatro meses depo...\", \"relatedness_score\": \"3.5\", \"entailment_judgment\": \"0\"}", "columns": ["sentence_pair_id", "premise", "hypothesis", "relatedness_score", "entailment_judgment"], "columns_mapping": {"sentence_pair_id": "sentence_pair_id", "premise": "premise", "hypothesis": "hypothesis", "relatedness_score": "relatedness_score", "entailment_judgment": "entailment_judgment"}, "dataset_description": "\nThe ASSIN (Avalia\u00e7\u00e3o de Similaridade Sem\u00e2ntica e INfer\u00eancia textual) corpus is a corpus annotated with pairs of sentences written in\nPortuguese that is suitable for the exploration of textual entailment and paraphrasing classifiers. The corpus contains pairs of sentences\nextracted from news articles written in European Portuguese (EP) and Brazilian Portuguese (BP), obtained from Google News Portugal\nand Brazil, respectively. To create the corpus, the authors started by collecting a set of news articles describing the\nsame event (one news article from Google News Portugal and another from Google News Brazil) from Google News.\nThen, they employed Latent Dirichlet Allocation (LDA) models to retrieve pairs of similar sentences between sets of news\narticles that were grouped together around the same topic. 
For that, two LDA models were trained (for EP and for BP)\non external and large-scale collections of unannotated news articles from Portuguese and Brazilian news providers, respectively.\nThen, the authors defined a lower and upper threshold for the sentence similarity score of the retrieved pairs of sentences,\ntaking into account that high similarity scores correspond to sentences that contain almost the same content (paraphrase candidates),\nand low similarity scores correspond to sentences that are very different in content from each other (no-relation candidates).\nFrom the collection of pairs of sentences obtained at this stage, the authors performed some manual grammatical corrections\nand discarded some of the pairs wrongly retrieved. Furthermore, from a preliminary analysis made to the retrieved sentence pairs\nthe authors noticed that the number of contradictions retrieved during the previous stage was very low. Additionally, they also\nnoticed that event though paraphrases are not very frequent, they occur with some frequency in news articles. Consequently,\nin contrast with the majority of the currently available corpora for other languages, which consider as labels \u201cneutral\u201d, \u201centailment\u201d\nand \u201ccontradiction\u201d for the task of RTE, the authors of the ASSIN corpus decided to use as labels \u201cnone\u201d, \u201centailment\u201d and \u201cparaphrase\u201d.\nFinally, the manual annotation of pairs of sentences was performed by human annotators. At least four annotators were randomly\nselected to annotate each pair of sentences, which is done in two steps: (i) assigning a semantic similarity label (a score between 1 and 5,\nfrom unrelated to very similar); and (ii) providing an entailment label (one sentence entails the other, sentences are paraphrases,\nor no relation). 
Sentence pairs where at least three annotators do not agree on the entailment label were considered controversial\nand thus discarded from the gold standard annotations. The full dataset has 10,000 sentence pairs, half of which in Brazilian Portuguese\nand half in European Portuguese. Either language variant has 2,500 pairs for training, 500 for validation and 2,000 for testing.\n", "dataset_name": "assin"}, "ptbr": {"config_name": "ptbr", "sample_row": "{\"sentence_pair_id\": \"1\", \"premise\": \"\\\"A gente faz o aporte financeiro, \\\\u00e9 como se a...\", \"hypothesis\": \"\\\"Fernando Moraes afirma que n\\\\u00e3o tem v\\\\u00ednc...\", \"relatedness_score\": \"2.0\", \"entailment_judgment\": \"0\"}", "columns": ["sentence_pair_id", "premise", "hypothesis", "relatedness_score", "entailment_judgment"], "columns_mapping": {"sentence_pair_id": "sentence_pair_id", "premise": "premise", "hypothesis": "hypothesis", "relatedness_score": "relatedness_score", "entailment_judgment": "entailment_judgment"}, "dataset_description": "\nThe ASSIN (Avalia\u00e7\u00e3o de Similaridade Sem\u00e2ntica e INfer\u00eancia textual) corpus is a corpus annotated with pairs of sentences written in\nPortuguese that is suitable for the exploration of textual entailment and paraphrasing classifiers. The corpus contains pairs of sentences\nextracted from news articles written in European Portuguese (EP) and Brazilian Portuguese (BP), obtained from Google News Portugal\nand Brazil, respectively. To create the corpus, the authors started by collecting a set of news articles describing the\nsame event (one news article from Google News Portugal and another from Google News Brazil) from Google News.\nThen, they employed Latent Dirichlet Allocation (LDA) models to retrieve pairs of similar sentences between sets of news\narticles that were grouped together around the same topic. 
For that, two LDA models were trained (for EP and for BP)\non external and large-scale collections of unannotated news articles from Portuguese and Brazilian news providers, respectively.\nThen, the authors defined a lower and upper threshold for the sentence similarity score of the retrieved pairs of sentences,\ntaking into account that high similarity scores correspond to sentences that contain almost the same content (paraphrase candidates),\nand low similarity scores correspond to sentences that are very different in content from each other (no-relation candidates).\nFrom the collection of pairs of sentences obtained at this stage, the authors performed some manual grammatical corrections\nand discarded some of the pairs wrongly retrieved. Furthermore, from a preliminary analysis made to the retrieved sentence pairs\nthe authors noticed that the number of contradictions retrieved during the previous stage was very low. Additionally, they also\nnoticed that event though paraphrases are not very frequent, they occur with some frequency in news articles. Consequently,\nin contrast with the majority of the currently available corpora for other languages, which consider as labels \u201cneutral\u201d, \u201centailment\u201d\nand \u201ccontradiction\u201d for the task of RTE, the authors of the ASSIN corpus decided to use as labels \u201cnone\u201d, \u201centailment\u201d and \u201cparaphrase\u201d.\nFinally, the manual annotation of pairs of sentences was performed by human annotators. At least four annotators were randomly\nselected to annotate each pair of sentences, which is done in two steps: (i) assigning a semantic similarity label (a score between 1 and 5,\nfrom unrelated to very similar); and (ii) providing an entailment label (one sentence entails the other, sentences are paraphrases,\nor no relation). 
Sentence pairs where at least three annotators do not agree on the entailment label were considered controversial\nand thus discarded from the gold standard annotations. The full dataset has 10,000 sentence pairs, half of which in Brazilian Portuguese\nand half in European Portuguese. Either language variant has 2,500 pairs for training, 500 for validation and 2,000 for testing.\n", "dataset_name": "assin"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:natural-language-inference", "task_ids:semantic-similarity-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt"], "is_gated": false}, "assin2": {"dataset_name": "assin2", "description": "The ASSIN 2 corpus is composed of rather simple sentences. Following the procedures of SemEval 2014 Task 1.\nThe training and validation data are composed, respectively, of 6,500 and 500 sentence pairs in Brazilian Portuguese,\nannotated for entailment and semantic similarity. Semantic similarity values range from 1 to 5, and text entailment\nclasses are either entailment or none. The test data are composed of approximately 3,000 sentence pairs with the same\nannotation. All data were manually annotated.", "downloads": 1522, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence_pair_id\": \"1\", \"premise\": \"\\\"Uma crian\\\\u00e7a risonha est\\\\u00e1 segurando uma ...\", \"hypothesis\": \"\\\"Uma crian\\\\u00e7a est\\\\u00e1 segurando uma pistola ...\", \"relatedness_score\": \"4.5\", \"entailment_judgment\": \"1\"}", "columns": ["sentence_pair_id", "premise", "hypothesis", "relatedness_score", "entailment_judgment"], "columns_mapping": {"sentence_pair_id": "sentence_pair_id", "premise": "premise", "hypothesis": "hypothesis", "relatedness_score": "relatedness_score", "entailment_judgment": "entailment_judgment"}, "dataset_description": "\nThe ASSIN 2 corpus is composed of rather simple sentences. 
Following the procedures of SemEval 2014 Task 1.\nThe training and validation data are composed, respectively, of 6,500 and 500 sentence pairs in Brazilian Portuguese,\nannotated for entailment and semantic similarity. Semantic similarity values range from 1 to 5, and text entailment\nclasses are either entailment or none. The test data are composed of approximately 3,000 sentence pairs with the same\nannotation. All data were manually annotated.\n", "dataset_name": "assin2"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:natural-language-inference", "task_ids:semantic-similarity-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt"], "is_gated": false}, "facebook/babi_qa": {"dataset_name": "facebook/babi_qa", "description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. 
The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.", "downloads": 1945, "configs": {"en-qa1": {"config_name": "en-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Mary moved to the bathroom.\\\", \\\"John went to the ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"bathroom\\\", \\\"\\\", \\\"\\\", \\\"hallway\\\", \\\"\\\", \\\"\\\", \\\"h...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. 
The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "hn-qa1": {"config_name": "hn-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Sita gusalkhaney mein gayi.\\\", \\\"Priya sayanakaksh...\", \"story.supporting_ids\": \"[[], [], [\\\"2\\\"], [], [], [\\\"5\\\"], [], [], [\\\"7\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"sayanakaksh\\\", \\\"\\\", \\\"\\\", \\\"rasoi ghar\\\", \\\"\\\", ...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. 
The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "en-10k-qa1": {"config_name": "en-10k-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Mary moved to the bathroom.\\\", \\\"John went to the ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"bathroom\\\", \\\"\\\", \\\"\\\", \\\"hallway\\\", \\\"\\\", \\\"\\\", \\\"h...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. 
The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "en-valid-qa1": {"config_name": "en-valid-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Mary moved to the bathroom.\\\", \\\"John went to the ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"bathroom\\\", \\\"\\\", \\\"\\\", \\\"hallway\\\", \\\"\\\", \\\"\\\", \\\"h...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. 
The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "en-valid-10k-qa1": {"config_name": "en-valid-10k-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Mary moved to the bathroom.\\\", \\\"John went to the ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"bathroom\\\", \\\"\\\", \\\"\\\", \\\"hallway\\\", \\\"\\\", \\\"\\\", \\\"h...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. 
The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "hn-10k-qa1": {"config_name": "hn-10k-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Sita gusalkhaney mein gayi.\\\", \\\"Priya sayanakaksh...\", \"story.supporting_ids\": \"[[], [], [\\\"2\\\"], [], [], [\\\"5\\\"], [], [], [\\\"7\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"sayanakaksh\\\", \\\"\\\", \\\"\\\", \\\"rasoi ghar\\\", \\\"\\\", ...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. 
The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "shuffled-qa1": {"config_name": "shuffled-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Utxi ybnha qb qzh ptqzxbby.\\\", \\\"Hbzm jhmq qb qzh ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"ptqzxbby\\\", \\\"\\\", \\\"\\\", \\\"ztuujti\\\", \\\"\\\", \\\"\\\", \\\"z...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. 
The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "shuffled-10k-qa1": {"config_name": "shuffled-10k-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Utxi ybnha qb qzh ptqzxbby.\\\", \\\"Hbzm jhmq qb qzh ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"ptqzxbby\\\", \\\"\\\", \\\"\\\", \\\"ztuujti\\\", \\\"\\\", \\\"\\\", \\\"z...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. 
The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}}, "tags": ["task_categories:question-answering", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "chained-qa"], "is_gated": false}, "banking77": {"dataset_name": "banking77", "description": "BANKING77 dataset provides a very fine-grained set of intents in a banking domain.\nIt comprises 13,083 customer service queries labeled with 77 intents.\nIt focuses on fine-grained single-domain intent detection.", "downloads": 5093, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"I am still waiting on my card?\\\"\", \"label\": \"11\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "BANKING77 dataset provides a very fine-grained set of intents in a banking domain.\nIt comprises 13,083 customer service queries labeled with 77 intents.\nIt focuses on fine-grained single-domain intent detection.\n", "dataset_name": "banking77"}}, "tags": ["task_categories:text-classification", "task_ids:intent-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "bbaw_egyptian": {"dataset_name": "bbaw_egyptian", "description": "This dataset comprises parallel sentences of hieroglyphic encodings, transcription and translation\nas used in the paper Multi-Task Modeling of Phonographic Languages: Translating Middle Egyptian\nHieroglyph. 
The data triples are extracted from the digital corpus of Egyptian texts compiled by\nthe project \"Strukturen und Transformationen des Wortschatzes der \u00e4gyptischen Sprache\".", "downloads": 344, "configs": {"default": {"config_name": "default", "sample_row": "{\"transcription\": \"\\\"\\\\u2e22p\\\\u1e0f,wt-9\\\\u2e23 n =f [\\\\u2e2e\\\\u1e25tr...\", \"translation\": \"\\\"... die Neun-Bogenv\\\\u00f6lker ... zu ihm ... Pfer...\", \"hieroglyphs\": \"\\\"\\\"\"}", "columns": ["transcription", "translation", "hieroglyphs"], "columns_mapping": {"transcription": "transcription", "translation": "translation", "hieroglyphs": "hieroglyphs"}, "dataset_description": "This dataset comprises parallel sentences of hieroglyphic encodings, transcription and translation\nas used in the paper Multi-Task Modeling of Phonographic Languages: Translating Middle Egyptian\nHieroglyph. The data triples are extracted from the digital corpus of Egyptian texts compiled by\nthe project \"Strukturen und Transformationen des Wortschatzes der \u00e4gyptischen Sprache\".\n", "dataset_name": "bbaw_egyptian"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:extended|wikipedia", "language:de", "language:egy", "language:en"], "is_gated": false}, "bbc_hindi_nli": {"dataset_name": "bbc_hindi_nli", "description": "This dataset is used to train models for Natural Language Inference Tasks in Low-Resource Languages like Hindi.", "downloads": 380, "configs": {"bbc hindi nli": {"config_name": "bbc hindi nli", "sample_row": "{\"premise\": \"\\\"\\\\u0917\\\\u094b\\\\u092a\\\\u0928\\\\u0940\\\\u092f\\\\u0924\\\\u093e ...\", \"hypothesis\": \"\\\"\\\\u092f\\\\u0939 \\\\u0916\\\\u092c\\\\u0930 \\\\u0915\\\\u0940 \\\\u09...\", \"label\": \"1\", \"topic\": \"1\"}", "columns": ["premise", "hypothesis", "label", "topic"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "topic": "topic"}, 
"dataset_description": "This dataset is used to train models for Natural Language Inference Tasks in Low-Resource Languages like Hindi.\n", "dataset_name": "bbc_hindi_nli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:extended|bbc__hindi_news_classification", "language:hi"], "is_gated": false}, "bc2gm_corpus": {"dataset_name": "bc2gm_corpus", "description": "Nineteen teams presented results for the Gene Mention Task at the BioCreative II Workshop.\nIn this task participants designed systems to identify substrings in sentences corresponding to gene name mentions.\nA variety of different methods were used and the results varied with a highest achieved F1 score of 0.8721.\nHere we present brief descriptions of all the methods used and a statistical analysis of the results.\nWe also demonstrate that, by combining the results from all submissions, an F score of 0.9066 is feasible,\nand furthermore that the best result makes use of the lowest scoring submissions.\n\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/\n\nThe original dataset can be downloaded from: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-ii-corpus/\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll", "downloads": 670, "configs": {"bc2gm_corpus": {"config_name": "bc2gm_corpus", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Comparison\\\", \\\"with\\\", \\\"alkaline\\\", \\\"phosphatases\\\",...\", \"ner_tags\": \"[0, 0, 1, 2, 0, 1, 2, 2]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "Nineteen teams presented results for the Gene Mention Task at the BioCreative II Workshop.\nIn this task participants designed systems to identify 
substrings in sentences corresponding to gene name mentions.\nA variety of different methods were used and the results varied with a highest achieved F1 score of 0.8721.\nHere we present brief descriptions of all the methods used and a statistical analysis of the results.\nWe also demonstrate that, by combining the results from all submissions, an F score of 0.9066 is feasible,\nand furthermore that the best result makes use of the lowest scoring submissions.\n\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/\n\nThe original dataset can be downloaded from: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-ii-corpus/\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll\n", "dataset_name": "bc2gm_corpus"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "best2009": {"dataset_name": "best2009", "description": "`best2009` is a Thai word-tokenization dataset from encyclopedia, novels, news and articles by\n[NECTEC](https://www.nectec.or.th/) (148,995/2,252 lines of train/test). 
It was created for\n[BEST 2010: Word Tokenization Competition](https://thailang.nectec.or.th/archive/indexa290.html?q=node/10).\nThe test set answers are not provided publicly.", "downloads": 335, "configs": {"best2009": {"config_name": "best2009", "sample_row": "{\"fname\": \"\\\"article_00001.txt\\\"\", \"char\": \"[\\\"\\\\u0e01\\\", \\\"\\\\u0e0e\\\", \\\"\\\\u0e2b\\\", \\\"\\\\u0e21\\\", \\\"\\\\u0e32\\\",...\", \"char_type\": \"[1, 1, 3, 1, 10, 1, 1, 4, 1, 1, 10, 1, 11, 1, 10, ...\", \"is_beginning\": \"[1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0...\"}", "columns": ["fname", "char", "char_type", "is_beginning"], "columns_mapping": {"fname": "fname", "char": "char", "char_type": "char_type", "is_beginning": "is_beginning"}, "dataset_description": "`best2009` is a Thai word-tokenization dataset from encyclopedia, novels, news and articles by\n[NECTEC](https://www.nectec.or.th/) (148,995/2,252 lines of train/test). It was created for\n[BEST 2010: Word Tokenization Competition](https://thailang.nectec.or.th/archive/indexa290.html?q=node/10).\nThe test set answers are not provided publicly.\n", "dataset_name": "best2009"}}, "tags": ["task_categories:token-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:th", "word-tokenization"], "is_gated": false}, "bianet": {"dataset_name": "bianet", "description": "A parallel news corpus in Turkish, Kurdish and English.\nBianet collects 3,214 Turkish articles with their sentence-aligned Kurdish or English translations from the Bianet online newspaper.\n3 languages, 3 bitexts\ntotal number of files: 6\ntotal number of tokens: 2.25M\ntotal number of sentence fragments: 0.14M", "downloads": 669, "configs": {"en_to_ku": {"config_name": "en_to_ku", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Diyarbak\\\\u0131r 2nd Criminal Court of Peace has i...\", \"translation.ku\": \"\\\"Biryara qedexekirin\\\\u00ea di r\\\\u00fbpela Lijneya 
...\"}", "columns": ["id", "translation_en", "translation_ku"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.ku": "translation_ku"}, "dataset_description": "A parallel news corpus in Turkish, Kurdish and English.\nBianet collects 3,214 Turkish articles with their sentence-aligned Kurdish or English translations from the Bianet online newspaper.\n3 languages, 3 bitexts\ntotal number of files: 6\ntotal number of tokens: 2.25M\ntotal number of sentence fragments: 0.14M\n", "dataset_name": "bianet"}, "en_to_tr": {"config_name": "en_to_tr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"The members of FEMEN stripped their tops in a sch...\", \"translation.tr\": \"\\\"FEMEN \\\\u00fcyeleri \\\\u00dcsk\\\\u00fcdar'daki bir oku...\"}", "columns": ["id", "translation_en", "translation_tr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.tr": "translation_tr"}, "dataset_description": "A parallel news corpus in Turkish, Kurdish and English.\nBianet collects 3,214 Turkish articles with their sentence-aligned Kurdish or English translations from the Bianet online newspaper.\n3 languages, 3 bitexts\ntotal number of files: 6\ntotal number of tokens: 2.25M\ntotal number of sentence fragments: 0.14M\n", "dataset_name": "bianet"}, "ku_to_tr": {"config_name": "ku_to_tr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ku\": \"\\\"Biryara qedexekirin\\\\u00ea di r\\\\u00fbpela Lijneya ...\", \"translation.tr\": \"\\\"Karar\\\\u0131 duyuran Radyo ve Televizyon \\\\u00dcst ...\"}", "columns": ["id", "translation_ku", "translation_tr"], "columns_mapping": {"id": "id", "translation.ku": "translation_ku", "translation.tr": "translation_tr"}, "dataset_description": "A parallel news corpus in Turkish, Kurdish and English.\nBianet collects 3,214 Turkish articles with their sentence-aligned Kurdish or English translations from the Bianet online newspaper.\n3 languages, 3 bitexts\ntotal number of files: 
6\ntotal number of tokens: 2.25M\ntotal number of sentence fragments: 0.14M\n", "dataset_name": "bianet"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:en", "language:ku", "language:tr"], "is_gated": false}, "bible_para": {"dataset_name": "bible_para", "description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M", "downloads": 1193, "configs": {"de-en": {"config_name": "de-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Am Anfang schuf Gott Himmel und Erde.\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\"}", "columns": ["id", "translation_de", "translation_en"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.en": "translation_en"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\", \"translation.fr\": \"\\\"Au commencement, Dieu cr\\\\u00e9a les cieux et la t...\"}", "columns": ["id", "translation_en", "translation_fr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of 
files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}, "en-es": {"config_name": "en-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\", \"translation.es\": \"\\\"En el principio cre\\\\u00f3 Dios los cielos y la ti...\"}", "columns": ["id", "translation_en", "translation_es"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}, "en-fi": {"config_name": "en-fi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\", \"translation.fi\": \"\\\"Alussa loi Jumala taivaan ja maan.\\\"\"}", "columns": ["id", "translation_en", "translation_fi"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fi": "translation_fi"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}, "en-no": {"config_name": "en-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\", \"translation.no\": \"\\\"I begynnelsen skapte Gud himmelen og jorden.\\\"\"}", "columns": ["id", "translation_en", "translation_no"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.no": "translation_no"}, "dataset_description": "This is a 
multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}, "en-hi": {"config_name": "en-hi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\", \"translation.hi\": \"\\\"\\\\u0906\\\\u0926\\\\u093f \\\\u092e\\\\u0947\\\\u0902 \\\\u092a\\\\u093...\"}", "columns": ["id", "translation_en", "translation_hi"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.hi": "translation_hi"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:acu", "language:af", "language:agr", "language:ake", "language:am", "language:amu", "language:ar", "language:bg", "language:bsn", "language:cak", "language:ceb", "language:ch", "language:chq", "language:chr", "language:cjp", "language:cni", "language:cop", "language:crp", "language:cs", "language:da", "language:de", "language:dik", "language:dje", "language:djk", "language:dop", "language:ee", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fi", "language:fr", "language:gbi", "language:gd", "language:gu", "language:gv", "language:he", "language:hi", "language:hr", "language:hu", "language:hy", "language:id", "language:is", "language:it", "language:ja", "language:jak", "language:jiv", "language:kab", "language:kbh", "language:kek", "language:kn", 
"language:ko", "language:la", "language:lt", "language:lv", "language:mam", "language:mi", "language:ml", "language:mr", "language:my", "language:ne", "language:nhg", "language:nl", "language:no", "language:ojb", "language:pck", "language:pes", "language:pl", "language:plt", "language:pot", "language:ppk", "language:pt", "language:quc", "language:quw", "language:ro", "language:rom", "language:ru", "language:shi", "language:sk", "language:sl", "language:sn", "language:so", "language:sq", "language:sr", "language:ss", "language:sv", "language:syr", "language:te", "language:th", "language:tl", "language:tmh", "language:tr", "language:uk", "language:usp", "language:vi", "language:wal", "language:wo", "language:xh", "language:zh", "language:zu"], "is_gated": false}, "big_patent": {"dataset_name": "big_patent", "description": "BIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.", "downloads": 2904, "configs": {"all": {"config_name": "all", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n [0001] This ...\", \"abstract\": \"\\\"This invention relates to novel calcium phosphate...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. 
patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "a": {"config_name": "a", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n [0001] This ...\", \"abstract\": \"\\\"This invention relates to novel calcium phosphate...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. 
There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "b": {"config_name": "b", "sample_row": "{\"description\": \"\\\"BACKGROUND OF THE INVENTION \\\\n [0001] ...\", \"abstract\": \"\\\"A releasable fastener for an album to permit inse...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "c": {"config_name": "c", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n The present inventi...\", \"abstract\": \"\\\"The invention concerns a polypeptide selected fro...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. 
patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "d": {"config_name": "d", "sample_row": "{\"description\": \"\\\"BACKGROUND OF THE INVENTION \\\\n This invention...\", \"abstract\": \"\\\"A method of forming fiber mixtures from different...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. 
There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "e": {"config_name": "e", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n This invention broa...\", \"abstract\": \"\\\"A method and apparatus for achieving adiabatic he...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "f": {"config_name": "f", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n The present inventi...\", \"abstract\": \"\\\"A range for a recreational vehicle which is adapt...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. 
patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "g": {"config_name": "g", "sample_row": "{\"description\": \"\\\"CROSS-REFERENCE TO RELATED APPLICATIONS \\\\n ...\", \"abstract\": \"\\\"Methods and systems are provided for obtaining in...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "h": {"config_name": "h", "sample_row": "{\"description\": \"\\\"BACKGROUND OF THE INVENTION \\\\n 1. 
Field of th...\", \"abstract\": \"\\\"A field programmable gate array (FPGA) with pass ...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "y": {"config_name": "y", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n This invention rela...\", \"abstract\": \"\\\"A camouflage wrapping strip that takes the form o...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. 
There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}}, "tags": ["task_categories:summarization", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en", "patent-summarization"], "is_gated": false}, "biosses": {"dataset_name": "biosses", "description": "BIOSSES is a benchmark dataset for biomedical sentence similarity estimation. The dataset comprises 100 sentence pairs, in which each sentence was selected from the TAC (Text Analysis Conference) Biomedical Summarization Track Training Dataset containing articles from the biomedical domain. The sentence pairs were evaluated by five different human experts that judged their similarity and gave scores ranging from 0 (no relation) to 4 (equivalent).", "downloads": 1330, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence1\": \"\\\"Here, looking for agents that could specifically ...\", \"sentence2\": \"\\\"Not surprisingly, GATA2 knockdown in KRAS mutant ...\", \"score\": \"2.2\"}", "columns": ["sentence1", "sentence2", "score"], "columns_mapping": {"sentence1": "sentence1", "sentence2": "sentence2", "score": "score"}, "dataset_description": "BIOSSES is a benchmark dataset for biomedical sentence similarity estimation. The dataset comprises 100 sentence pairs, in which each sentence was selected from the TAC (Text Analysis Conference) Biomedical Summarization Track Training Dataset containing articles from the biomedical domain. 
The sentence pairs were evaluated by five different human experts that judged their similarity and gave scores ranging from 0 (no relation) to 4 (equivalent).\n", "dataset_name": "biosses"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:semantic-similarity-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "blended_skill_talk": {"dataset_name": "blended_skill_talk", "description": "A dataset of 7k conversations explicitly designed to exhibit multiple conversation modes: displaying personality, having empathy, and demonstrating knowledge.", "downloads": 615, "configs": {"default": {"config_name": "default", "sample_row": "{\"personas\": \"[\\\"i've 2 kids.\\\", \\\"i love flowers.\\\"]\", \"additional_context\": \"\\\"\\\"\", \"previous_utterance\": \"[\\\"I love live music, that's why I try to go to con...\", \"context\": \"\\\"empathetic_dialogues\\\"\", \"free_messages\": \"[\\\"I like acting, I hope to be an actor, what about...\", \"guided_messages\": \"[\\\"that is ok. have any kids?\\\", \\\"that is good. I h...\", \"suggestions.convai2\": \"[\\\"i love acting ! i'll be famous someday . what do...\", \"suggestions.empathetic_dialogues\": \"[\\\"Any favorite actors?\\\", \\\"One day.\\\", \\\"How long mus...\", \"suggestions.wizard_of_wikipedia\": \"[\\\"I would like to develop my acting skills. 
What a...\", \"guided_chosen_suggestions\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\"]\", \"label_candidates\": \"[]\"}", "columns": ["personas", "additional_context", "previous_utterance", "context", "free_messages", "guided_messages", "suggestions_convai2", "suggestions_empathetic_dialogues", "suggestions_wizard_of_wikipedia", "guided_chosen_suggestions", "label_candidates"], "columns_mapping": {"personas": "personas", "additional_context": "additional_context", "previous_utterance": "previous_utterance", "context": "context", "free_messages": "free_messages", "guided_messages": "guided_messages", "suggestions.convai2": "suggestions_convai2", "suggestions.empathetic_dialogues": "suggestions_empathetic_dialogues", "suggestions.wizard_of_wikipedia": "suggestions_wizard_of_wikipedia", "guided_chosen_suggestions": "guided_chosen_suggestions", "label_candidates": "label_candidates"}, "dataset_description": "A dataset of 7k conversations explicitly designed to exhibit multiple conversation modes: displaying personality, having empathy, and demonstrating knowledge.\n", "dataset_name": "blended_skill_talk"}}, "tags": ["task_categories:conversational", "task_ids:dialogue-generation", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "blimp": {"dataset_name": "blimp", "description": "BLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.", "downloads": 36055, "configs": {"adjunct_island": {"config_name": "adjunct_island", "sample_row": "{\"sentence_good\": \"\\\"Who should Derek hug after shocking Richard?\\\"\", \"sentence_bad\": \"\\\"Who should Derek hug Richard after shocking?\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"adjunct_island\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "anaphor_gender_agreement": {"config_name": "anaphor_gender_agreement", "sample_row": "{\"sentence_good\": \"\\\"Katherine can't help herself.\\\"\", \"sentence_bad\": \"\\\"Katherine can't help himself.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"anaphor_agreement\\\"\", \"UID\": \"\\\"anaphor_gender_agreement\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "anaphor_number_agreement": {"config_name": "anaphor_number_agreement", "sample_row": "{\"sentence_good\": \"\\\"Susan revealed herself.\\\"\", \"sentence_bad\": \"\\\"Susan revealed themselves.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"anaphor_agreement\\\"\", \"UID\": \"\\\"anaphor_number_agreement\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "animate_subject_passive": {"config_name": "animate_subject_passive", "sample_row": "{\"sentence_good\": \"\\\"Amanda was respected by some waitresses.\\\"\", \"sentence_bad\": \"\\\"Amanda was respected by some picture.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"s-selection\\\"\", \"UID\": \"\\\"animate_subject_passive\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "animate_subject_trans": {"config_name": "animate_subject_trans", "sample_row": "{\"sentence_good\": \"\\\"Tina revealed Margaret.\\\"\", \"sentence_bad\": \"\\\"The horse revealed Margaret.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"s-selection\\\"\", \"UID\": \"\\\"animate_subject_trans\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "causative": {"config_name": "causative", "sample_row": "{\"sentence_good\": \"\\\"Aaron breaks the glass.\\\"\", \"sentence_bad\": \"\\\"Aaron appeared the glass.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"causative\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "complex_NP_island": {"config_name": "complex_NP_island", "sample_row": "{\"sentence_good\": \"\\\"Who aren't most hospitals that hadn't talked abou...\", \"sentence_bad\": \"\\\"Who aren't most waitresses alarming most hospital...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"complex_NP_island\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "coordinate_structure_constraint_complex_left_branch": {"config_name": "coordinate_structure_constraint_complex_left_branch", "sample_row": "{\"sentence_good\": \"\\\"What senators was Alicia approaching and some tea...\", \"sentence_bad\": \"\\\"What was Alicia approaching senators and some tea...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"coordinate_structure_constraint_complex_left_bran...\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "coordinate_structure_constraint_object_extraction": {"config_name": "coordinate_structure_constraint_object_extraction", "sample_row": "{\"sentence_good\": \"\\\"Who were all men and Eric leaving?\\\"\", \"sentence_bad\": \"\\\"Who were all men leaving and Eric?\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"coordinate_structure_constraint_object_extraction...\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_1": {"config_name": "determiner_noun_agreement_1", "sample_row": "{\"sentence_good\": \"\\\"Raymond is selling this sketch.\\\"\", \"sentence_bad\": \"\\\"Raymond is selling this sketches.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_2": {"config_name": "determiner_noun_agreement_2", "sample_row": "{\"sentence_good\": \"\\\"Some dog stunned this committee.\\\"\", \"sentence_bad\": \"\\\"Some dog stunned these committee.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_irregular_1": {"config_name": "determiner_noun_agreement_irregular_1", "sample_row": "{\"sentence_good\": \"\\\"Laurie hasn't lifted those cacti.\\\"\", \"sentence_bad\": \"\\\"Laurie hasn't lifted those cactus.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_irregular_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_irregular_2": {"config_name": "determiner_noun_agreement_irregular_2", "sample_row": "{\"sentence_good\": \"\\\"All boys boast about that child.\\\"\", \"sentence_bad\": \"\\\"All boys boast about those child.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_irregular_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_with_adj_2": {"config_name": "determiner_noun_agreement_with_adj_2", "sample_row": "{\"sentence_good\": \"\\\"Cynthia scans these hard books.\\\"\", \"sentence_bad\": \"\\\"Cynthia scans this hard books.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_with_adj_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_with_adj_irregular_1": {"config_name": "determiner_noun_agreement_with_adj_irregular_1", "sample_row": "{\"sentence_good\": \"\\\"Some waiters broke this lost foot.\\\"\", \"sentence_bad\": \"\\\"Some waiters broke this lost feet.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_with_adj_irregular_1\\\"...\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_with_adj_irregular_2": {"config_name": "determiner_noun_agreement_with_adj_irregular_2", "sample_row": "{\"sentence_good\": \"\\\"Alexander didn't walk through that new oasis.\\\"\", \"sentence_bad\": \"\\\"Alexander didn't walk through those new oasis.\\\"...\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_with_adj_irregular_2\\\"...\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_with_adjective_1": {"config_name": "determiner_noun_agreement_with_adjective_1", "sample_row": "{\"sentence_good\": \"\\\"Rebecca was criticizing those good documentaries....\", \"sentence_bad\": \"\\\"Rebecca was criticizing those good documentary.\\\"...\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_with_adjective_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "distractor_agreement_relational_noun": {"config_name": "distractor_agreement_relational_noun", "sample_row": "{\"sentence_good\": \"\\\"A niece of most senators hasn't descended most sl...\", \"sentence_bad\": \"\\\"A niece of most senators haven't descended most s...\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"distractor_agreement_relational_noun\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "distractor_agreement_relative_clause": {"config_name": "distractor_agreement_relative_clause", "sample_row": "{\"sentence_good\": \"\\\"This customer who had visited most children has w...\", \"sentence_bad\": \"\\\"This customer who had visited most children have ...\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"distractor_agreement_relative_clause\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "drop_argument": {"config_name": "drop_argument", "sample_row": "{\"sentence_good\": \"\\\"Travis is touring.\\\"\", \"sentence_bad\": \"\\\"Travis is revealing.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"drop_argument\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "ellipsis_n_bar_1": {"config_name": "ellipsis_n_bar_1", "sample_row": "{\"sentence_good\": \"\\\"Dawn's ex-husband wasn't going to one rough groce...\", \"sentence_bad\": \"\\\"Dawn's ex-husband wasn't going to one grocery sto...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"ellipsis\\\"\", \"UID\": \"\\\"ellipsis_n_bar_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "ellipsis_n_bar_2": {"config_name": "ellipsis_n_bar_2", "sample_row": "{\"sentence_good\": \"\\\"A friend of Pamela hasn't attacked one person and...\", \"sentence_bad\": \"\\\"A friend of Pamela hasn't attacked one unemployed...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"ellipsis\\\"\", \"UID\": \"\\\"ellipsis_n_bar_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "existential_there_object_raising": {"config_name": "existential_there_object_raising", "sample_row": "{\"sentence_good\": \"\\\"William has declared there to be no guests gettin...\", \"sentence_bad\": \"\\\"William has obliged there to be no guests getting...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"control_raising\\\"\", \"UID\": \"\\\"existential_there_object_raising\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "existential_there_quantifiers_1": {"config_name": "existential_there_quantifiers_1", "sample_row": "{\"sentence_good\": \"\\\"There was a documentary about music irritating Al...\", \"sentence_bad\": \"\\\"There was each documentary about music irritating...\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"quantifiers\\\"\", \"UID\": \"\\\"existential_there_quantifiers_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "existential_there_quantifiers_2": {"config_name": "existential_there_quantifiers_2", "sample_row": "{\"sentence_good\": \"\\\"All convertibles weren't there existing.\\\"\", \"sentence_bad\": \"\\\"There weren't all convertibles existing.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"quantifiers\\\"\", \"UID\": \"\\\"existential_there_quantifiers_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "existential_there_subject_raising": {"config_name": "existential_there_subject_raising", "sample_row": "{\"sentence_good\": \"\\\"There is soon to be a cat existing.\\\"\", \"sentence_bad\": \"\\\"There is willing to be a cat existing.\\\"\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"control_raising\\\"\", \"UID\": \"\\\"existential_there_subject_raising\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "expletive_it_object_raising": {"config_name": "expletive_it_object_raising", "sample_row": "{\"sentence_good\": \"\\\"Tara would ascertain it to be noteworthy that Ken...\", \"sentence_bad\": \"\\\"Tara wouldn't entice it to be noteworthy that Ken...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"control_raising\\\"\", \"UID\": \"\\\"expletive_it_object_raising\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "inchoative": {"config_name": "inchoative", "sample_row": "{\"sentence_good\": \"\\\"Patricia had changed.\\\"\", \"sentence_bad\": \"\\\"Patricia had forgotten.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"inchoative\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "intransitive": {"config_name": "intransitive", "sample_row": "{\"sentence_good\": \"\\\"Todd can't yawn.\\\"\", \"sentence_bad\": \"\\\"Todd can't walk through.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"intransitive\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "irregular_past_participle_adjectives": {"config_name": "irregular_past_participle_adjectives", "sample_row": "{\"sentence_good\": \"\\\"The hidden offspring aren't confident.\\\"\", \"sentence_bad\": \"\\\"The hid offspring aren't confident.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"irregular_forms\\\"\", \"UID\": \"\\\"irregular_past_participle_adjectives\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "irregular_past_participle_verbs": {"config_name": "irregular_past_participle_verbs", "sample_row": "{\"sentence_good\": \"\\\"The Borgias wore a lot of scarves.\\\"\", \"sentence_bad\": \"\\\"The Borgias worn a lot of scarves.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"irregular_forms\\\"\", \"UID\": \"\\\"irregular_past_participle_verbs\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "irregular_plural_subject_verb_agreement_1": {"config_name": "irregular_plural_subject_verb_agreement_1", "sample_row": "{\"sentence_good\": \"\\\"Those radii have scared that teenager.\\\"\", \"sentence_bad\": \"\\\"Those radii has scared that teenager.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"irregular_plural_subject_verb_agreement_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "irregular_plural_subject_verb_agreement_2": {"config_name": "irregular_plural_subject_verb_agreement_2", "sample_row": "{\"sentence_good\": \"\\\"The women meet.\\\"\", \"sentence_bad\": \"\\\"The woman meet.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"irregular_plural_subject_verb_agreement_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "left_branch_island_echo_question": {"config_name": "left_branch_island_echo_question", "sample_row": "{\"sentence_good\": \"\\\"Irene had messed up whose rug?\\\"\", \"sentence_bad\": \"\\\"Whose had Irene messed up rug?\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"left_branch_island_echo_question\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "left_branch_island_simple_question": {"config_name": "left_branch_island_simple_question", "sample_row": "{\"sentence_good\": \"\\\"Whose museums had Dana alarmed?\\\"\", \"sentence_bad\": \"\\\"Whose had Dana alarmed museums?\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"left_branch_island_simple_question\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "matrix_question_npi_licensor_present": {"config_name": "matrix_question_npi_licensor_present", "sample_row": "{\"sentence_good\": \"\\\"Had Bruce ever played?\\\"\", \"sentence_bad\": \"\\\"Bruce had ever played.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"matrix_question_npi_licensor_present\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "npi_present_1": {"config_name": "npi_present_1", "sample_row": "{\"sentence_good\": \"\\\"Even Suzanne has really joked around.\\\"\", \"sentence_bad\": \"\\\"Even Suzanne has ever joked around.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"npi_present_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "npi_present_2": {"config_name": "npi_present_2", "sample_row": "{\"sentence_good\": \"\\\"Tamara really exited those mountains.\\\"\", \"sentence_bad\": \"\\\"Tamara ever exited those mountains.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"npi_present_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "only_npi_licensor_present": {"config_name": "only_npi_licensor_present", "sample_row": "{\"sentence_good\": \"\\\"Only Bill would ever complain.\\\"\", \"sentence_bad\": \"\\\"Even Bill would ever complain.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"only_npi_licensor_present\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "only_npi_scope": {"config_name": "only_npi_scope", "sample_row": "{\"sentence_good\": \"\\\"Only the grandsons of the Impressionists who Coll...\", \"sentence_bad\": \"\\\"The grandsons of the Impressionists who only Coll...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"only_npi_scope\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "passive_1": {"config_name": "passive_1", "sample_row": "{\"sentence_good\": \"\\\"Lucille's sisters are confused by Amy.\\\"\", \"sentence_bad\": \"\\\"Lucille's sisters are communicated by Amy.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"passive_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "passive_2": {"config_name": "passive_2", "sample_row": "{\"sentence_good\": \"\\\"A lot of nieces of some actor aren't scared.\\\"\", \"sentence_bad\": \"\\\"A lot of nieces of some actor aren't wept.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"passive_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_c_command": {"config_name": "principle_A_c_command", "sample_row": "{\"sentence_good\": \"\\\"A lot of patients who can sell some couch didn't ...\", \"sentence_bad\": \"\\\"A lot of patients who can sell some couch didn't ...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_c_command\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_case_1": {"config_name": "principle_A_case_1", "sample_row": "{\"sentence_good\": \"\\\"The teenagers explain that they aren't breaking a...\", \"sentence_bad\": \"\\\"The teenagers explain that themselves aren't brea...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_case_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_case_2": {"config_name": "principle_A_case_2", "sample_row": "{\"sentence_good\": \"\\\"Eric imagines himself taking every rug.\\\"\", \"sentence_bad\": \"\\\"Eric imagines himself took every rug.\\\"\", \"field\": \"\\\"syntax/semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_case_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_domain_1": {"config_name": "principle_A_domain_1", "sample_row": "{\"sentence_good\": \"\\\"Carla had explained that Samuel has discussed her...\", \"sentence_bad\": \"\\\"Carla had explained that Samuel has discussed her...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_domain_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_domain_2": {"config_name": "principle_A_domain_2", "sample_row": "{\"sentence_good\": \"\\\"Donald can imagine those college campuses are bor...\", \"sentence_bad\": \"\\\"Donald can imagine those college campuses are bor...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_domain_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_domain_3": {"config_name": "principle_A_domain_3", "sample_row": "{\"sentence_good\": \"\\\"Steven explains Kayla won't hurt herself.\\\"\", \"sentence_bad\": \"\\\"Kayla explains Steven won't hurt herself.\\\"\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_domain_3\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_reconstruction": {"config_name": "principle_A_reconstruction", "sample_row": "{\"sentence_good\": \"\\\"It's himself that this cashier attacked.\\\"\", \"sentence_bad\": \"\\\"It's himself that attacked this cashier.\\\"\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_reconstruction\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "regular_plural_subject_verb_agreement_1": {"config_name": "regular_plural_subject_verb_agreement_1", "sample_row": "{\"sentence_good\": \"\\\"Paula references Robert.\\\"\", \"sentence_bad\": \"\\\"Paula reference Robert.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"regular_plural_subject_verb_agreement_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "regular_plural_subject_verb_agreement_2": {"config_name": "regular_plural_subject_verb_agreement_2", "sample_row": "{\"sentence_good\": \"\\\"The students perform.\\\"\", \"sentence_bad\": \"\\\"The student perform.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"regular_plural_subject_verb_agreement_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "sentential_negation_npi_licensor_present": {"config_name": "sentential_negation_npi_licensor_present", "sample_row": "{\"sentence_good\": \"\\\"Teresa had not ever sold a movie theater.\\\"\", \"sentence_bad\": \"\\\"Teresa had probably ever sold a movie theater.\\\"...\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"sentential_negation_npi_licensor_present\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "sentential_negation_npi_scope": {"config_name": "sentential_negation_npi_scope", "sample_row": "{\"sentence_good\": \"\\\"The associations that had worried Cynthia have no...\", \"sentence_bad\": \"\\\"The associations that had not worried Cynthia hav...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"sentential_negation_npi_scope\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "sentential_subject_island": {"config_name": "sentential_subject_island", "sample_row": "{\"sentence_good\": \"\\\"Who had the patients' cleaning those banks upset....\", \"sentence_bad\": \"\\\"Who had the patients' cleaning upset those banks....\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"sentential_subject_island\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "superlative_quantifiers_1": {"config_name": "superlative_quantifiers_1", "sample_row": "{\"sentence_good\": \"\\\"No girl attacked fewer than two waiters.\\\"\", \"sentence_bad\": \"\\\"No girl attacked at most two waiters.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"quantifiers\\\"\", \"UID\": \"\\\"superlative_quantifiers_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "superlative_quantifiers_2": {"config_name": "superlative_quantifiers_2", "sample_row": "{\"sentence_good\": \"\\\"The teenager does tour at most nine restaurants.\\\"...\", \"sentence_bad\": \"\\\"No teenager does tour at most nine restaurants.\\\"...\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"quantifiers\\\"\", \"UID\": \"\\\"superlative_quantifiers_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "tough_vs_raising_1": {"config_name": "tough_vs_raising_1", "sample_row": "{\"sentence_good\": \"\\\"James is pleasant to flee from.\\\"\", \"sentence_bad\": \"\\\"James is apt to flee from.\\\"\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"control_raising\\\"\", \"UID\": \"\\\"tough_vs_raising_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "tough_vs_raising_2": {"config_name": "tough_vs_raising_2", "sample_row": "{\"sentence_good\": \"\\\"Every hospital isn't about to tempt Tiffany to re...\", \"sentence_bad\": \"\\\"Every hospital isn't fun to tempt Tiffany to refe...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"control_raising\\\"\", \"UID\": \"\\\"tough_vs_raising_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "transitive": {"config_name": "transitive", "sample_row": "{\"sentence_good\": \"\\\"Some turtles alarm Kimberley.\\\"\", \"sentence_bad\": \"\\\"Some turtles come here Kimberley.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"transitive\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_island": {"config_name": "wh_island", "sample_row": "{\"sentence_good\": \"\\\"Who have those men revealed they helped?\\\"\", \"sentence_bad\": \"\\\"Who have those men revealed who helped?\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"wh_island\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_questions_object_gap": {"config_name": "wh_questions_object_gap", "sample_row": "{\"sentence_good\": \"\\\"Joel discovered the vase that Patricia might take...\", \"sentence_bad\": \"\\\"Joel discovered what Patricia might take the vase...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_questions_object_gap\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_questions_subject_gap": {"config_name": "wh_questions_subject_gap", "sample_row": "{\"sentence_good\": \"\\\"Brian had questioned an association that can asto...\", \"sentence_bad\": \"\\\"Brian had questioned who an association can astou...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_questions_subject_gap\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_questions_subject_gap_long_distance": {"config_name": "wh_questions_subject_gap_long_distance", "sample_row": "{\"sentence_good\": \"\\\"Dennis has seen this tooth that Kristin wasn't co...\", \"sentence_bad\": \"\\\"Dennis has seen who this tooth that Kristin wasn'...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_questions_subject_gap_long_distance\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_vs_that_no_gap": {"config_name": "wh_vs_that_no_gap", "sample_row": "{\"sentence_good\": \"\\\"Mark figured out that most governments appreciate...\", \"sentence_bad\": \"\\\"Mark figured out who most governments appreciate ...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_vs_that_no_gap\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_vs_that_no_gap_long_distance": {"config_name": "wh_vs_that_no_gap_long_distance", "sample_row": "{\"sentence_good\": \"\\\"Every association figured out that most drivers t...\", \"sentence_bad\": \"\\\"Every association figured out who most drivers th...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_vs_that_no_gap_long_distance\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_vs_that_with_gap": {"config_name": "wh_vs_that_with_gap", "sample_row": "{\"sentence_good\": \"\\\"A lady has remembered who the actors conceal.\\\"\", \"sentence_bad\": \"\\\"A lady has remembered that the actors conceal.\\\"...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_vs_that_with_gap\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_vs_that_with_gap_long_distance": {"config_name": "wh_vs_that_with_gap_long_distance", "sample_row": "{\"sentence_good\": \"\\\"Kayla concealed who a lot of guests that were sca...\", \"sentence_bad\": \"\\\"Kayla concealed that a lot of guests that were sc...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_vs_that_with_gap_long_distance\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. 
The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}}, "tags": ["task_categories:text-classification", "task_ids:acceptability-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "blog_authorship_corpus": {"dataset_name": "blog_authorship_corpus", "description": "The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person.\n\nEach blog is presented as a separate file, the name of which indicates a blogger id# and the blogger\u2019s self-provided gender, age, industry and astrological sign. (All are labeled for gender and age but for many, industry and/or sign is marked as unknown.)\n\nAll bloggers included in the corpus fall into one of three age groups:\n- 8240 \"10s\" blogs (ages 13-17),\n- 8086 \"20s\" blogs (ages 23-27),\n- 2994 \"30s\" blogs (ages 33-47).\n\nFor each age group there are an equal number of male and female bloggers.\n\nEach blog in the corpus includes at least 200 occurrences of common English words. All formatting has been stripped with two exceptions. 
Individual posts within a single blogger are separated by the date of the following post and links within a post are denoted by the label urllink.\n\nThe corpus may be freely used for non-commercial research purposes.", "downloads": 406, "configs": {"blog_authorship_corpus": {"config_name": "blog_authorship_corpus", "sample_row": "{\"text\": \"\\\"Yeah, sorry for not writing for a whole there, bu...\", \"date\": \"\\\"23,November,2002\\\"\", \"gender\": \"\\\"female\\\"\", \"age\": \"17\", \"horoscope\": \"\\\"Libra\\\"\", \"job\": \"\\\"Student\\\"\"}", "columns": ["text", "date", "gender", "age", "horoscope", "job"], "columns_mapping": {"text": "text", "date": "date", "gender": "gender", "age": "age", "horoscope": "horoscope", "job": "job"}, "dataset_description": "The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person.\n\nEach blog is presented as a separate file, the name of which indicates a blogger id# and the blogger\u2019s self-provided gender, age, industry and astrological sign. (All are labeled for gender and age but for many, industry and/or sign is marked as unknown.)\n\nAll bloggers included in the corpus fall into one of three age groups:\n- 8240 \"10s\" blogs (ages 13-17),\n- 8086 \"20s\" blogs (ages 23-27),\n- 2994 \"30s\" blogs (ages 33-47).\n\nFor each age group there are an equal number of male and female bloggers.\n\nEach blog in the corpus includes at least 200 occurrences of common English words. All formatting has been stripped with two exceptions. 
Individual posts within a single blogger are separated by the date of the following post and links within a post are denoted by the label urllink.\n\nThe corpus may be freely used for non-commercial research purposes.\n", "dataset_name": "blog_authorship_corpus"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "bn_hate_speech": {"dataset_name": "bn_hate_speech", "description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and\nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. This is a subset of the full dataset.\n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.", "downloads": 1094, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"\\\\u0987\\\\u09a8\\\\u09bf\\\\u0987 \\\\u09b9\\\\u099a\\\\u09cd\\\\u099b...\", \"label\": \"3\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and\nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. 
This is a subset of the full dataset.\n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.\n", "dataset_name": "bn_hate_speech"}}, "tags": ["task_categories:text-classification", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:bn", "hate-speech-topic-classification"], "is_gated": false}, "bookcorpus": {"dataset_name": "bookcorpus", "description": "Books are a rich source of both fine-grained information, how a character, an object or a scene looks like, as well as high-level semantics, what someone is thinking, feeling and how these states evolve through a story.This work aims to align books to their movie releases in order to providerich descriptive explanations for visual content that go semantically farbeyond the captions available in current datasets. \\", "downloads": 16732, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\"usually , he would be tearing around the living r...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "Books are a rich source of both fine-grained information, how a character, an object or a scene looks like, as well as high-level semantics, what someone is thinking, feeling and how these states evolve through a story.This work aims to align books to their movie releases in order to providerich descriptive explanations for visual content that go semantically farbeyond the captions available in current datasets. 
", "dataset_name": "bookcorpus"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "boolq": {"dataset_name": "boolq", "description": "BoolQ is a question answering dataset for yes/no questions containing 15942 examples. These questions are naturally\noccurring ---they are generated in unprompted and unconstrained settings.\nEach example is a triplet of (question, passage, answer), with the title of the page as optional additional context.\nThe text-pair classification setup is similar to existing natural language inference tasks.", "downloads": 27215, "configs": {"default": {"config_name": "default", "sample_row": "{\"question\": \"\\\"do iran and afghanistan speak the same language\\\"...\", \"answer\": \"true\", \"passage\": \"\\\"Persian (/\\\\u02c8p\\\\u025c\\\\u02d0r\\\\u0292\\\\u0259n, -\\\\u0...\"}", "columns": ["question", "answer", "passage"], "columns_mapping": {"question": "question", "answer": "answer", "passage": "passage"}, "dataset_description": "BoolQ is a question answering dataset for yes/no questions containing 15942 examples. 
These questions are naturally\noccurring ---they are generated in unprompted and unconstrained settings.\nEach example is a triplet of (question, passage, answer), with the title of the page as optional additional context.\nThe text-pair classification setup is similar to existing natural language inference tasks.\n", "dataset_name": "boolq"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "bsd_ja_en": {"dataset_name": "bsd_ja_en", "description": "This is the Business Scene Dialogue (BSD) dataset,\na Japanese-English parallel corpus containing written conversations\nin various business scenarios.\n\nThe dataset was constructed in 3 steps:\n 1) selecting business scenes,\n 2) writing monolingual conversation scenarios according to the selected scenes, and\n 3) translating the scenarios into the other language.\n\nHalf of the monolingual scenarios were written in Japanese\nand the other half were written in English.\n\nFields:\n- id: dialogue identifier\n- no: sentence pair number within a dialogue\n- en_speaker: speaker name in English\n- ja_speaker: speaker name in Japanese\n- en_sentence: sentence in English\n- ja_sentence: sentence in Japanese\n- original_language: language in which monolingual scenario was written\n- tag: scenario\n- title: scenario title", "downloads": 443, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"190329_J07_03\\\"\", \"tag\": \"\\\"phone call\\\"\", \"title\": \"\\\"\\\\u4f1d\\\\u8a00\\\\u3078\\\\u306e\\\\u6298\\\\u308a\\\\u8fd4\\\\u3057\\\\...\", \"original_language\": \"\\\"ja\\\"\", \"no\": \"1\", \"en_speaker\": \"\\\"Doi-san\\\"\", \"ja_speaker\": \"\\\"\\\\u571f\\\\u4e95\\\\u3055\\\\u3093\\\"\", \"en_sentence\": \"\\\"Hi this is the systems development department of ...\", \"ja_sentence\": 
\"\\\"\\\\u306f\\\\u3044\\\\u3001K\\\\u793e\\\\u30b7\\\\u30b9\\\\u30c6\\\\u30e0...\"}", "columns": ["id", "tag", "title", "original_language", "no", "en_speaker", "ja_speaker", "en_sentence", "ja_sentence"], "columns_mapping": {"id": "id", "tag": "tag", "title": "title", "original_language": "original_language", "no": "no", "en_speaker": "en_speaker", "ja_speaker": "ja_speaker", "en_sentence": "en_sentence", "ja_sentence": "ja_sentence"}, "dataset_description": "This is the Business Scene Dialogue (BSD) dataset,\na Japanese-English parallel corpus containing written conversations\nin various business scenarios.\n\nThe dataset was constructed in 3 steps:\n 1) selecting business scenes,\n 2) writing monolingual conversation scenarios according to the selected scenes, and\n 3) translating the scenarios into the other language.\n\nHalf of the monolingual scenarios were written in Japanese\nand the other half were written in English.\n\nFields:\n- id: dialogue identifier\n- no: sentence pair number within a dialogue\n- en_speaker: speaker name in English\n- ja_speaker: speaker name in Japanese\n- en_sentence: sentence in English\n- ja_sentence: sentence in Japanese\n- original_language: language in which monolingual scenario was written\n- tag: scenario\n- title: scenario title\n", "dataset_name": "bsd_ja_en"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:en", "language:ja", "business-conversations-translation"], "is_gated": false}, "c3": {"dataset_name": "c3", "description": "Machine reading comprehension tasks require a machine reader to answer questions relevant to the given document. 
In this paper, we present the first free-form multiple-Choice Chinese machine reading Comprehension dataset (C^3), containing 13,369 documents (dialogues or more formally written mixed-genre texts) and their associated 19,577 multiple-choice free-form questions collected from Chinese-as-a-second-language examinations.\nWe present a comprehensive analysis of the prior knowledge (i.e., linguistic, domain-specific, and general world knowledge) needed for these real-world problems. We implement rule-based and popular neural methods and find that there is still a significant performance gap between the best performing model (68.5%) and human readers (96.0%), especially on problems that require prior knowledge. We further study the effects of distractor plausibility and data augmentation based on translated relevant datasets for English on model performance. We expect C^3 to present great challenges to existing systems as answering 86.8% of questions requires both knowledge within and beyond the accompanying document, and we hope that C^3 can serve as a platform to study how to leverage various kinds of prior knowledge to better understand a given written or orally oriented text.", "downloads": 1234, "configs": {"mixed": {"config_name": "mixed", "sample_row": "{\"documents\": \"[\\\"\\\\u8bb8\\\\u591a\\\\u52a8\\\\u7269\\\\u7684\\\\u67d0\\\\u4e9b\\\\u5668...\", \"document_id\": \"\\\"m13-70\\\"\", \"questions.question\": \"[\\\"\\\\u52a8\\\\u7269\\\\u7684\\\\u5668\\\\u5b98\\\\u611f\\\\u89c9\\\\u4e0e...\", \"questions.answer\": \"[\\\"\\\\u6bd4\\\\u4eba\\\\u7684\\\\u7075\\\\u654f\\\", \\\"\\\\u6c34\\\\u6bcd\\\",...\", \"questions.choice\": \"[[\\\"\\\\u6ca1\\\\u6709\\\\u4eba\\\\u7684\\\\u7075\\\\u654f\\\", \\\"\\\\u548c\\\\...\"}", "columns": ["documents", "document_id", "questions_question", "questions_answer", "questions_choice"], "columns_mapping": {"documents": "documents", "document_id": "document_id", "questions.question": "questions_question", "questions.answer": 
"questions_answer", "questions.choice": "questions_choice"}, "dataset_description": "Machine reading comprehension tasks require a machine reader to answer questions relevant to the given document. In this paper, we present the first free-form multiple-Choice Chinese machine reading Comprehension dataset (C^3), containing 13,369 documents (dialogues or more formally written mixed-genre texts) and their associated 19,577 multiple-choice free-form questions collected from Chinese-as-a-second-language examinations.\nWe present a comprehensive analysis of the prior knowledge (i.e., linguistic, domain-specific, and general world knowledge) needed for these real-world problems. We implement rule-based and popular neural methods and find that there is still a significant performance gap between the best performing model (68.5%) and human readers (96.0%), especially on problems that require prior knowledge. We further study the effects of distractor plausibility and data augmentation based on translated relevant datasets for English on model performance. 
We expect C^3 to present great challenges to existing systems as answering 86.8% of questions requires both knowledge within and beyond the accompanying document, and we hope that C^3 can serve as a platform to study how to leverage various kinds of prior knowledge to better understand a given written or orally oriented text.\n", "dataset_name": "c3"}, "dialog": {"config_name": "dialog", "sample_row": "{\"documents\": \"[\\\"\\\\u7537\\\\uff1a\\\\u4f60\\\\u4eca\\\\u5929\\\\u665a\\\\u4e0a\\\\u6709...\", \"document_id\": \"\\\"25-35\\\"\", \"questions.question\": \"[\\\"\\\\u5973\\\\u7684\\\\u6700\\\\u559c\\\\u6b22\\\\u54ea\\\\u79cd\\\\u7535...\", \"questions.answer\": \"[\\\"\\\\u559c\\\\u5267\\\\u7247\\\"]\", \"questions.choice\": \"[[\\\"\\\\u6050\\\\u6016\\\\u7247\\\", \\\"\\\\u7231\\\\u60c5\\\\u7247\\\", \\\"\\\\u5...\"}", "columns": ["documents", "document_id", "questions_question", "questions_answer", "questions_choice"], "columns_mapping": {"documents": "documents", "document_id": "document_id", "questions.question": "questions_question", "questions.answer": "questions_answer", "questions.choice": "questions_choice"}, "dataset_description": "Machine reading comprehension tasks require a machine reader to answer questions relevant to the given document. In this paper, we present the first free-form multiple-Choice Chinese machine reading Comprehension dataset (C^3), containing 13,369 documents (dialogues or more formally written mixed-genre texts) and their associated 19,577 multiple-choice free-form questions collected from Chinese-as-a-second-language examinations.\nWe present a comprehensive analysis of the prior knowledge (i.e., linguistic, domain-specific, and general world knowledge) needed for these real-world problems. We implement rule-based and popular neural methods and find that there is still a significant performance gap between the best performing model (68.5%) and human readers (96.0%), especially on problems that require prior knowledge. 
We further study the effects of distractor plausibility and data augmentation based on translated relevant datasets for English on model performance. We expect C^3 to present great challenges to existing systems as answering 86.8% of questions requires both knowledge within and beyond the accompanying document, and we hope that C^3 can serve as a platform to study how to leverage various kinds of prior knowledge to better understand a given written or orally oriented text.\n", "dataset_name": "c3"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:zh"], "is_gated": false}, "c4": {"dataset_name": "c4", "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.", "downloads": 68692, "configs": {"en": {"config_name": "en", "sample_row": "{\"text\": \"\\\"Beginners BBQ Class Taking Place in Missoula!\\\\nDo...\", \"timestamp\": \"\\\"2019-04-25T12:57:54Z\\\"\", \"url\": \"\\\"https://klyq.com/beginners-bbq-class-taking-place...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "dataset_name": "c4"}, "realnewslike": {"config_name": "realnewslike", "sample_row": "{\"text\": \"\\\"After the martyrdom of St. 
Boniface, Vergilius wa...\", \"timestamp\": \"\\\"2019-04-22T08:07:02Z\\\"\", \"url\": \"\\\"https://www.catholic.org/encyclopedia/view.php?id...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "dataset_name": "c4"}, "en.noblocklist": {"config_name": "en.noblocklist", "sample_row": "{\"text\": \"\\\"Beginners BBQ Class Taking Place in Missoula!\\\\nDo...\", \"timestamp\": \"\\\"2019-04-25T12:57:54Z\\\"\", \"url\": \"\\\"https://klyq.com/beginners-bbq-class-taking-place...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "dataset_name": "c4"}, "en.noclean": {"config_name": "en.noclean", "sample_row": "{\"text\": \"\\\"November 24, 2016 \\\\u2013 World News, Breaking New...\", \"timestamp\": \"\\\"2019-04-24T16:35:11Z\\\"\", \"url\": \"\\\"http://sevendaynews.com/2016/11/24/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "dataset_name": "c4"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original", 
"language:en"], "is_gated": false}, "caner": {"dataset_name": "caner", "description": "Classical Arabic Named Entity Recognition corpus as a new corpus of tagged data that can be useful for handling the issues in recognition of Arabic named entities.", "downloads": 347, "configs": {"default": {"config_name": "default", "sample_row": "{\"token\": \"\\\"\\\\u0627\\\\u0644\\\\u062c\\\\u0627\\\\u0645\\\\u0639\\\"\", \"ner_tag\": \"1\"}", "columns": ["token", "ner_tag"], "columns_mapping": {"token": "token", "ner_tag": "ner_tag"}, "dataset_description": "Classical Arabic Named Entity Recognition corpus as a new corpus of tagged data that can be useful for handling the issues in recognition of Arabic named entities.\n", "dataset_name": "caner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "casino": {"dataset_name": "casino", "description": "We provide a novel dataset (referred to as CaSiNo) of 1030 negotiation dialogues. Two participants take the role of campsite neighbors and negotiate for Food, Water, and Firewood packages, based on their individual preferences and requirements. This design keeps the task tractable, while still facilitating linguistically rich and personal conversations. This helps to overcome the limitations of prior negotiation datasets such as Deal or No Deal and Craigslist Bargain. Each dialogue consists of rich meta-data including participant demographics, personality, and their subjective evaluation of the negotiation in terms of satisfaction and opponent likeness.", "downloads": 338, "configs": {"default": {"config_name": "default", "sample_row": "{\"chat_logs\": \"[{\\\"text\\\": \\\"Hello! 
\\\\ud83d\\\\ude42 Let's work together...\", \"participant_info.mturk_agent_1.value2issue.Low\": \"\\\"Water\\\"\", \"participant_info.mturk_agent_1.value2issue.Medium\": \"\\\"Food\\\"\", \"participant_info.mturk_agent_1.value2issue.High\": \"\\\"Firewood\\\"\", \"participant_info.mturk_agent_1.value2reason.Low\": \"\\\"Our group has sufficient water from our complemen...\", \"participant_info.mturk_agent_1.value2reason.Medium\": \"\\\"Extra food will be needed to feed our larger than...\", \"participant_info.mturk_agent_1.value2reason.High\": \"\\\"We have a larger group than normal and therefore ...\", \"participant_info.mturk_agent_1.outcomes.points_scored\": \"19\", \"participant_info.mturk_agent_1.outcomes.satisfaction\": \"\\\"Slightly satisfied\\\"\", \"participant_info.mturk_agent_1.outcomes.opponent_likeness\": \"\\\"Slightly like\\\"\", \"participant_info.mturk_agent_1.demographics.age\": \"43\", \"participant_info.mturk_agent_1.demographics.gender\": \"\\\"male\\\"\", \"participant_info.mturk_agent_1.demographics.ethnicity\": \"\\\"white american\\\"\", \"participant_info.mturk_agent_1.demographics.education\": \"\\\"some 4 year college, bachelor's degree\\\"\", \"participant_info.mturk_agent_1.personality.svo\": \"\\\"proself\\\"\", \"participant_info.mturk_agent_1.personality.big-five.extraversion\": \"5.0\", \"participant_info.mturk_agent_1.personality.big-five.agreeableness\": \"6.0\", \"participant_info.mturk_agent_1.personality.big-five.conscientiousness\": \"6.0\", \"participant_info.mturk_agent_1.personality.big-five.emotional-stability\": \"5.0\", \"participant_info.mturk_agent_1.personality.big-five.openness-to-experiences\": \"5.5\", \"participant_info.mturk_agent_2.value2issue.Low\": \"\\\"Food\\\"\", \"participant_info.mturk_agent_2.value2issue.Medium\": \"\\\"Water\\\"\", \"participant_info.mturk_agent_2.value2issue.High\": \"\\\"Firewood\\\"\", \"participant_info.mturk_agent_2.value2reason.Low\": \"\\\"i'm on a diet, trying to lose 
weight.\\\"\", \"participant_info.mturk_agent_2.value2reason.Medium\": \"\\\"i'm dehydrated, and i need to drink constantly.\\\"...\", \"participant_info.mturk_agent_2.value2reason.High\": \"\\\"my dog has fleas, the fire repels them.\\\"\", \"participant_info.mturk_agent_2.outcomes.points_scored\": \"18\", \"participant_info.mturk_agent_2.outcomes.satisfaction\": \"\\\"Extremely satisfied\\\"\", \"participant_info.mturk_agent_2.outcomes.opponent_likeness\": \"\\\"Extremely like\\\"\", \"participant_info.mturk_agent_2.demographics.age\": \"22\", \"participant_info.mturk_agent_2.demographics.gender\": \"\\\"female\\\"\", \"participant_info.mturk_agent_2.demographics.ethnicity\": \"\\\"asian american\\\"\", \"participant_info.mturk_agent_2.demographics.education\": \"\\\"some 4 year college, bachelor's degree\\\"\", \"participant_info.mturk_agent_2.personality.svo\": \"\\\"proself\\\"\", \"participant_info.mturk_agent_2.personality.big-five.extraversion\": \"4.0\", \"participant_info.mturk_agent_2.personality.big-five.agreeableness\": \"6.0\", \"participant_info.mturk_agent_2.personality.big-five.conscientiousness\": \"5.5\", \"participant_info.mturk_agent_2.personality.big-five.emotional-stability\": \"3.0\", \"participant_info.mturk_agent_2.personality.big-five.openness-to-experiences\": \"7.0\", \"annotations\": \"[[\\\"Hello! 
\\\\ud83d\\\\ude42 Let's work together on a de...\"}", "columns": ["chat_logs", "participant_info_mturk_agent_1_value2issue_Low", "participant_info_mturk_agent_1_value2issue_Medium", "participant_info_mturk_agent_1_value2issue_High", "participant_info_mturk_agent_1_value2reason_Low", "participant_info_mturk_agent_1_value2reason_Medium", "participant_info_mturk_agent_1_value2reason_High", "participant_info_mturk_agent_1_outcomes_points_scored", "participant_info_mturk_agent_1_outcomes_satisfaction", "participant_info_mturk_agent_1_outcomes_opponent_likeness", "participant_info_mturk_agent_1_demographics_age", "participant_info_mturk_agent_1_demographics_gender", "participant_info_mturk_agent_1_demographics_ethnicity", "participant_info_mturk_agent_1_demographics_education", "participant_info_mturk_agent_1_personality_svo", "participant_info_mturk_agent_1_personality_big-five_extraversion", "participant_info_mturk_agent_1_personality_big-five_agreeableness", "participant_info_mturk_agent_1_personality_big-five_conscientiousness", "participant_info_mturk_agent_1_personality_big-five_emotional-stability", "participant_info_mturk_agent_1_personality_big-five_openness-to-experiences", "participant_info_mturk_agent_2_value2issue_Low", "participant_info_mturk_agent_2_value2issue_Medium", "participant_info_mturk_agent_2_value2issue_High", "participant_info_mturk_agent_2_value2reason_Low", "participant_info_mturk_agent_2_value2reason_Medium", "participant_info_mturk_agent_2_value2reason_High", "participant_info_mturk_agent_2_outcomes_points_scored", "participant_info_mturk_agent_2_outcomes_satisfaction", "participant_info_mturk_agent_2_outcomes_opponent_likeness", "participant_info_mturk_agent_2_demographics_age", "participant_info_mturk_agent_2_demographics_gender", "participant_info_mturk_agent_2_demographics_ethnicity", "participant_info_mturk_agent_2_demographics_education", "participant_info_mturk_agent_2_personality_svo", 
"participant_info_mturk_agent_2_personality_big-five_extraversion", "participant_info_mturk_agent_2_personality_big-five_agreeableness", "participant_info_mturk_agent_2_personality_big-five_conscientiousness", "participant_info_mturk_agent_2_personality_big-five_emotional-stability", "participant_info_mturk_agent_2_personality_big-five_openness-to-experiences", "annotations"], "columns_mapping": {"chat_logs": "chat_logs", "participant_info.mturk_agent_1.value2issue.Low": "participant_info_mturk_agent_1_value2issue_Low", "participant_info.mturk_agent_1.value2issue.Medium": "participant_info_mturk_agent_1_value2issue_Medium", "participant_info.mturk_agent_1.value2issue.High": "participant_info_mturk_agent_1_value2issue_High", "participant_info.mturk_agent_1.value2reason.Low": "participant_info_mturk_agent_1_value2reason_Low", "participant_info.mturk_agent_1.value2reason.Medium": "participant_info_mturk_agent_1_value2reason_Medium", "participant_info.mturk_agent_1.value2reason.High": "participant_info_mturk_agent_1_value2reason_High", "participant_info.mturk_agent_1.outcomes.points_scored": "participant_info_mturk_agent_1_outcomes_points_scored", "participant_info.mturk_agent_1.outcomes.satisfaction": "participant_info_mturk_agent_1_outcomes_satisfaction", "participant_info.mturk_agent_1.outcomes.opponent_likeness": "participant_info_mturk_agent_1_outcomes_opponent_likeness", "participant_info.mturk_agent_1.demographics.age": "participant_info_mturk_agent_1_demographics_age", "participant_info.mturk_agent_1.demographics.gender": "participant_info_mturk_agent_1_demographics_gender", "participant_info.mturk_agent_1.demographics.ethnicity": "participant_info_mturk_agent_1_demographics_ethnicity", "participant_info.mturk_agent_1.demographics.education": "participant_info_mturk_agent_1_demographics_education", "participant_info.mturk_agent_1.personality.svo": "participant_info_mturk_agent_1_personality_svo", 
"participant_info.mturk_agent_1.personality.big-five.extraversion": "participant_info_mturk_agent_1_personality_big-five_extraversion", "participant_info.mturk_agent_1.personality.big-five.agreeableness": "participant_info_mturk_agent_1_personality_big-five_agreeableness", "participant_info.mturk_agent_1.personality.big-five.conscientiousness": "participant_info_mturk_agent_1_personality_big-five_conscientiousness", "participant_info.mturk_agent_1.personality.big-five.emotional-stability": "participant_info_mturk_agent_1_personality_big-five_emotional-stability", "participant_info.mturk_agent_1.personality.big-five.openness-to-experiences": "participant_info_mturk_agent_1_personality_big-five_openness-to-experiences", "participant_info.mturk_agent_2.value2issue.Low": "participant_info_mturk_agent_2_value2issue_Low", "participant_info.mturk_agent_2.value2issue.Medium": "participant_info_mturk_agent_2_value2issue_Medium", "participant_info.mturk_agent_2.value2issue.High": "participant_info_mturk_agent_2_value2issue_High", "participant_info.mturk_agent_2.value2reason.Low": "participant_info_mturk_agent_2_value2reason_Low", "participant_info.mturk_agent_2.value2reason.Medium": "participant_info_mturk_agent_2_value2reason_Medium", "participant_info.mturk_agent_2.value2reason.High": "participant_info_mturk_agent_2_value2reason_High", "participant_info.mturk_agent_2.outcomes.points_scored": "participant_info_mturk_agent_2_outcomes_points_scored", "participant_info.mturk_agent_2.outcomes.satisfaction": "participant_info_mturk_agent_2_outcomes_satisfaction", "participant_info.mturk_agent_2.outcomes.opponent_likeness": "participant_info_mturk_agent_2_outcomes_opponent_likeness", "participant_info.mturk_agent_2.demographics.age": "participant_info_mturk_agent_2_demographics_age", "participant_info.mturk_agent_2.demographics.gender": "participant_info_mturk_agent_2_demographics_gender", "participant_info.mturk_agent_2.demographics.ethnicity": 
"participant_info_mturk_agent_2_demographics_ethnicity", "participant_info.mturk_agent_2.demographics.education": "participant_info_mturk_agent_2_demographics_education", "participant_info.mturk_agent_2.personality.svo": "participant_info_mturk_agent_2_personality_svo", "participant_info.mturk_agent_2.personality.big-five.extraversion": "participant_info_mturk_agent_2_personality_big-five_extraversion", "participant_info.mturk_agent_2.personality.big-five.agreeableness": "participant_info_mturk_agent_2_personality_big-five_agreeableness", "participant_info.mturk_agent_2.personality.big-five.conscientiousness": "participant_info_mturk_agent_2_personality_big-five_conscientiousness", "participant_info.mturk_agent_2.personality.big-five.emotional-stability": "participant_info_mturk_agent_2_personality_big-five_emotional-stability", "participant_info.mturk_agent_2.personality.big-five.openness-to-experiences": "participant_info_mturk_agent_2_personality_big-five_openness-to-experiences", "annotations": "annotations"}, "dataset_description": "We provide a novel dataset (referred to as CaSiNo) of 1030 negotiation dialogues. Two participants take the role of campsite neighbors and negotiate for Food, Water, and Firewood packages, based on their individual preferences and requirements. This design keeps the task tractable, while still facilitating linguistically rich and personal conversations. This helps to overcome the limitations of prior negotiation datasets such as Deal or No Deal and Craigslist Bargain. 
Each dialogue consists of rich meta-data including participant demographics, personality, and their subjective evaluation of the negotiation in terms of satisfaction and opponent likeness.\n", "dataset_name": "casino"}}, "tags": ["task_categories:conversational", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "catalonia_independence": {"dataset_name": "catalonia_independence", "description": "This dataset contains two corpora in Spanish and Catalan that consist of annotated Twitter messages for automatic stance detection. The data was collected over 12 days during February and March of 2019 from tweets posted in Barcelona, and during September of 2018 from tweets posted in the town of Terrassa, Catalonia.\n\nEach corpus is annotated with three classes: AGAINST, FAVOR and NEUTRAL, which express the stance towards the target - independence of Catalonia.", "downloads": 510, "configs": {"catalan": {"config_name": "catalan", "sample_row": "{\"id_str\": \"\\\"11028517837209518e+18\\\"\", \"TWEET\": \"\\\"En @fgarrobo ha fet m\\\\u00e9s per l\\\\u2019independe...\", \"LABEL\": \"0\"}", "columns": ["id_str", "TWEET", "LABEL"], "columns_mapping": {"id_str": "id_str", "TWEET": "TWEET", "LABEL": "LABEL"}, "dataset_description": "This dataset contains two corpora in Spanish and Catalan that consist of annotated Twitter messages for automatic stance detection. 
The data was collected over 12 days during February and March of 2019 from tweets posted in Barcelona, and during September of 2018 from tweets posted in the town of Terrassa, Catalonia.\n\nEach corpus is annotated with three classes: AGAINST, FAVOR and NEUTRAL, which express the stance towards the target - independence of Catalonia.\n", "dataset_name": "catalonia_independence"}, "spanish": {"config_name": "spanish", "sample_row": "{\"id_str\": \"\\\"1099284472267182080\\\"\", \"TWEET\": \"\\\"RT @EFEnoticias: Arrimadas se presenta a las gene...\", \"LABEL\": \"0\"}", "columns": ["id_str", "TWEET", "LABEL"], "columns_mapping": {"id_str": "id_str", "TWEET": "TWEET", "LABEL": "LABEL"}, "dataset_description": "This dataset contains two corpora in Spanish and Catalan that consist of annotated Twitter messages for automatic stance detection. The data was collected over 12 days during February and March of 2019 from tweets posted in Barcelona, and during September of 2018 from tweets posted in the town of Terrassa, Catalonia.\n\nEach corpus is annotated with three classes: AGAINST, FAVOR and NEUTRAL, which express the stance towards the target - independence of Catalonia.\n", "dataset_name": "catalonia_independence"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ca", "language:es", "stance-detection"], "is_gated": false}, "cbt": {"dataset_name": "cbt", "description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.", "downloads": 1667, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"title\": \"\\\"Andrew_Lang___Prince_Prigio.txt.out\\\"\", \"content\": \"\\\"CHAPTER I. 
-LCB- Chapter heading picture : p1.jpg...\"}", "columns": ["title", "content"], "columns_mapping": {"title": "title", "content": "content"}, "dataset_description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.\n", "dataset_name": "cbt"}, "V": {"config_name": "V", "sample_row": "{\"sentences\": \"[\\\"This vexed the king even more than the queen , w...\", \"question\": \"\\\"`` They are very kind old ladies in their way , '...\", \"answer\": \"\\\"said\\\"\", \"options\": \"[\\\"christening\\\", \\\"existed\\\", \\\"hear\\\", \\\"knows\\\", \\\"read\\\"...\"}", "columns": ["sentences", "question", "answer", "options"], "columns_mapping": {"sentences": "sentences", "question": "question", "answer": "answer", "options": "options"}, "dataset_description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.\n", "dataset_name": "cbt"}, "P": {"config_name": "P", "sample_row": "{\"sentences\": \"[\\\"CHAPTER I. -LCB- Chapter heading picture : p1.jp...\", \"question\": \"\\\"`` You have not forgotten any XXXXX our aunts ? 
'...\", \"answer\": \"\\\"of\\\"\", \"options\": \"[\\\"With\\\", \\\"before\\\", \\\"in\\\", \\\"of\\\", \\\"on\\\", \\\"than\\\", \\\"that...\"}", "columns": ["sentences", "question", "answer", "options"], "columns_mapping": {"sentences": "sentences", "question": "question", "answer": "answer", "options": "options"}, "dataset_description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.\n", "dataset_name": "cbt"}, "NE": {"config_name": "NE", "sample_row": "{\"sentences\": \"[\\\"Some were abroad ; several were ill ; a few were...\", \"question\": \"\\\"I think I 've told you that his name was XXXXX --...\", \"answer\": \"\\\"Prigio\\\"\", \"options\": \"[\\\"CHAPTER\\\", \\\"Flitter\\\", \\\"Prigio\\\", \\\"Saracens\\\", \\\"lumb...\"}", "columns": ["sentences", "question", "answer", "options"], "columns_mapping": {"sentences": "sentences", "question": "question", "answer": "answer", "options": "options"}, "dataset_description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.\n", "dataset_name": "cbt"}, "CN": {"config_name": "CN", "sample_row": "{\"sentences\": \"[\\\"With almost everything else to make them happy ,...\", \"question\": \"\\\"replied the XXXXX ; for the king 's aunts were ol...\", \"answer\": \"\\\"queen\\\"\", \"options\": \"[\\\"ancestors\\\", \\\"baby\\\", \\\"boy\\\", \\\"everyone\\\", \\\"fairies\\\"...\"}", "columns": ["sentences", "question", "answer", "options"], "columns_mapping": {"sentences": "sentences", "question": "question", "answer": "answer", "options": "options"}, "dataset_description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that 
are freely available.\n", "dataset_name": "cbt"}}, "tags": ["task_categories:other", "task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "cc100": {"dataset_name": "cc100", "description": "This corpus is an attempt to recreate the dataset used for training XLM-R. This corpus comprises of monolingual data for 100+ languages and also includes data for romanized languages (indicated by *_rom). This was constructed using the urls and paragraph indices provided by the CC-Net repository by processing January-December 2018 Commoncrawl snapshots. Each file comprises of documents separated by double-newlines and paragraphs within the same document separated by a newline. The data is generated using the open source CC-Net repository. No claims of intellectual property are made on the work of preparation of the corpus.", "downloads": 9293, "configs": {"am": {"config_name": "am", "sample_row": "{\"id\": \"\\\"0\\\"\", \"text\": \"\\\"\\\\u1270\\\\u1208\\\\u12cb\\\\u12cb\\\\u132d \\\\u12e8\\\\u130d\\\\u12f5...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "This corpus is an attempt to recreate the dataset used for training XLM-R. This corpus comprises of monolingual data for 100+ languages and also includes data for romanized languages (indicated by *_rom). This was constructed using the urls and paragraph indices provided by the CC-Net repository by processing January-December 2018 Commoncrawl snapshots. Each file comprises of documents separated by double-newlines and paragraphs within the same document separated by a newline. The data is generated using the open source CC-Net repository. 
No claims of intellectual property are made on the work of preparation of the corpus.\n", "dataset_name": "cc100"}, "sr": {"config_name": "sr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"text\": \"\\\"\\\\u2626 \\\\u041e\\\\u0431\\\\u043d\\\\u0430\\\\u0432\\\\u0459\\\\u0430...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "This corpus is an attempt to recreate the dataset used for training XLM-R. This corpus comprises of monolingual data for 100+ languages and also includes data for romanized languages (indicated by *_rom). This was constructed using the urls and paragraph indices provided by the CC-Net repository by processing January-December 2018 Commoncrawl snapshots. Each file comprises of documents separated by double-newlines and paragraphs within the same document separated by a newline. The data is generated using the open source CC-Net repository. No claims of intellectual property are made on the work of preparation of the corpus.\n", "dataset_name": "cc100"}, "ka": {"config_name": "ka", "sample_row": "{\"id\": \"\\\"0\\\"\", \"text\": \"\\\"\\\\u10d4\\\\u10e0\\\\u10dd\\\\u10d5\\\\u10dc\\\\u10e3\\\\u10da\\\\u10d8 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "This corpus is an attempt to recreate the dataset used for training XLM-R. This corpus comprises of monolingual data for 100+ languages and also includes data for romanized languages (indicated by *_rom). This was constructed using the urls and paragraph indices provided by the CC-Net repository by processing January-December 2018 Commoncrawl snapshots. Each file comprises of documents separated by double-newlines and paragraphs within the same document separated by a newline. The data is generated using the open source CC-Net repository. 
No claims of intellectual property are made on the work of preparation of the corpus.\n", "dataset_name": "cc100"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original", "language:af", "language:am", "language:ar", "language:as", "language:az", "language:be", "language:bg", "language:bn", "language:br", "language:bs", "language:ca", "language:cs", "language:cy", "language:da", "language:de", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:ff", "language:fi", "language:fr", "language:fy", "language:ga", "language:gd", "language:gl", "language:gn", "language:gu", "language:ha", "language:he", "language:hi", "language:hr", "language:ht", "language:hu", "language:hy", "language:id", "language:ig", "language:is", "language:it", "language:ja", "language:jv", "language:ka", "language:kk", "language:km", "language:kn", "language:ko", "language:ku", "language:ky", "language:la", "language:lg", "language:li", "language:ln", "language:lo", "language:lt", "language:lv", "language:mg", "language:mk", "language:ml", "language:mn", "language:mr", "language:ms", "language:my", "language:ne", "language:nl", "language:no", "language:ns", "language:om", "language:or", "language:pa", "language:pl", "language:ps", "language:pt", "language:qu", "language:rm", "language:ro", "language:ru", "language:sa", "language:sc", "language:sd", "language:si", "language:sk", "language:sl", "language:so", "language:sq", "language:sr", "language:ss", "language:su", "language:sv", "language:sw", "language:ta", "language:te", "language:th", "language:tl", "language:tn", "language:tr", "language:ug", "language:uk", "language:ur", "language:uz", "language:vi", "language:wo", "language:xh", "language:yi", "language:yo", "language:zh", "language:zu"], 
"is_gated": false}, "cc_news": {"dataset_name": "cc_news", "description": "CC-News containing news articles from news sites all over the world The data is available on AWS S3 in the Common Crawl bucket at /crawl-data/CC-NEWS/. This version of the dataset has 708241 articles. It represents a small portion of English language subset of the CC-News dataset created using news-please(Hamborg et al.,2017) to collect and extract English language portion of CC-News.", "downloads": 2681, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"title\": \"\\\"Daughter Duo is Dancing in The Same Company\\\"\", \"text\": \"\\\"There's a surprising twist to Regina Willoughby's...\", \"domain\": \"\\\"www.pointemagazine.com\\\"\", \"date\": \"\\\"2017-12-11 20:19:05\\\"\", \"description\": \"\\\"There's a surprising twist to Regina Willoughby's...\", \"url\": \"\\\"http://www.pointemagazine.com/mother-daughter-duo...\", \"image_url\": \"\\\"https://pointe-img.rbl.ms/simage/https%3A%2F%2Fas...\"}", "columns": ["title", "text", "domain", "date", "description", "url", "image_url"], "columns_mapping": {"title": "title", "text": "text", "domain": "domain", "date": "date", "description": "description", "url": "url", "image_url": "image_url"}, "dataset_description": "CC-News containing news articles from news sites all over the world The data is available on AWS S3 in the Common Crawl bucket at /crawl-data/CC-NEWS/. This version of the dataset has 708241 articles. 
It represents a small portion of English language subset of the CC-News dataset created using news-please(Hamborg et al.,2017) to collect and extract English language portion of CC-News.\n", "dataset_name": "cc_news"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ccaligned_multilingual": {"dataset_name": "ccaligned_multilingual", "description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).", "downloads": 1220, "configs": {"documents-zz_TR": {"config_name": "documents-zz_TR", "sample_row": "{\"Domain\": \"\\\"wext.it\\\"\", \"Source_URL\": \"\\\"http://wext.it/en/\\\"\", \"Target_URL\": \"\\\"https://wext.it/\\\"\", \"translation.en_XX\": \"\\\"wext.it|wext.it|Software WEXT|Blockchain, artific...\", \"translation.zz_TR\": \"\\\"Wext|\\\"\"}", "columns": ["Domain", "Source_URL", "Target_URL", "translation_en_XX", "translation_zz_TR"], "columns_mapping": {"Domain": "Domain", "Source_URL": "Source_URL", "Target_URL": "Target_URL", "translation.en_XX": "translation_en_XX", "translation.zz_TR": "translation_zz_TR"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. 
These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}, "sentences-zz_TR": {"config_name": "sentences-zz_TR", "sample_row": "{\"translation.en_XX\": \"\\\"YADORU KYOTO Kagami no Yado _Official Site_YADORU...\", \"translation.zz_TR\": \"\\\"Washi No YadoKagami No YadoKanade No YadoMizunoe ...\", \"LASER_similarity\": \"1.1320143\"}", "columns": ["translation_en_XX", "translation_zz_TR", "LASER_similarity"], "columns_mapping": {"translation.en_XX": "translation_en_XX", "translation.zz_TR": "translation_zz_TR", "LASER_similarity": "LASER_similarity"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. 
Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}, "documents-tz_MA": {"config_name": "documents-tz_MA", "sample_row": "{\"Domain\": \"\\\"kasahorow.org\\\"\", \"Source_URL\": \"\\\"https://men.kasahorow.org/app/b\\\"\", \"Target_URL\": \"\\\"http://tzm.kasahorow.org/app/b\\\"\", \"translation.en_XX\": \"\\\"Read _ Mende kasahorow|Menu|Alikamisa. Saa 19, 20...\", \"translation.tz_MA\": \"\\\"Read _ Tamazight kasahorow|Menu|\\\\u2d30\\\\u2d59\\\\u2d3...\"}", "columns": ["Domain", "Source_URL", "Target_URL", "translation_en_XX", "translation_tz_MA"], "columns_mapping": {"Domain": "Domain", "Source_URL": "Source_URL", "Target_URL": "Target_URL", "translation.en_XX": "translation_en_XX", "translation.tz_MA": "translation_tz_MA"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. 
Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}, "sentences-tz_MA": {"config_name": "sentences-tz_MA", "sample_row": "{\"translation.en_XX\": \"\\\"Tuesday 18 June 2019 _ 14:41\\\"\", \"translation.tz_MA\": \"\\\"\\\\u2d30\\\\u2d3d\\\\u2d61\\\\u2d30\\\\u2d59 14 \\\\u2d4f\\\\u2d53\\\\u2...\", \"LASER_similarity\": \"1.2042842\"}", "columns": ["translation_en_XX", "translation_tz_MA", "LASER_similarity"], "columns_mapping": {"translation.en_XX": "translation_en_XX", "translation.tz_MA": "translation_tz_MA", "LASER_similarity": "LASER_similarity"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. 
Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}, "documents-ak_GH": {"config_name": "documents-ak_GH", "sample_row": "{\"Domain\": \"\\\"islamhouse.com\\\"\", \"Source_URL\": \"\\\"https://islamhouse.com/en/audios/373088/\\\"\", \"Target_URL\": \"\\\"https://islamhouse.com/ak/audios/373088/\\\"\", \"translation.en_XX\": \"\\\"SUMMARY in the jurisprudence of Umrah - Arabic - ...\", \"translation.ak_GH\": \"\\\"Ntwatiaa / w\\\\u0254ab\\\\u0254 no t\\\\u0254fa w\\\\u0254 m...\"}", "columns": ["Domain", "Source_URL", "Target_URL", "translation_en_XX", "translation_ak_GH"], "columns_mapping": {"Domain": "Domain", "Source_URL": "Source_URL", "Target_URL": "Target_URL", "translation.en_XX": "translation_en_XX", "translation.ak_GH": "translation_ak_GH"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. 
Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}, "sentences-ak_GH": {"config_name": "sentences-ak_GH", "sample_row": "{\"translation.ak_GH\": \"\\\"Salah (nyamefere) ye Mmerebeia\\\"\", \"translation.en_XX\": \"\\\"What he dislikes when fasting (10)\\\"\", \"LASER_similarity\": \"1.4549942\"}", "columns": ["translation_ak_GH", "translation_en_XX", "LASER_similarity"], "columns_mapping": {"translation.ak_GH": "translation_ak_GH", "translation.en_XX": "translation_en_XX", "LASER_similarity": "LASER_similarity"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. 
Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}}, "tags": ["task_categories:other", "annotations_creators:no-annotation", "multilinguality:translation", "source_datasets:original", "language:af", "language:ak", "language:am", "language:ar", "language:as", "language:ay", "language:az", "language:be", "language:bg", "language:bm", "language:bn", "language:br", "language:bs", "language:ca", "language:ceb", "language:ckb", "language:cs", "language:cy", "language:de", "language:dv", "language:el", "language:eo", "language:es", "language:fa", "language:ff", "language:fi", "language:fo", "language:fr", "language:fy", "language:ga", "language:gl", "language:gn", "language:gu", "language:he", "language:hi", "language:hr", "language:hu", "language:id", "language:ig", "language:is", "language:it", "language:iu", "language:ja", "language:ka", "language:kac", "language:kg", "language:kk", "language:km", "language:kn", "language:ko", "language:ku", "language:ky", "language:la", "language:lg", "language:li", "language:ln", "language:lo", "language:lt", "language:lv", "language:mg", "language:mi", "language:mk", "language:ml", "language:mn", "language:mr", "language:ms", "language:mt", "language:my", "language:ne", "language:nl", "language:no", "language:nso", "language:ny", "language:om", "language:or", "language:pa", "language:pl", "language:ps", "language:pt", "language:rm", "language:ro", "language:ru", "language:rw", "language:sc", "language:sd", "language:se", "language:shn", "language:si", "language:sk", "language:sl", "language:sn", "language:so", "language:sq", "language:sr", "language:ss", "language:st", "language:su", "language:sv", "language:sw", "language:syc", "language:szl", "language:ta", "language:te", "language:tg", "language:th", 
"language:ti", "language:tl", "language:tn", "language:tr", "language:ts", "language:tt", "language:ug", "language:uk", "language:ur", "language:uz", "language:ve", "language:vi", "language:war", "language:wo", "language:xh", "language:yi", "language:yo", "language:zgh", "language:zh", "language:zu", "language:zza"], "is_gated": false}, "cdsc": {"dataset_name": "cdsc", "description": "Polish CDSCorpus consists of 10K Polish sentence pairs which are human-annotated for semantic relatedness and entailment. The dataset may be used for the evaluation of compositional distributional semantics models of Polish. The dataset was presented at ACL 2017. Please refer to the Wr\u00f3blewska and Krasnowska-Kiera\u015b (2017) for a detailed description of the resource.", "downloads": 507, "configs": {"cdsc-e": {"config_name": "cdsc-e", "sample_row": "{\"pair_ID\": \"1\", \"sentence_A\": \"\\\"Ch\\\\u0142opiec w czerwonych trampkach skacze wysok...\", \"sentence_B\": \"\\\"Ch\\\\u0142opiec w bluzce w paski podskakuje wysoko ...\", \"entailment_judgment\": \"0\"}", "columns": ["pair_ID", "sentence_A", "sentence_B", "entailment_judgment"], "columns_mapping": {"pair_ID": "pair_ID", "sentence_A": "sentence_A", "sentence_B": "sentence_B", "entailment_judgment": "entailment_judgment"}, "dataset_description": "Polish CDSCorpus consists of 10K Polish sentence pairs which are human-annotated for semantic relatedness and entailment. The dataset may be used for the evaluation of compositional distributional semantics models of Polish. The dataset was presented at ACL 2017. 
Please refer to the Wr\u00f3blewska and Krasnowska-Kiera\u015b (2017) for a detailed description of the resource.\n", "dataset_name": "cdsc"}, "cdsc-r": {"config_name": "cdsc-r", "sample_row": "{\"pair_ID\": \"1\", \"sentence_A\": \"\\\"Ch\\\\u0142opiec w czerwonych trampkach skacze wysok...\", \"sentence_B\": \"\\\"Ch\\\\u0142opiec w bluzce w paski podskakuje wysoko ...\", \"relatedness_score\": \"3.0\"}", "columns": ["pair_ID", "sentence_A", "sentence_B", "relatedness_score"], "columns_mapping": {"pair_ID": "pair_ID", "sentence_A": "sentence_A", "sentence_B": "sentence_B", "relatedness_score": "relatedness_score"}, "dataset_description": "Polish CDSCorpus consists of 10K Polish sentence pairs which are human-annotated for semantic relatedness and entailment. The dataset may be used for the evaluation of compositional distributional semantics models of Polish. The dataset was presented at ACL 2017. Please refer to the Wr\u00f3blewska and Krasnowska-Kiera\u015b (2017) for a detailed description of the resource.\n", "dataset_name": "cdsc"}}, "tags": ["task_categories:other", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl", "sentences entailment and relatedness"], "is_gated": false}, "cdt": {"dataset_name": "cdt", "description": "The Cyberbullying Detection task was part of 2019 edition of PolEval competition. The goal is to predict if a given Twitter message contains a cyberbullying (harmful) content.", "downloads": 356, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence\": \"\\\"Dla mnie faworytem do tytu\\\\u0142u b\\\\u0119dzie Cra...\", \"target\": \"0\"}", "columns": ["sentence", "target"], "columns_mapping": {"sentence": "sentence", "target": "target"}, "dataset_description": "The Cyberbullying Detection task was part of 2019 edition of PolEval competition. 
The goal is to predict if a given Twitter message contains a cyberbullying (harmful) content.\n", "dataset_name": "cdt"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl"], "is_gated": false}, "cedr": {"dataset_name": "cedr", "description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.", "downloads": 922, "configs": {"main": {"config_name": "main", "sample_row": "{\"text\": \"\\\"\\\\u0421\\\\u0443\\\\u0440\\\\u043e\\\\u0432\\\\u044b\\\\u0439 \\\\u0433...\", \"labels\": \"[]\", \"source\": \"\\\"lj\\\"\"}", "columns": ["text", "labels", "source"], "columns_mapping": {"text": "text", "labels": "labels", "source": "source"}, "dataset_description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. 
Dataset with predefined train/test splits.\n", "dataset_name": "cedr"}, "enriched": {"config_name": "enriched", "sample_row": "{\"text\": \"\\\"\\\\u0421\\\\u0443\\\\u0440\\\\u043e\\\\u0432\\\\u044b\\\\u0439 \\\\u0433...\", \"labels\": \"[]\", \"source\": \"\\\"lj\\\"\", \"sentences\": \"[[{\\\"forma\\\": \\\"\\\\u0421\\\\u0443\\\\u0440\\\\u043e\\\\u0432\\\\u044b\\\\...\"}", "columns": ["text", "labels", "source", "sentences"], "columns_mapping": {"text": "text", "labels": "labels", "source": "source", "sentences": "sentences"}, "dataset_description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n", "dataset_name": "cedr"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "task_ids:multi-label-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ru", "emotion-classification"], "is_gated": false}, "circa": {"dataset_name": "circa", "description": "The Circa (meaning \u2018approximately\u2019) dataset aims to help machine learning systems\nto solve the problem of interpreting indirect answers to polar questions.\n\nThe dataset contains pairs of yes/no questions and indirect answers, together with\nannotations for the interpretation of the answer. The data is collected in 10\ndifferent social conversational situations (eg. 
food preferences of a friend).\n\nNOTE: There might be missing labels in the dataset and we have replaced them with -1.\nThe original dataset contains no train/dev/test splits.", "downloads": 1184, "configs": {"default": {"config_name": "default", "sample_row": "{\"context\": \"\\\"Y has just travelled from a different city to mee...\", \"question-X\": \"\\\"Are you employed?\\\"\", \"canquestion-X\": \"\\\"I am employed .\\\"\", \"answer-Y\": \"\\\"I'm a veterinary technician.\\\"\", \"judgements\": \"\\\"Yes#Yes#Yes#Yes#Yes\\\"\", \"goldstandard1\": \"0\", \"goldstandard2\": \"0\"}", "columns": ["context", "question-X", "canquestion-X", "answer-Y", "judgements", "goldstandard1", "goldstandard2"], "columns_mapping": {"context": "context", "question-X": "question-X", "canquestion-X": "canquestion-X", "answer-Y": "answer-Y", "judgements": "judgements", "goldstandard1": "goldstandard1", "goldstandard2": "goldstandard2"}, "dataset_description": "The Circa (meaning \u2018approximately\u2019) dataset aims to help machine learning systems\nto solve the problem of interpreting indirect answers to polar questions.\n\nThe dataset contains pairs of yes/no questions and indirect answers, together with\nannotations for the interpretation of the answer. The data is collected in 10\ndifferent social conversational situations (eg. 
food preferences of a friend).\n\nNOTE: There might be missing labels in the dataset and we have replaced them with -1.\nThe original dataset contains no train/dev/test splits.\n", "dataset_name": "circa"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "question-answer-pair-classification"], "is_gated": false}, "civil_comments": {"dataset_name": "civil_comments", "description": "The comments in this dataset come from an archive of the Civil Comments\nplatform, a commenting plugin for independent news sites. These public comments\nwere created from 2015 - 2017 and appeared on approximately 50 English-language\nnews sites across the world. When Civil Comments shut down in 2017, they chose\nto make the public comments available in a lasting open archive to enable future\nresearch. The original data, published on figshare, includes the public comment\ntext, some associated metadata such as article IDs, timestamps and\ncommenter-generated \"civility\" labels, but does not include user ids. Jigsaw\nextended this dataset by adding additional labels for toxicity and identity\nmentions. This data set is an exact replica of the data released for the\nJigsaw Unintended Bias in Toxicity Classification Kaggle challenge. This\ndataset is released under CC0, as is the underlying comment text.", "downloads": 1244, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"This is so cool. 
It's like, 'would you want your ...\", \"toxicity\": \"0.0\", \"severe_toxicity\": \"0.0\", \"obscene\": \"0.0\", \"threat\": \"0.0\", \"insult\": \"0.0\", \"identity_attack\": \"0.0\", \"sexual_explicit\": \"0.0\"}", "columns": ["text", "toxicity", "severe_toxicity", "obscene", "threat", "insult", "identity_attack", "sexual_explicit"], "columns_mapping": {"text": "text", "toxicity": "toxicity", "severe_toxicity": "severe_toxicity", "obscene": "obscene", "threat": "threat", "insult": "insult", "identity_attack": "identity_attack", "sexual_explicit": "sexual_explicit"}, "dataset_description": "\nThe comments in this dataset come from an archive of the Civil Comments\nplatform, a commenting plugin for independent news sites. These public comments\nwere created from 2015 - 2017 and appeared on approximately 50 English-language\nnews sites across the world. When Civil Comments shut down in 2017, they chose\nto make the public comments available in a lasting open archive to enable future\nresearch. The original data, published on figshare, includes the public comment\ntext, some associated metadata such as article IDs, timestamps and\ncommenter-generated \"civility\" labels, but does not include user ids. Jigsaw\nextended this dataset by adding additional labels for toxicity and identity\nmentions. This data set is an exact replica of the data released for the\nJigsaw Unintended Bias in Toxicity Classification Kaggle challenge. This\ndataset is released under CC0, as is the underlying comment text.\n", "dataset_name": "civil_comments"}}, "tags": ["language:en"], "is_gated": false}, "clickbait_news_bg": {"dataset_name": "clickbait_news_bg", "description": "Dataset with clickbait and fake news in Bulgarian. 
Introduced for the Hack the Fake News 2017.", "downloads": 345, "configs": {"default": {"config_name": "default", "sample_row": "{\"fake_news_score\": \"0\", \"click_bait_score\": \"0\", \"content_title\": \"\\\"\\\\u041a\\\\u0430\\\\u043c\\\\u0438\\\\u043b \\\\u0425\\\\u0430\\\\u0431...\", \"content_url\": \"\\\"http://a-specto.bg/kamil-habib-daesh-i-nusra-sa-n...\", \"content_published_time\": \"\\\"2017-05-17 18:35:00\\\"\", \"content\": \"\\\"\\\\u0418\\\\u043d\\\\u0442\\\\u0435\\\\u0440\\\\u0432\\\\u044e \\\\u043d...\"}", "columns": ["fake_news_score", "click_bait_score", "content_title", "content_url", "content_published_time", "content"], "columns_mapping": {"fake_news_score": "fake_news_score", "click_bait_score": "click_bait_score", "content_title": "content_title", "content_url": "content_url", "content_published_time": "content_published_time", "content": "content"}, "dataset_description": "Dataset with clickbait and fake news in Bulgarian. Introduced for the Hack the Fake News 2017.\n", "dataset_name": "clickbait_news_bg"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:bg"], "is_gated": false}, "clinc_oos": {"dataset_name": "clinc_oos", "description": " This dataset is for evaluating the performance of intent classification systems in the\n presence of \"out-of-scope\" queries. By \"out-of-scope\", we mean queries that do not fall\n into any of the system-supported intent classes. Most datasets include only data that is\n \"in-scope\". Our dataset includes both in-scope and out-of-scope data. 
You might also know\n the term \"out-of-scope\" by other terms, including \"out-of-domain\" or \"out-of-distribution\".", "downloads": 2021, "configs": {"small": {"config_name": "small", "sample_row": "{\"text\": \"\\\"can you walk me through setting up direct deposit...\", \"intent\": \"108\"}", "columns": ["text", "intent"], "columns_mapping": {"text": "text", "intent": "intent"}, "dataset_description": " This dataset is for evaluating the performance of intent classification systems in the\n presence of \"out-of-scope\" queries. By \"out-of-scope\", we mean queries that do not fall\n into any of the system-supported intent classes. Most datasets include only data that is\n \"in-scope\". Our dataset includes both in-scope and out-of-scope data. You might also know\n the term \"out-of-scope\" by other terms, including \"out-of-domain\" or \"out-of-distribution\".\n\nSmall, in which there are only 50 training queries per each in-scope intent\n", "dataset_name": "clinc_oos"}, "imbalanced": {"config_name": "imbalanced", "sample_row": "{\"text\": \"\\\"what are the steps for setting up direct deposit ...\", \"intent\": \"108\"}", "columns": ["text", "intent"], "columns_mapping": {"text": "text", "intent": "intent"}, "dataset_description": " This dataset is for evaluating the performance of intent classification systems in the\n presence of \"out-of-scope\" queries. By \"out-of-scope\", we mean queries that do not fall\n into any of the system-supported intent classes. Most datasets include only data that is\n \"in-scope\". Our dataset includes both in-scope and out-of-scope data. 
You might also know\n the term \"out-of-scope\" by other terms, including \"out-of-domain\" or \"out-of-distribution\".\n\nImbalanced, in which intents have either 25, 50, 75, or 100 training queries.\n", "dataset_name": "clinc_oos"}, "plus": {"config_name": "plus", "sample_row": "{\"text\": \"\\\"what expression would i use to say i love you if ...\", \"intent\": \"61\"}", "columns": ["text", "intent"], "columns_mapping": {"text": "text", "intent": "intent"}, "dataset_description": " This dataset is for evaluating the performance of intent classification systems in the\n presence of \"out-of-scope\" queries. By \"out-of-scope\", we mean queries that do not fall\n into any of the system-supported intent classes. Most datasets include only data that is\n \"in-scope\". Our dataset includes both in-scope and out-of-scope data. You might also know\n the term \"out-of-scope\" by other terms, including \"out-of-domain\" or \"out-of-distribution\".\n\nOOS+, in which there are 250 out-of-scope training examples, rather than 100.\n", "dataset_name": "clinc_oos"}}, "tags": ["task_categories:text-classification", "task_ids:intent-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "cmu_hinglish_dog": {"dataset_name": "cmu_hinglish_dog", "description": "This is a collection of text conversations in Hinglish (code mixing between Hindi-English) and their corresponding English only versions. Can be used for Translating between the two.", "downloads": 506, "configs": {"default": {"config_name": "default", "sample_row": "{\"date\": \"\\\"2018-03-21T23:01:52.359Z\\\"\", \"docIdx\": \"0\", \"translation.hi_en\": \"\\\"HELLO, KYA AAP KO MOVIES PASAND HEIN?\\\"\", \"translation.en\": \"\\\"Hello. 
Do you like movies?\\\"\", \"uid\": \"\\\"user2\\\"\", \"utcTimestamp\": \"\\\"2018-03-21T23:02:56.623Z\\\"\", \"rating\": \"2\", \"status\": \"1\", \"uid1LogInTime\": \"\\\"2018-03-21T23:01:52.359Z\\\"\", \"uid1LogOutTime\": \"\\\"2018-03-21T23:23:16.414Z\\\"\", \"uid1response.response\": \"[1, 2, 3, 5]\", \"uid1response.type\": \"\\\"finish\\\"\", \"uid2response.response\": \"[2, 3, 4]\", \"uid2response.type\": \"\\\"finish\\\"\", \"user2_id\": \"\\\"USR3699\\\"\", \"whoSawDoc\": \"[\\\"user1\\\", \\\"user2\\\"]\", \"wikiDocumentIdx\": \"24\"}", "columns": ["date", "docIdx", "translation_hi_en", "translation_en", "uid", "utcTimestamp", "rating", "status", "uid1LogInTime", "uid1LogOutTime", "uid1response_response", "uid1response_type", "uid2response_response", "uid2response_type", "user2_id", "whoSawDoc", "wikiDocumentIdx"], "columns_mapping": {"date": "date", "docIdx": "docIdx", "translation.hi_en": "translation_hi_en", "translation.en": "translation_en", "uid": "uid", "utcTimestamp": "utcTimestamp", "rating": "rating", "status": "status", "uid1LogInTime": "uid1LogInTime", "uid1LogOutTime": "uid1LogOutTime", "uid1response.response": "uid1response_response", "uid1response.type": "uid1response_type", "uid2response.response": "uid2response_response", "uid2response.type": "uid2response_type", "user2_id": "user2_id", "whoSawDoc": "whoSawDoc", "wikiDocumentIdx": "wikiDocumentIdx"}, "dataset_description": "This is a collection of text conversations in Hinglish (code mixing between Hindi-English) and their corresponding English only versions. 
Can be used for Translating between the two.\n", "dataset_name": "cmu_hinglish_dog"}}, "tags": ["task_categories:translation", "annotations_creators:machine-generated", "multilinguality:multilingual", "multilinguality:translation", "source_datasets:original", "language:en", "language:hi"], "is_gated": false}, "cnn_dailymail": {"dataset_name": "cnn_dailymail", "description": "CNN/DailyMail non-anonymized summarization dataset.\n\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary", "downloads": 86023, "configs": {"3.0.0": {"config_name": "3.0.0", "sample_row": "{\"article\": \"\\\"LONDON, England (Reuters) -- Harry Potter star Da...\", \"highlights\": \"\\\"Harry Potter star Daniel Radcliffe gets \\\\u00a320M...\", \"id\": \"\\\"42c027e4ff9730fbb3de84c1af0d2c506e41c3e4\\\"\"}", "columns": ["article", "highlights", "id"], "columns_mapping": {"article": "article", "highlights": "highlights", "id": "id"}, "dataset_description": "CNN/DailyMail non-anonymized summarization dataset.\n\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary\n", "dataset_name": "cnn_dailymail"}, "1.0.0": {"config_name": "1.0.0", "sample_row": "{\"article\": \"\\\"LONDON, England (Reuters) -- Harry Potter star Da...\", \"highlights\": \"\\\"Harry Potter star Daniel Radcliffe gets \\\\u00a320M...\", \"id\": \"\\\"42c027e4ff9730fbb3de84c1af0d2c506e41c3e4\\\"\"}", "columns": ["article", "highlights", "id"], "columns_mapping": {"article": "article", "highlights": "highlights", "id": "id"}, "dataset_description": "CNN/DailyMail non-anonymized summarization dataset.\n\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights 
with and around each\n highlight, which is the target summary\n", "dataset_name": "cnn_dailymail"}, "2.0.0": {"config_name": "2.0.0", "sample_row": "{\"article\": \"\\\"LONDON, England (Reuters) -- Harry Potter star Da...\", \"highlights\": \"\\\"Harry Potter star Daniel Radcliffe gets \\\\u00a320M...\", \"id\": \"\\\"42c027e4ff9730fbb3de84c1af0d2c506e41c3e4\\\"\"}", "columns": ["article", "highlights", "id"], "columns_mapping": {"article": "article", "highlights": "highlights", "id": "id"}, "dataset_description": "CNN/DailyMail non-anonymized summarization dataset.\n\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary\n", "dataset_name": "cnn_dailymail"}}, "tags": ["task_categories:summarization", "task_ids:news-articles-summarization", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "coarse_discourse": {"dataset_name": "coarse_discourse", "description": "dataset contains discourse annotation and relation on threads from reddit during 2016", "downloads": 449, "configs": {"default": {"config_name": "default", "sample_row": "{\"title\": \"\\\"DTX120: #87 - Nashville\\\"\", \"is_self_post\": \"true\", \"subreddit\": \"\\\"100movies365days\\\"\", \"url\": \"\\\"https://www.reddit.com/r/100movies365days/comment...\", \"majority_link\": \"\\\"none\\\"\", \"is_first_post\": \"false\", \"majority_type\": \"\\\"announcement\\\"\", \"id_post\": \"\\\"t3_1bx6qw\\\"\", \"post_depth\": \"-1\", \"in_reply_to\": \"\\\"\\\"\", \"annotations.annotator\": \"[\\\"fc96a15ab87f02dd1998ff55a64f6478\\\", \\\"e9e4b3ab3551...\", \"annotations.link_to_post\": \"[\\\"\\\", \\\"\\\", \\\"\\\"]\", \"annotations.main_type\": \"[\\\"announcement\\\", \\\"announcement\\\", \\\"announcement\\\"]...\"}", "columns": ["title", "is_self_post", "subreddit", "url", 
"majority_link", "is_first_post", "majority_type", "id_post", "post_depth", "in_reply_to", "annotations_annotator", "annotations_link_to_post", "annotations_main_type"], "columns_mapping": {"title": "title", "is_self_post": "is_self_post", "subreddit": "subreddit", "url": "url", "majority_link": "majority_link", "is_first_post": "is_first_post", "majority_type": "majority_type", "id_post": "id_post", "post_depth": "post_depth", "in_reply_to": "in_reply_to", "annotations.annotator": "annotations_annotator", "annotations.link_to_post": "annotations_link_to_post", "annotations.main_type": "annotations_main_type"}, "dataset_description": "dataset contains discourse annotation and relation on threads from reddit during 2016\n", "dataset_name": "coarse_discourse"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "codah": {"dataset_name": "codah", "description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.", "downloads": 1636, "configs": {"codah": {"config_name": "codah", "sample_row": "{\"id\": \"0\", \"question_category\": \"5\", \"question_propmt\": \"\\\"I am always very hungry before I go to bed. 
I am\\\"...\", \"candidate_answers\": \"[\\\"concerned that this is an illness.\\\", \\\"glad that ...\", \"correct_answer_idx\": \"3\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}, "fold_0": {"config_name": "fold_0", "sample_row": "{\"id\": \"0\", \"question_category\": \"3\", \"question_propmt\": \"\\\"The chicken cannot fly. It\\\"\", \"candidate_answers\": \"[\\\"flies.\\\", \\\"spreads its wings and flies.\\\", \\\"crosse...\", \"correct_answer_idx\": \"2\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. 
As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}, "fold_1": {"config_name": "fold_1", "sample_row": "{\"id\": \"0\", \"question_category\": \"5\", \"question_propmt\": \"\\\"Suzy reached the exam centre on time. She\\\"\", \"candidate_answers\": \"[\\\"danced her way to her room.\\\", \\\"bought tofu for t...\", \"correct_answer_idx\": \"2\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}, "fold_2": {"config_name": "fold_2", "sample_row": "{\"id\": \"0\", \"question_category\": \"5\", \"question_propmt\": \"\\\"Suzy reached the exam centre on time. 
She\\\"\", \"candidate_answers\": \"[\\\"danced her way to her room.\\\", \\\"bought tofu for t...\", \"correct_answer_idx\": \"2\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}, "fold_3": {"config_name": "fold_3", "sample_row": "{\"id\": \"0\", \"question_category\": \"5\", \"question_propmt\": \"\\\"Suzy reached the exam centre on time. She\\\"\", \"candidate_answers\": \"[\\\"danced her way to her room.\\\", \\\"bought tofu for t...\", \"correct_answer_idx\": \"2\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. 
As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}, "fold_4": {"config_name": "fold_4", "sample_row": "{\"id\": \"0\", \"question_category\": \"5\", \"question_propmt\": \"\\\"Suzy reached the exam centre on time. She\\\"\", \"candidate_answers\": \"[\\\"danced her way to her room.\\\", \\\"bought tofu for t...\", \"correct_answer_idx\": \"2\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. 
Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "code_search_net": {"dataset_name": "code_search_net", "description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.", "downloads": 69261, "configs": {"all": {"config_name": "all", "sample_row": "{\"repository_name\": \"\\\"ageitgey/face_recognition\\\"\", \"func_path_in_repository\": \"\\\"examples/face_recognition_knn.py\\\"\", \"func_name\": \"\\\"train\\\"\", \"whole_func_string\": \"\\\"def train(train_dir, model_save_path=None, n_neig...\", \"language\": \"\\\"python\\\"\", \"func_code_string\": \"\\\"def train(train_dir, model_save_path=None, n_neig...\", \"func_code_tokens\": \"[\\\"def\\\", \\\"train\\\", \\\"(\\\", \\\"train_dir\\\", \\\",\\\", \\\"model_sav...\", \"func_documentation_string\": \"\\\"Trains a k-nearest neighbors classifier for face ...\", \"func_documentation_tokens\": \"[\\\"Trains\\\", \\\"a\\\", \\\"k\\\", \\\"-\\\", \\\"nearest\\\", \\\"neighbors\\\", ...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/ageitgey/face_recognition/blob...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": 
{"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "java": {"config_name": "java", "sample_row": "{\"repository_name\": \"\\\"spring-projects/spring-boot\\\"\", \"func_path_in_repository\": \"\\\"spring-boot-project/spring-boot/src/main/java/org...\", \"func_name\": \"\\\"IndexedElementsBinder.bindIndexed\\\"\", \"whole_func_string\": \"\\\"protected final void bindIndexed(ConfigurationPro...\", \"language\": \"\\\"java\\\"\", \"func_code_string\": \"\\\"protected final void bindIndexed(ConfigurationPro...\", \"func_code_tokens\": \"[\\\"protected\\\", \\\"final\\\", \\\"void\\\", \\\"bindIndexed\\\", \\\"(\\\",...\", \"func_documentation_string\": \"\\\"Bind indexed elements to the supplied collection....\", \"func_documentation_tokens\": \"[\\\"Bind\\\", \\\"indexed\\\", \\\"elements\\\", \\\"to\\\", \\\"the\\\", \\\"supp...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/spring-projects/spring-boot/bl...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", 
"func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "go": {"config_name": "go", "sample_row": "{\"repository_name\": \"\\\"kubernetes/kubernetes\\\"\", \"func_path_in_repository\": \"\\\"staging/src/k8s.io/apimachinery/pkg/runtime/exten...\", \"func_name\": \"\\\"MarshalJSON\\\"\", \"whole_func_string\": \"\\\"func (re RawExtension) MarshalJSON() ([]byte, err...\", \"language\": \"\\\"go\\\"\", \"func_code_string\": \"\\\"func (re RawExtension) MarshalJSON() ([]byte, err...\", \"func_code_tokens\": \"[\\\"func\\\", \\\"(\\\", \\\"re\\\", \\\"RawExtension\\\", \\\")\\\", \\\"MarshalJ...\", \"func_documentation_string\": \"\\\"// MarshalJSON may get called on pointers or valu...\", \"func_documentation_tokens\": \"[\\\"MarshalJSON\\\", \\\"may\\\", \\\"get\\\", \\\"called\\\", \\\"on\\\", \\\"poi...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/kubernetes/kubernetes/blob/6a8...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", 
"split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "python": {"config_name": "python", "sample_row": "{\"repository_name\": \"\\\"ageitgey/face_recognition\\\"\", \"func_path_in_repository\": \"\\\"examples/face_recognition_knn.py\\\"\", \"func_name\": \"\\\"train\\\"\", \"whole_func_string\": \"\\\"def train(train_dir, model_save_path=None, n_neig...\", \"language\": \"\\\"python\\\"\", \"func_code_string\": \"\\\"def train(train_dir, model_save_path=None, n_neig...\", \"func_code_tokens\": \"[\\\"def\\\", \\\"train\\\", \\\"(\\\", \\\"train_dir\\\", \\\",\\\", \\\"model_sav...\", \"func_documentation_string\": \"\\\"Trains a k-nearest neighbors classifier for face ...\", \"func_documentation_tokens\": \"[\\\"Trains\\\", \\\"a\\\", \\\"k\\\", \\\"-\\\", \\\"nearest\\\", \\\"neighbors\\\", ...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/ageitgey/face_recognition/blob...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", 
"func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "javascript": {"config_name": "javascript", "sample_row": "{\"repository_name\": \"\\\"Microsoft/vscode\\\"\", \"func_path_in_repository\": \"\\\"build/lib/treeshaking.js\\\"\", \"func_name\": \"\\\"createTypeScriptLanguageService\\\"\", \"whole_func_string\": \"\\\"function createTypeScriptLanguageService(options)...\", \"language\": \"\\\"javascript\\\"\", \"func_code_string\": \"\\\"function createTypeScriptLanguageService(options)...\", \"func_code_tokens\": \"[\\\"function\\\", \\\"createTypeScriptLanguageService\\\", \\\"(...\", \"func_documentation_string\": \"\\\"#region Discovery, LanguageService & Setup\\\"\", \"func_documentation_tokens\": \"[\\\"#region\\\", \\\"Discovery\\\", \\\"LanguageService\\\", \\\"&\\\", \\\"...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/Microsoft/vscode/blob/693a13cd...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", 
"func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "ruby": {"config_name": "ruby", "sample_row": "{\"repository_name\": \"\\\"rails/rails\\\"\", \"func_path_in_repository\": \"\\\"activesupport/lib/active_support/current_attribut...\", \"func_name\": \"\\\"ActiveSupport.CurrentAttributes.set\\\"\", \"whole_func_string\": \"\\\"def set(set_attributes)\\\\n old_attributes = c...\", \"language\": \"\\\"ruby\\\"\", \"func_code_string\": \"\\\"def set(set_attributes)\\\\n old_attributes = c...\", \"func_code_tokens\": \"[\\\"def\\\", \\\"set\\\", \\\"(\\\", \\\"set_attributes\\\", \\\")\\\", \\\"old_at...\", \"func_documentation_string\": \"\\\"Expose one or more attributes within a block. 
Old...\", \"func_documentation_tokens\": \"[\\\"Expose\\\", \\\"one\\\", \\\"or\\\", \\\"more\\\", \\\"attributes\\\", \\\"wit...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/rails/rails/blob/85a8bc644be69...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). 
The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "php": {"config_name": "php", "sample_row": "{\"repository_name\": \"\\\"domnikl/DesignPatternsPHP\\\"\", \"func_path_in_repository\": \"\\\"Structural/Registry/Registry.php\\\"\", \"func_name\": \"\\\"Registry.set\\\"\", \"whole_func_string\": \"\\\"public static function set(string $key, $value)\\\\n...\", \"language\": \"\\\"php\\\"\", \"func_code_string\": \"\\\"public static function set(string $key, $value)\\\\n...\", \"func_code_tokens\": \"[\\\"public\\\", \\\"static\\\", \\\"function\\\", \\\"set\\\", \\\"(\\\", \\\"stri...\", \"func_documentation_string\": \"\\\"@param string $key\\\\n@param mixed $value\\\\n\\\\n@retu...\", \"func_documentation_tokens\": \"[\\\"@param\\\", \\\"string\\\", \\\"$key\\\", \\\"@param\\\", \\\"mixed\\\", \\\"$...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/domnikl/DesignPatternsPHP/blob...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, 
and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_cc_clone_detection_big_clone_bench": {"dataset_name": "code_x_glue_cc_clone_detection_big_clone_bench", "description": "Given two codes as the input, the task is to do binary classification (0/1), where 1 stands for semantic equivalence and 0 for others. Models are evaluated by F1 score.\nThe dataset we use is BigCloneBench and filtered following the paper Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree.", "downloads": 648, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"id1\": \"13988825\", \"id2\": \"8660836\", \"func1\": \"\\\" private void setNodekeyInJsonResponse(String ...\", \"func2\": \"\\\" public void transform(String style, String sp...\", \"label\": \"false\"}", "columns": ["id", "id1", "id2", "func1", "func2", "label"], "columns_mapping": {"id": "id", "id1": "id1", "id2": "id2", "func1": "func1", "func2": "func2", "label": "label"}, "dataset_description": "CodeXGLUE Clone-detection-BigCloneBench dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Clone-detection-BigCloneBench\n\nGiven two codes as the input, the task is to do binary classification (0/1), where 1 stands for semantic equivalence and 0 for others. 
Models are evaluated by F1 score.\nThe dataset we use is BigCloneBench and filtered following the paper Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree.", "dataset_name": "code_x_glue_cc_clone_detection_big_clone_bench"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_cc_cloze_testing_all": {"dataset_name": "code_x_glue_cc_cloze_testing_all", "description": "Cloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "downloads": 1031, "configs": {"go": {"config_name": "go", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"MarshalJSON\\\", \\\"supports\\\", \\\"json\\\", \\\".\\\", \\\"Marshale...\", \"pl_tokens\": \"[\\\"func\\\", \\\"(\\\", \\\"v\\\", \\\"ContextRealtimeData\\\", \\\")\\\", \\\"Ma...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}, "java": {"config_name": "java", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"/\\\", \\\"*\\\", \\\"(\\\", \\\"non\\\", \\\"-\\\", \\\"Javadoc\\\", \\\")\\\"]\", \"pl_tokens\": \"[\\\"@\\\", \\\"Override\\\", \\\"public\\\", \\\"int\\\", \\\"peekBit\\\", \\\"(\\\",...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}, "javascript": {"config_name": "javascript", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"Cast\\\", \\\"query\\\", \\\"params\\\", \\\"according\\\", \\\"to\\\", \\\"ty...\", \"pl_tokens\": \"[\\\"function\\\", \\\"castQueryParams\\\", \\\"(\\\", \\\"relId\\\", \\\",\\\",...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}, "php": {"config_name": "php", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"Get\\\", \\\"choices\\\", \\\".\\\"]\", \"pl_tokens\": \"[\\\"protected\\\", \\\"\\\", \\\"getChoices\\\", \\\"(\\\", \\\"FormFi...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}, "python": {"config_name": "python", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"Post\\\", \\\"a\\\", \\\"review\\\"]\", \"pl_tokens\": \"[\\\"def\\\", \\\"post_review\\\", \\\"(\\\", \\\"session\\\", \\\",\\\", \\\"revie...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}, "ruby": {"config_name": "ruby", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"By\\\", \\\"default\\\", \\\"taskers\\\", \\\"don\\\", \\\"t\\\", \\\"see\\\", \\\"t...\", \"pl_tokens\": \"[\\\"def\\\", \\\"gather_vars\\\", \\\"(\\\", \\\"executor\\\", \\\",\\\", \\\"tcon...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:slot-filling", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_cc_code_completion_line": {"dataset_name": "code_x_glue_cc_code_completion_line", "description": "Complete the unfinished line given previous context. Models are evaluated by exact match and edit similarity.\nWe propose line completion task to test model's ability to autocomplete a line. Majority code completion systems behave well in token level completion, but fail in completing an unfinished line like a method call with specific parameters, a function signature, a loop condition, a variable definition and so on. When a software develop finish one or more tokens of the current line, the line level completion model is expected to generate the entire line of syntactically correct code.\nLine level code completion task shares the train/dev dataset with token level completion. After training a model on CodeCompletion-token, you could directly use it to test on line-level completion.", "downloads": 445, "configs": {"java": {"config_name": "java", "sample_row": "{\"id\": \"0\", \"input\": \"\\\" package org . rubypeople . rdt . internal . u...\", \"gt\": \"\\\"\\\"\"}", "columns": ["id", "input", "gt"], "columns_mapping": {"id": "id", "input": "input", "gt": "gt"}, "dataset_description": "CodeXGLUE CodeCompletion-line dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line\n\nComplete the unfinished line given previous context. 
Models are evaluated by exact match and edit similarity.\nWe propose line completion task to test model's ability to autocomplete a line. Majority code completion systems behave well in token level completion, but fail in completing an unfinished line like a method call with specific parameters, a function signature, a loop condition, a variable definition and so on. When a software develop finish one or more tokens of the current line, the line level completion model is expected to generate the entire line of syntactically correct code.\nLine level code completion task shares the train/dev dataset with token level completion. After training a model on CodeCompletion-token, you could directly use it to test on line-level completion.", "dataset_name": "code_x_glue_cc_code_completion_line"}, "python": {"config_name": "python", "sample_row": "{\"id\": \"0\", \"input\": \"\\\" from __future__ import absolute_import ...\", \"gt\": \"\\\"\\\"\"}", "columns": ["id", "input", "gt"], "columns_mapping": {"id": "id", "input": "input", "gt": "gt"}, "dataset_description": "CodeXGLUE CodeCompletion-line dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line\n\nComplete the unfinished line given previous context. Models are evaluated by exact match and edit similarity.\nWe propose line completion task to test model's ability to autocomplete a line. Majority code completion systems behave well in token level completion, but fail in completing an unfinished line like a method call with specific parameters, a function signature, a loop condition, a variable definition and so on. When a software develop finish one or more tokens of the current line, the line level completion model is expected to generate the entire line of syntactically correct code.\nLine level code completion task shares the train/dev dataset with token level completion. 
After training a model on CodeCompletion-token, you could directly use it to test on line-level completion.", "dataset_name": "code_x_glue_cc_code_completion_line"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:slot-filling", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_cc_code_completion_token": {"dataset_name": "code_x_glue_cc_code_completion_token", "description": "Predict next code token given context of previous tokens. Models are evaluated by token level accuracy.\nCode completion is a one of the most widely used features in software development through IDEs. An effective code completion tool could improve software developers' productivity. We provide code completion evaluation tasks in two granularities -- token level and line level. Here we introduce token level code completion. Token level task is analogous to language modeling. Models should have be able to predict the next token in arbitary types.", "downloads": 457, "configs": {"java": {"config_name": "java", "sample_row": "{\"id\": \"0\", \"code\": \"[\\\"\\\", \\\"package\\\", \\\"org\\\", \\\".\\\", \\\"sqlproc\\\", \\\".\\\", \\\"ds...\"}", "columns": ["id", "code"], "columns_mapping": {"id": "id", "code": "code"}, "dataset_description": "CodeXGLUE CodeCompletion-token dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token\n\nPredict next code token given context of previous tokens. Models are evaluated by token level accuracy.\nCode completion is a one of the most widely used features in software development through IDEs. An effective code completion tool could improve software developers' productivity. We provide code completion evaluation tasks in two granularities -- token level and line level. Here we introduce token level code completion. Token level task is analogous to language modeling. 
Models should have be able to predict the next token in arbitary types.\n", "dataset_name": "code_x_glue_cc_code_completion_token"}, "python": {"config_name": "python", "sample_row": "{\"id\": \"0\", \"path\": \"\\\"00/wikihouse/urls.py\\\\n\\\"\", \"code\": \"[\\\"\\\", \\\"from\\\", \\\"bootstrap\\\", \\\"import\\\", \\\"Bootstrap\\\"...\"}", "columns": ["id", "path", "code"], "columns_mapping": {"id": "id", "path": "path", "code": "code"}, "dataset_description": "CodeXGLUE CodeCompletion-token dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token\n\nPredict next code token given context of previous tokens. Models are evaluated by token level accuracy.\nCode completion is a one of the most widely used features in software development through IDEs. An effective code completion tool could improve software developers' productivity. We provide code completion evaluation tasks in two granularities -- token level and line level. Here we introduce token level code completion. Token level task is analogous to language modeling. Models should have be able to predict the next token in arbitary types.\n", "dataset_name": "code_x_glue_cc_code_completion_token"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_cc_code_refinement": {"dataset_name": "code_x_glue_cc_code_refinement", "description": "We use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). The source side is a Java function with bugs and the target side is the refined one. All the function and variable names are normalized. 
Their dataset contains two subsets ( i.e.small and medium) based on the function length.", "downloads": 647, "configs": {"medium": {"config_name": "medium", "sample_row": "{\"id\": \"0\", \"buggy\": \"\\\"public static TYPE_1 init ( java.lang.String name...\", \"fixed\": \"\\\"public static TYPE_1 init ( java.lang.String name...\"}", "columns": ["id", "buggy", "fixed"], "columns_mapping": {"id": "id", "buggy": "buggy", "fixed": "fixed"}, "dataset_description": "CodeXGLUE code-refinement dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-refinement\n\nWe use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). The source side is a Java function with bugs and the target side is the refined one. All the function and variable names are normalized. Their dataset contains two subsets ( i.e.small and medium) based on the function length.", "dataset_name": "code_x_glue_cc_code_refinement"}, "small": {"config_name": "small", "sample_row": "{\"id\": \"0\", \"buggy\": \"\\\"public java.lang.String METHOD_1 ( ) { return new...\", \"fixed\": \"\\\"public java.lang.String METHOD_1 ( ) { return new...\"}", "columns": ["id", "buggy", "fixed"], "columns_mapping": {"id": "id", "buggy": "buggy", "fixed": "fixed"}, "dataset_description": "CodeXGLUE code-refinement dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-refinement\n\nWe use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). The source side is a Java function with bugs and the target side is the refined one. All the function and variable names are normalized. 
Their dataset contains two subsets ( i.e.small and medium) based on the function length.", "dataset_name": "code_x_glue_cc_code_refinement"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:expert-generated", "multilinguality:other-programming-languages", "source_datasets:original", "language:code", "debugging"], "is_gated": false}, "code_x_glue_cc_code_to_code_trans": {"dataset_name": "code_x_glue_cc_code_to_code_trans", "description": "The dataset is collected from several public repos, including Lucene(http://lucene.apache.org/), POI(http://poi.apache.org/), JGit(https://github.com/eclipse/jgit/) and Antlr(https://github.com/antlr/).\n We collect both the Java and C# versions of the codes and find the parallel functions. After removing duplicates and functions with the empty body, we split the whole dataset into training, validation and test sets.", "downloads": 566, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"java\": \"\\\"public ListSpeechSynthesisTasksResult listSpeechS...\", \"cs\": \"\\\"public virtual ListSpeechSynthesisTasksResponse L...\"}", "columns": ["id", "java", "cs"], "columns_mapping": {"id": "id", "java": "java", "cs": "cs"}, "dataset_description": "CodeXGLUE code-to-code-trans dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-to-code-trans\n\nThe dataset is collected from several public repos, including Lucene(http://lucene.apache.org/), POI(http://poi.apache.org/), JGit(https://github.com/eclipse/jgit/) and Antlr(https://github.com/antlr/).\n We collect both the Java and C# versions of the codes and find the parallel functions. 
After removing duplicates and functions with the empty body, we split the whole dataset into training, validation and test sets.", "dataset_name": "code_x_glue_cc_code_to_code_trans"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:other-programming-languages", "source_datasets:original", "language:code", "code-to-code"], "is_gated": false}, "code_x_glue_cc_defect_detection": {"dataset_name": "code_x_glue_cc_defect_detection", "description": "Given a source code, the task is to identify whether it is an insecure code that may attack software systems, such as resource leaks, use-after-free vulnerabilities and DoS attack. We treat the task as binary classification (0/1), where 1 stands for insecure code and 0 for secure code.\nThe dataset we use comes from the paper Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks. We combine all projects and split 80%/10%/10% for training/dev/test.", "downloads": 526, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"func\": \"\\\"static av_cold int vdadec_init(AVCodecContext *av...\", \"target\": \"false\", \"project\": \"\\\"FFmpeg\\\"\", \"commit_id\": \"\\\"973b1a6b9070e2bf17d17568cbaf4043ce931f51\\\"\"}", "columns": ["id", "func", "target", "project", "commit_id"], "columns_mapping": {"id": "id", "func": "func", "target": "target", "project": "project", "commit_id": "commit_id"}, "dataset_description": "CodeXGLUE Defect-detection dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Defect-detection\n\nGiven a source code, the task is to identify whether it is an insecure code that may attack software systems, such as resource leaks, use-after-free vulnerabilities and DoS attack. 
We treat the task as binary classification (0/1), where 1 stands for insecure code and 0 for secure code.\nThe dataset we use comes from the paper Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks. We combine all projects and split 80%/10%/10% for training/dev/test.", "dataset_name": "code_x_glue_cc_defect_detection"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:found", "multilinguality:other-programming-languages", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_tc_text_to_code": {"dataset_name": "code_x_glue_tc_text_to_code", "description": "We use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to Code in Programmatic Context. See paper for details.", "downloads": 1246, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"nl\": \"\\\"check if details are parsed . concode_field_sep C...\", \"code\": \"\\\"boolean function ( ) { return isParsed ; }\\\"\"}", "columns": ["id", "nl", "code"], "columns_mapping": {"id": "id", "nl": "nl", "code": "code"}, "dataset_description": "CodeXGLUE text-to-code dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Code/text-to-code\n\nWe use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to Code in Programmatic Context. 
See paper for details.", "dataset_name": "code_x_glue_tc_text_to_code"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:other-programming-languages", "source_datasets:original", "language:code", "language:en", "text-to-code"], "is_gated": false}, "code_x_glue_tt_text_to_text": {"dataset_name": "code_x_glue_tt_text_to_text", "description": "The dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "downloads": 821, "configs": {"da_en": {"config_name": "da_en", "sample_row": "{\"id\": \"0\", \"source\": \"\\\"title : " Oversigt over ops\\\\u00e6tninger for...\", \"target\": \"\\\"title : Overview of Setups for Service Items and ...\"}", "columns": ["id", "source", "target"], "columns_mapping": {"id": "id", "source": "source", "target": "target"}, "dataset_description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "dataset_name": "code_x_glue_tt_text_to_text"}, "lv_en": {"config_name": "lv_en", "sample_row": "{\"id\": \"0\", \"source\": \"\\\"title : Pakalpojumu objektu izveide\\\\n\\\"\", \"target\": \"\\\"title : Create service objects\\\\n\\\"\"}", "columns": ["id", "source", "target"], "columns_mapping": {"id": "id", "source": "source", "target": "target"}, "dataset_description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "dataset_name": "code_x_glue_tt_text_to_text"}, "no_en": {"config_name": "no_en", "sample_row": "{\"id\": \"0\", \"source\": \"\\\"title : Oversikt over oppsett av servicevarer og ...\", \"target\": \"\\\"title 
: Overview of Setups for Service Items and ...\"}", "columns": ["id", "source", "target"], "columns_mapping": {"id": "id", "source": "source", "target": "target"}, "dataset_description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "dataset_name": "code_x_glue_tt_text_to_text"}, "zh_en": {"config_name": "zh_en", "sample_row": "{\"id\": \"0\", \"source\": \"\\\"\\\\u4ee5\\\\u4e0b \\\\u547d\\\\u540d \\\\u7a7a\\\\u95f4 \\\\u5305\\\\u54...\", \"target\": \"\\\"The following namespaces contain APIs that allow ...\"}", "columns": ["id", "source", "target"], "columns_mapping": {"id": "id", "source": "source", "target": "target"}, "dataset_description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "dataset_name": "code_x_glue_tt_text_to_text"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:da", "language:en", "language:lv", "language:nb", "language:zh", "code-documentation-translation"], "is_gated": false}, "com_qa": {"dataset_name": "com_qa", "description": "ComQA is a dataset of 11,214 questions, which were collected from WikiAnswers, a community question answering website.\nBy collecting questions from such a site we ensure that the information needs are ones of interest to actual users.\nMoreover, questions posed there are often cannot be answered by commercial search engines or QA technology, making them\nmore interesting for driving future research compared to those collected from an engine's query log. 
The dataset contains\nquestions with various challenging phenomena such as the need for temporal reasoning, comparison (e.g., comparatives,\nsuperlatives, ordinals), compositionality (multiple, possibly nested, subquestions with multiple entities), and\nunanswerable questions (e.g., Who was the first human being on Mars?). Through a large crowdsourcing effort, questions\nin ComQA are grouped into 4,834 paraphrase clusters that express the same information need. Each cluster is annotated\nwith its answer(s). ComQA answers come in the form of Wikipedia entities wherever possible. Wherever the answers are\ntemporal or measurable quantities, TIMEX3 and the International System of Units (SI) are used for normalization.", "downloads": 334, "configs": {"default": {"config_name": "default", "sample_row": "{\"cluster_id\": \"\\\"cluster-1754\\\"\", \"questions\": \"[\\\"what years did cale yarborough win his cup champ...\", \"answers\": \"[\\\"1976\\\", \\\"1978\\\", \\\"1977\\\"]\"}", "columns": ["cluster_id", "questions", "answers"], "columns_mapping": {"cluster_id": "cluster_id", "questions": "questions", "answers": "answers"}, "dataset_description": "ComQA is a dataset of 11,214 questions, which were collected from WikiAnswers, a community question answering website.\nBy collecting questions from such a site we ensure that the information needs are ones of interest to actual users.\nMoreover, questions posed there are often cannot be answered by commercial search engines or QA technology, making them\nmore interesting for driving future research compared to those collected from an engine's query log. The dataset contains\nquestions with various challenging phenomena such as the need for temporal reasoning, comparison (e.g., comparatives,\nsuperlatives, ordinals), compositionality (multiple, possibly nested, subquestions with multiple entities), and\nunanswerable questions (e.g., Who was the first human being on Mars?). 
Through a large crowdsourcing effort, questions\nin ComQA are grouped into 4,834 paraphrase clusters that express the same information need. Each cluster is annotated\nwith its answer(s). ComQA answers come in the form of Wikipedia entities wherever possible. Wherever the answers are\ntemporal or measurable quantities, TIMEX3 and the International System of Units (SI) are used for normalization.\n", "dataset_name": "com_qa"}}, "tags": ["task_categories:question-answering", "language:en"], "is_gated": false}, "common_gen": {"dataset_name": "common_gen", "description": "CommonGen is a constrained text generation task, associated with a benchmark dataset,\nto explicitly test machines for the ability of generative commonsense reasoning. Given\na set of common concepts; the task is to generate a coherent sentence describing an\neveryday scenario using these concepts.\n\nCommonGen is challenging because it inherently requires 1) relational reasoning using\nbackground commonsense knowledge, and 2) compositional generalization ability to work\non unseen concept combinations. Our dataset, constructed through a combination of\ncrowd-sourcing from AMT and existing caption corpora, consists of 30k concept-sets and\n50k sentences in total.", "downloads": 5845, "configs": {"default": {"config_name": "default", "sample_row": "{\"concept_set_idx\": \"0\", \"concepts\": \"[\\\"ski\\\", \\\"mountain\\\", \\\"skier\\\"]\", \"target\": \"\\\"Skier skis down the mountain\\\"\"}", "columns": ["concept_set_idx", "concepts", "target"], "columns_mapping": {"concept_set_idx": "concept_set_idx", "concepts": "concepts", "target": "target"}, "dataset_description": "CommonGen is a constrained text generation task, associated with a benchmark dataset,\nto explicitly test machines for the ability of generative commonsense reasoning. 
Given\na set of common concepts; the task is to generate a coherent sentence describing an\neveryday scenario using these concepts.\n\nCommonGen is challenging because it inherently requires 1) relational reasoning using\nbackground commonsense knowledge, and 2) compositional generalization ability to work\non unseen concept combinations. Our dataset, constructed through a combination of\ncrowd-sourcing from AMT and existing caption corpora, consists of 30k concept-sets and\n50k sentences in total.\n", "dataset_name": "common_gen"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "concepts-to-text"], "is_gated": false}, "commonsense_qa": {"dataset_name": "commonsense_qa", "description": "CommonsenseQA is a new multiple-choice question answering dataset that requires different types of commonsense knowledge\nto predict the correct answers . It contains 12,102 questions with one correct answer and four distractor answers.\nThe dataset is provided in two major training/validation/testing set splits: \"Random split\" which is the main evaluation\nsplit, and \"Question token split\", see paper for details.", "downloads": 29428, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"075e483d21c29a511267ef62bedc0461\\\"\", \"question\": \"\\\"The sanctions against the school were a punishing...\", \"question_concept\": \"\\\"punishing\\\"\", \"choices.label\": \"[\\\"A\\\", \\\"B\\\", \\\"C\\\", \\\"D\\\", \\\"E\\\"]\", \"choices.text\": \"[\\\"ignore\\\", \\\"enforce\\\", \\\"authoritarian\\\", \\\"yell at\\\", ...\", \"answerKey\": \"\\\"A\\\"\"}", "columns": ["id", "question", "question_concept", "choices_label", "choices_text", "answerKey"], "columns_mapping": {"id": "id", "question": "question", "question_concept": "question_concept", "choices.label": "choices_label", "choices.text": "choices_text", "answerKey": "answerKey"}, 
"dataset_description": "CommonsenseQA is a new multiple-choice question answering dataset that requires different types of commonsense knowledge\nto predict the correct answers . It contains 12,102 questions with one correct answer and four distractor answers.\nThe dataset is provided in two major training/validation/testing set splits: \"Random split\" which is the main evaluation\nsplit, and \"Question token split\", see paper for details.\n", "dataset_name": "commonsense_qa"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "conceptnet5": {"dataset_name": "conceptnet5", "description": "This dataset is designed to provide training data\r\nfor common sense relationships pulls together from various sources.\r\n\r\nThe dataset is multi-lingual. See langauge codes and language info\r\nhere: https://github.com/commonsense/conceptnet5/wiki/Languages\r\n\r\n\r\nThis dataset provides an interface for the conceptnet5 csv file, and\r\nsome (but not all) of the raw text data used to build conceptnet5:\r\nomcsnet_sentences_free.txt, and omcsnet_sentences_more.txt.\r\n\r\nOne use of this dataset would be to learn to extract the conceptnet\r\nrelationship from the omcsnet sentences.\r\n\r\nConceptnet5 has 34,074,917 relationships. Of those relationships,\r\nthere are 2,176,099 surface text sentences related to those 2M\r\nentries.\r\n\r\nomcsnet_sentences_free has 898,161 lines. omcsnet_sentences_more has\r\n2,001,736 lines.\r\n\r\nOriginal downloads are available here\r\nhttps://github.com/commonsense/conceptnet5/wiki/Downloads. For more\r\ninformation, see: https://github.com/commonsense/conceptnet5/wiki\r\n\r\nThe omcsnet data comes with the following warning from the authors of\r\nthe above site: Remember: this data comes from various forms of\r\ncrowdsourcing. 
Sentences in these files are not necessarily true,\r\nuseful, or appropriate.", "downloads": 713, "configs": {"conceptnet5": {"config_name": "conceptnet5", "sample_row": "{\"sentence\": \"\\\"\\\"\", \"full_rel\": \"\\\"/a/[/r/Antonym/,/c/ab/\\\\u0430\\\\u0433\\\\u044b\\\\u0440\\\\u0...\", \"rel\": \"\\\"/r/Antonym\\\"\", \"arg1\": \"\\\"/c/ab/\\\\u0430\\\\u0433\\\\u044b\\\\u0440\\\\u0443\\\\u0430/n\\\"\", \"arg2\": \"\\\"/c/ab/\\\\u0430\\\\u04a7\\\\u0441\\\\u0443\\\\u0430\\\"\", \"lang\": \"\\\"ab\\\"\", \"extra_info\": \"\\\"{\\\\\\\"dataset\\\\\\\": \\\\\\\"/d/wiktionary/en\\\\\\\", \\\\\\\"license\\\\\\\": ...\", \"weight\": \"1.0\"}", "columns": ["sentence", "full_rel", "rel", "arg1", "arg2", "lang", "extra_info", "weight"], "columns_mapping": {"sentence": "sentence", "full_rel": "full_rel", "rel": "rel", "arg1": "arg1", "arg2": "arg2", "lang": "lang", "extra_info": "extra_info", "weight": "weight"}, "dataset_description": "This dataset is designed to provide training data\nfor common sense relationships pulls together from various sources.\n\nThe dataset is multi-lingual. See langauge codes and language info\nhere: https://github.com/commonsense/conceptnet5/wiki/Languages\n\n\nThis dataset provides an interface for the conceptnet5 csv file, and\nsome (but not all) of the raw text data used to build conceptnet5:\nomcsnet_sentences_free.txt, and omcsnet_sentences_more.txt.\n\nOne use of this dataset would be to learn to extract the conceptnet\nrelationship from the omcsnet sentences.\n\nConceptnet5 has 34,074,917 relationships. Of those relationships,\nthere are 2,176,099 surface text sentences related to those 2M\nentries.\n\nomcsnet_sentences_free has 898,161 lines. omcsnet_sentences_more has\n2,001,736 lines.\n\nOriginal downloads are available here\nhttps://github.com/commonsense/conceptnet5/wiki/Downloads. 
For more\ninformation, see: https://github.com/commonsense/conceptnet5/wiki\n\nThe omcsnet data comes with the following warning from the authors of\nthe above site: Remember: this data comes from various forms of\ncrowdsourcing. Sentences in these files are not necessarily true,\nuseful, or appropriate.\n\n", "dataset_name": "conceptnet5"}, "omcs_sentences_free": {"config_name": "omcs_sentences_free", "sample_row": "{\"sentence\": \"\\\"text\\\"\", \"raw_data\": \"\\\"id\\\\ttext\\\\tcreator_id\\\\tcreated_on\\\\tlanguage_id\\\\tac...\", \"lang\": \"\\\"language_id\\\"\"}", "columns": ["sentence", "raw_data", "lang"], "columns_mapping": {"sentence": "sentence", "raw_data": "raw_data", "lang": "lang"}, "dataset_description": "This dataset is designed to provide training data\nfor common sense relationships pulls together from various sources.\n\nThe dataset is multi-lingual. See langauge codes and language info\nhere: https://github.com/commonsense/conceptnet5/wiki/Languages\n\n\nThis dataset provides an interface for the conceptnet5 csv file, and\nsome (but not all) of the raw text data used to build conceptnet5:\nomcsnet_sentences_free.txt, and omcsnet_sentences_more.txt.\n\nOne use of this dataset would be to learn to extract the conceptnet\nrelationship from the omcsnet sentences.\n\nConceptnet5 has 34,074,917 relationships. Of those relationships,\nthere are 2,176,099 surface text sentences related to those 2M\nentries.\n\nomcsnet_sentences_free has 898,161 lines. omcsnet_sentences_more has\n2,001,736 lines.\n\nOriginal downloads are available here\nhttps://github.com/commonsense/conceptnet5/wiki/Downloads. For more\ninformation, see: https://github.com/commonsense/conceptnet5/wiki\n\nThe omcsnet data comes with the following warning from the authors of\nthe above site: Remember: this data comes from various forms of\ncrowdsourcing. 
Sentences in these files are not necessarily true,\nuseful, or appropriate.\n\n", "dataset_name": "conceptnet5"}, "omcs_sentences_more": {"config_name": "omcs_sentences_more", "sample_row": "{\"sentence\": \"\\\"text\\\"\", \"raw_data\": \"\\\"id\\\\ttext\\\\tcreator_id\\\\tcreated_on\\\\tlanguage_id\\\\tac...\", \"lang\": \"\\\"language_id\\\"\"}", "columns": ["sentence", "raw_data", "lang"], "columns_mapping": {"sentence": "sentence", "raw_data": "raw_data", "lang": "lang"}, "dataset_description": "This dataset is designed to provide training data\nfor common sense relationships pulls together from various sources.\n\nThe dataset is multi-lingual. See langauge codes and language info\nhere: https://github.com/commonsense/conceptnet5/wiki/Languages\n\n\nThis dataset provides an interface for the conceptnet5 csv file, and\nsome (but not all) of the raw text data used to build conceptnet5:\nomcsnet_sentences_free.txt, and omcsnet_sentences_more.txt.\n\nOne use of this dataset would be to learn to extract the conceptnet\nrelationship from the omcsnet sentences.\n\nConceptnet5 has 34,074,917 relationships. Of those relationships,\nthere are 2,176,099 surface text sentences related to those 2M\nentries.\n\nomcsnet_sentences_free has 898,161 lines. omcsnet_sentences_more has\n2,001,736 lines.\n\nOriginal downloads are available here\nhttps://github.com/commonsense/conceptnet5/wiki/Downloads. For more\ninformation, see: https://github.com/commonsense/conceptnet5/wiki\n\nThe omcsnet data comes with the following warning from the authors of\nthe above site: Remember: this data comes from various forms of\ncrowdsourcing. 
Sentences in these files are not necessarily true,\nuseful, or appropriate.\n\n", "dataset_name": "conceptnet5"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:de", "language:en", "language:es", "language:fr", "language:it", "language:ja", "language:nl", "language:pt", "language:ru", "language:zh"], "is_gated": false}, "conll2000": {"dataset_name": "conll2000", "description": " Text chunking consists of dividing a text in syntactically correlated parts of words. For example, the sentence\n He reckons the current account deficit will narrow to only # 1.8 billion in September . can be divided as follows:\n[NP He ] [VP reckons ] [NP the current account deficit ] [VP will narrow ] [PP to ] [NP only # 1.8 billion ]\n[PP in ] [NP September ] .\n\nText chunking is an intermediate step towards full parsing. It was the shared task for CoNLL-2000. Training and test\ndata for this task is available. This data consists of the same partitions of the Wall Street Journal corpus (WSJ)\nas the widely used data for noun phrase chunking: sections 15-18 as training data (211727 tokens) and section 20 as\ntest data (47377 tokens). 
The annotation of the data has been derived from the WSJ corpus by a program written by\nSabine Buchholz from Tilburg University, The Netherlands.", "downloads": 327, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Confidence\\\", \\\"in\\\", \\\"the\\\", \\\"pound\\\", \\\"is\\\", \\\"widely...\", \"pos_tags\": \"[19, 14, 11, 19, 39, 27, 37, 32, 34, 11, 15, 19, 1...\", \"chunk_tags\": \"[11, 13, 11, 12, 21, 22, 22, 22, 22, 11, 12, 12, 1...\"}", "columns": ["id", "tokens", "pos_tags", "chunk_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "chunk_tags": "chunk_tags"}, "dataset_description": " Text chunking consists of dividing a text in syntactically correlated parts of words. For example, the sentence\n He reckons the current account deficit will narrow to only # 1.8 billion in September . can be divided as follows:\n[NP He ] [VP reckons ] [NP the current account deficit ] [VP will narrow ] [PP to ] [NP only # 1.8 billion ]\n[PP in ] [NP September ] .\n\nText chunking is an intermediate step towards full parsing. It was the shared task for CoNLL-2000. Training and test\ndata for this task is available. This data consists of the same partitions of the Wall Street Journal corpus (WSJ)\nas the widely used data for noun phrase chunking: sections 15-18 as training data (211727 tokens) and section 20 as\ntest data (47377 tokens). 
The annotation of the data has been derived from the WSJ corpus by a program written by\nSabine Buchholz from Tilburg University, The Netherlands.\n", "dataset_name": "conll2000"}}, "tags": ["language:en"], "is_gated": false}, "conll2002": {"dataset_name": "conll2002", "description": "Named entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\n\nThe shared task of CoNLL-2002 concerns language-independent named entity recognition.\nWe will concentrate on four types of named entities: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups.\nThe participants of the shared task will be offered training and test data for at least two languages.\nThey will use the data for developing a named-entity recognition system that includes a machine learning component.\nInformation sources other than the training data may be used in this shared task.\nWe are especially interested in methods that can use additional unannotated data for improving their performance (for example co-training).\n\nThe train/validation/test sets are available in Spanish and Dutch.\n\nFor more details see https://www.clips.uantwerpen.be/conll2002/ner/ and https://www.aclweb.org/anthology/W02-2024/", "downloads": 862, "configs": {"es": {"config_name": "es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Melbourne\\\", \\\"(\\\", \\\"Australia\\\", \\\")\\\", \\\",\\\", \\\"25\\\", \\\"m...\", \"pos_tags\": \"[29, 21, 29, 22, 13, 59, 28, 21, 28, 22, 20]\", \"ner_tags\": \"[5, 0, 5, 0, 0, 0, 0, 0, 3, 0, 0]\"}", "columns": ["id", "tokens", "pos_tags", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "ner_tags": "ner_tags"}, "dataset_description": "Named entities are phrases that contain the 
names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\n\nThe shared task of CoNLL-2002 concerns language-independent named entity recognition.\nWe will concentrate on four types of named entities: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups.\nThe participants of the shared task will be offered training and test data for at least two languages.\nThey will use the data for developing a named-entity recognition system that includes a machine learning component.\nInformation sources other than the training data may be used in this shared task.\nWe are especially interested in methods that can use additional unannotated data for improving their performance (for example co-training).\n\nThe train/validation/test sets are available in Spanish and Dutch.\n\nFor more details see https://www.clips.uantwerpen.be/conll2002/ner/ and https://www.aclweb.org/anthology/W02-2024/\n", "dataset_name": "conll2002"}, "nl": {"config_name": "nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"De\\\", \\\"tekst\\\", \\\"van\\\", \\\"het\\\", \\\"arrest\\\", \\\"is\\\", \\\"nog...\", \"pos_tags\": \"[2, 6, 8, 2, 6, 11, 1, 1, 0, 0, 3, 2, 6, 11, 1, 11...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "pos_tags", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "ner_tags": "ner_tags"}, "dataset_description": "Named entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\n\nThe shared task of CoNLL-2002 concerns language-independent named 
entity recognition.\nWe will concentrate on four types of named entities: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups.\nThe participants of the shared task will be offered training and test data for at least two languages.\nThey will use the data for developing a named-entity recognition system that includes a machine learning component.\nInformation sources other than the training data may be used in this shared task.\nWe are especially interested in methods that can use additional unannotated data for improving their performance (for example co-training).\n\nThe train/validation/test sets are available in Spanish and Dutch.\n\nFor more details see https://www.clips.uantwerpen.be/conll2002/ner/ and https://www.aclweb.org/anthology/W02-2024/\n", "dataset_name": "conll2002"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "task_ids:part-of-speech", "annotations_creators:crowdsourced", "multilinguality:multilingual", "source_datasets:original", "language:es", "language:nl"], "is_gated": false}, "conll2003": {"dataset_name": "conll2003", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. 
Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "downloads": 77284, "configs": {"conll2003": {"config_name": "conll2003", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"EU\\\", \\\"rejects\\\", \\\"German\\\", \\\"call\\\", \\\"to\\\", \\\"boycott...\", \"pos_tags\": \"[22, 42, 16, 21, 35, 37, 16, 21, 7]\", \"chunk_tags\": \"[11, 21, 11, 12, 21, 22, 11, 12, 0]\", \"ner_tags\": \"[3, 0, 7, 0, 0, 0, 7, 0, 0]\"}", "columns": ["id", "tokens", "pos_tags", "chunk_tags", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "chunk_tags": "chunk_tags", "ner_tags": "ner_tags"}, "dataset_description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. 
Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "dataset_name": "conll2003"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "task_ids:part-of-speech", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|other-reuters-corpus", "language:en"], "is_gated": false}, "conllpp": {"dataset_name": "conllpp", "description": "CoNLLpp is a corrected version of the CoNLL2003 NER dataset where labels of 5.38% of the sentences in the test set\nhave been manually corrected. The training set and development set are included for completeness.\nFor more details see https://www.aclweb.org/anthology/D19-1519/ and https://github.com/ZihanWangKi/CrossWeigh", "downloads": 1229, "configs": {"conllpp": {"config_name": "conllpp", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"EU\\\", \\\"rejects\\\", \\\"German\\\", \\\"call\\\", \\\"to\\\", \\\"boycott...\", \"pos_tags\": \"[22, 42, 16, 21, 35, 37, 16, 21, 7]\", \"chunk_tags\": \"[11, 21, 11, 12, 21, 22, 11, 12, 0]\", \"ner_tags\": \"[3, 0, 7, 0, 0, 0, 7, 0, 0]\"}", "columns": ["id", "tokens", "pos_tags", "chunk_tags", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "chunk_tags": "chunk_tags", "ner_tags": "ner_tags"}, "dataset_description": "CoNLLpp is a corrected version of the CoNLL2003 NER dataset where labels of 5.38% of the sentences in the test set\nhave been manually corrected. 
The training set and development set are included for completeness.\nFor more details see https://www.aclweb.org/anthology/D19-1519/ and https://github.com/ZihanWangKi/CrossWeigh\n", "dataset_name": "conllpp"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|conll2003", "language:en"], "is_gated": false}, "conv_ai_3": {"dataset_name": "conv_ai_3", "description": "The Conv AI 3 challenge is organized as part of the Search-oriented Conversational AI (SCAI) EMNLP workshop in 2020. The main aim of the conversational systems is to return an appropriate answer in response to the user requests. However, some user requests might be ambiguous. In Information Retrieval (IR) settings such a situation is handled mainly through the diversification of search result page. It is however much more challenging in dialogue settings. Hence, we aim to study the following situation for dialogue settings:\n- a user is asking an ambiguous question (where ambiguous question is a question to which one can return > 1 possible answers)\n- the system must identify that the question is ambiguous, and, instead of trying to answer it directly, ask a good clarifying question.", "downloads": 396, "configs": {"conv_ai_3": {"config_name": "conv_ai_3", "sample_row": "{\"topic_id\": \"1\", \"initial_request\": \"\\\"Tell me about Obama family tree.\\\"\", \"topic_desc\": \"\\\"Find information on President Barack Obama\\\\\\\\'s fa...\", \"clarification_need\": \"2\", \"facet_id\": \"\\\"F0001\\\"\", \"facet_desc\": \"\\\"Find the TIME magazine photo essay \\\\\\\"Barack Obama...\", \"question_id\": \"\\\"Q00384\\\"\", \"question\": \"\\\"are you interested in seeing barack obamas family...\", \"answer\": \"\\\"yes am interested in obamas family\\\"\"}", "columns": ["topic_id", "initial_request", "topic_desc", "clarification_need", "facet_id", "facet_desc", 
"question_id", "question", "answer"], "columns_mapping": {"topic_id": "topic_id", "initial_request": "initial_request", "topic_desc": "topic_desc", "clarification_need": "clarification_need", "facet_id": "facet_id", "facet_desc": "facet_desc", "question_id": "question_id", "question": "question", "answer": "answer"}, "dataset_description": "The Conv AI 3 challenge is organized as part of the Search-oriented Conversational AI (SCAI) EMNLP workshop in 2020. The main aim of the conversational systems is to return an appropriate answer in response to the user requests. However, some user requests might be ambiguous. In Information Retrieval (IR) settings such a situation is handled mainly through the diversification of search result page. It is however much more challenging in dialogue settings. Hence, we aim to study the following situation for dialogue settings:\n- a user is asking an ambiguous question (where ambiguous question is a question to which one can return > 1 possible answers)\n- the system must identify that the question is ambiguous, and, instead of trying to answer it directly, ask a good clarifying question.\n", "dataset_name": "conv_ai_3"}}, "tags": ["task_categories:conversational", "task_categories:text-classification", "task_ids:text-scoring", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "evaluating-dialogue-systems"], "is_gated": false}, "conv_questions": {"dataset_name": "conv_questions", "description": "ConvQuestions is the first realistic benchmark for conversational question answering over knowledge graphs.\nIt contains 11,200 conversations which can be evaluated over Wikidata. 
The questions feature a variety of complex\nquestion phenomena like comparisons, aggregations, compositionality, and temporal reasoning.", "downloads": 332, "configs": {"default": {"config_name": "default", "sample_row": "{\"domain\": \"\\\"music\\\"\", \"seed_entity\": \"\\\"https://www.wikidata.org/wiki/Q223495\\\"\", \"seed_entity_text\": \"\\\"The Carpenters\\\"\", \"questions\": \"[\\\"When did The Carpenters sign with A&M Records?\\\",...\", \"answers\": \"[[\\\"1969\\\"], [\\\"https://www.wikidata.org/wiki/Q928282...\", \"answer_texts\": \"[\\\"1969\\\", \\\"(They Long to Be) Close to You\\\", \\\"1983\\\",...\"}", "columns": ["domain", "seed_entity", "seed_entity_text", "questions", "answers", "answer_texts"], "columns_mapping": {"domain": "domain", "seed_entity": "seed_entity", "seed_entity_text": "seed_entity_text", "questions": "questions", "answers": "answers", "answer_texts": "answer_texts"}, "dataset_description": "ConvQuestions is the first realistic benchmark for conversational question answering over knowledge graphs.\nIt contains 11,200 conversations which can be evaluated over Wikidata. 
The questions feature a variety of complex\nquestion phenomena like comparisons, aggregations, compositionality, and temporal reasoning.", "dataset_name": "conv_questions"}}, "tags": ["task_categories:question-answering", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:open-domain-qa", "task_ids:dialogue-modeling", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "cos_e": {"dataset_name": "cos_e", "description": "Common Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.", "downloads": 3784, "configs": {"v1.0": {"config_name": "v1.0", "sample_row": "{\"id\": \"\\\"d3b479933e716fb388dfb297e881054c\\\"\", \"question\": \"\\\"If a lantern is not for sale, where is it likely ...\", \"choices\": \"[\\\"antique shop\\\", \\\"house\\\", \\\"dark place\\\"]\", \"answer\": \"\\\"house\\\"\", \"abstractive_explanation\": \"\\\"a house is the only place that is not likely to s...\", \"extractive_explanation\": \"\\\"not for sale\\\"\"}", "columns": ["id", "question", "choices", "answer", "abstractive_explanation", "extractive_explanation"], "columns_mapping": {"id": "id", "question": "question", "choices": "choices", "answer": "answer", "abstractive_explanation": "abstractive_explanation", "extractive_explanation": "extractive_explanation"}, "dataset_description": "\nCommon Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.\n", "dataset_name": "cos_e"}, "v1.11": {"config_name": "v1.11", "sample_row": "{\"id\": \"\\\"6b819727eb8a670df26a7ffad036c119\\\"\", \"question\": \"\\\"\\\\\\\"There are 10 apples on an apple tree. 
Three fa...\", \"choices\": \"[\\\"park\\\", \\\"coloring book\\\", \\\"garden center\\\", \\\"math p...\", \"answer\": \"\\\"math problem\\\"\", \"abstractive_explanation\": \"\\\"webmath is designed to help you solve\\\"\", \"extractive_explanation\": \"\\\"\\\\\\\"there are 10 apples on an apple tree. three fal...\"}", "columns": ["id", "question", "choices", "answer", "abstractive_explanation", "extractive_explanation"], "columns_mapping": {"id": "id", "question": "question", "choices": "choices", "answer": "answer", "abstractive_explanation": "abstractive_explanation", "extractive_explanation": "extractive_explanation"}, "dataset_description": "\nCommon Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.\n", "dataset_name": "cos_e"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|commonsense_qa", "language:en"], "is_gated": false}, "cosmos_qa": {"dataset_name": "cosmos_qa", "description": "Cosmos QA is a large-scale dataset of 35.6K problems that require commonsense-based reading comprehension, formulated as multiple-choice questions. 
It focuses on reading between the lines over a diverse collection of people's everyday narratives, asking questions concerning on the likely causes or effects of events that require reasoning beyond the exact text spans in the context", "downloads": 18607, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"3Q9SPIIRWJKVQ8244310E8TUS6YWAC##34V1S5K3GTZMDUBNB...\", \"context\": \"\\\"Good Old War and person L : I saw both of these b...\", \"question\": \"\\\"In the future , will this person go to see other ...\", \"answer0\": \"\\\"None of the above choices .\\\"\", \"answer1\": \"\\\"This person likes music and likes to see the show...\", \"answer2\": \"\\\"This person only likes Good Old War and Person L ...\", \"answer3\": \"\\\"Other Bands is not on tour and this person can no...\", \"label\": \"1\"}", "columns": ["id", "context", "question", "answer0", "answer1", "answer2", "answer3", "label"], "columns_mapping": {"id": "id", "context": "context", "question": "question", "answer0": "answer0", "answer1": "answer1", "answer2": "answer2", "answer3": "answer3", "label": "label"}, "dataset_description": "Cosmos QA is a large-scale dataset of 35.6K problems that require commonsense-based reading comprehension, formulated as multiple-choice questions. It focuses on reading between the lines over a diverse collection of people's everyday narratives, asking questions concerning on the likely causes or effects of events that require reasoning beyond the exact text spans in the context\n", "dataset_name": "cosmos_qa"}}, "tags": ["task_categories:multiple-choice", "task_ids:multiple-choice-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "counter": {"dataset_name": "counter", "description": " The COrpus of Urdu News TExt Reuse (COUNTER) corpus contains 1200 documents with real examples of text reuse from the field of journalism. 
It has been manually annotated at document level with three levels of reuse: wholly derived, partially derived and non derived.", "downloads": 293, "configs": {"default": {"config_name": "default", "sample_row": "{\"source.filename\": \"\\\"0001.xml\\\"\", \"source.headline\": \"\\\"\\\\u0628\\\\u0646\\\\u06af\\\\u0627\\\\u0644 \\\\u0679\\\\u0627\\\\u0626...\", \"source.body\": \"\\\"\\\\u0688\\\\u06be\\\\u0627\\\\u06a9\\\\u06c1 \\\\u06d4 \\\\u06cc\\\\u06a...\", \"source.total_number_of_words\": \"352\", \"source.total_number_of_sentences\": \"15\", \"source.number_of_words_with_swr\": \"245\", \"source.newspaper\": \"\\\"APP\\\"\", \"source.newsdate\": \"\\\"01.12.14\\\"\", \"source.domain\": \"1\", \"source.classification\": \"1\", \"derived.filename\": \"\\\"0001p.xml\\\"\", \"derived.headline\": \"\\\"\\\\u0628\\\\u0646\\\\u06af\\\\u0644\\\\u06c1 \\\\u062f\\\\u06cc\\\\u0634...\", \"derived.body\": \"\\\"\\\\u0645\\\\u06cc\\\\u0631 \\\\u067e\\\\u0648\\\\u0631(\\\\u0648\\\\u064...\", \"derived.total_number_of_words\": \"393\", \"derived.total_number_of_sentences\": \"13\", \"derived.number_of_words_with_swr\": \"265\", \"derived.newspaper\": \"\\\"daily_waqt\\\"\", \"derived.newsdate\": \"\\\"02.12.14\\\"\", \"derived.domain\": \"1\", \"derived.classification\": \"1\"}", "columns": ["source_filename", "source_headline", "source_body", "source_total_number_of_words", "source_total_number_of_sentences", "source_number_of_words_with_swr", "source_newspaper", "source_newsdate", "source_domain", "source_classification", "derived_filename", "derived_headline", "derived_body", "derived_total_number_of_words", "derived_total_number_of_sentences", "derived_number_of_words_with_swr", "derived_newspaper", "derived_newsdate", "derived_domain", "derived_classification"], "columns_mapping": {"source.filename": "source_filename", "source.headline": "source_headline", "source.body": "source_body", "source.total_number_of_words": "source_total_number_of_words", 
"source.total_number_of_sentences": "source_total_number_of_sentences", "source.number_of_words_with_swr": "source_number_of_words_with_swr", "source.newspaper": "source_newspaper", "source.newsdate": "source_newsdate", "source.domain": "source_domain", "source.classification": "source_classification", "derived.filename": "derived_filename", "derived.headline": "derived_headline", "derived.body": "derived_body", "derived.total_number_of_words": "derived_total_number_of_words", "derived.total_number_of_sentences": "derived_total_number_of_sentences", "derived.number_of_words_with_swr": "derived_number_of_words_with_swr", "derived.newspaper": "derived_newspaper", "derived.newsdate": "derived_newsdate", "derived.domain": "derived_domain", "derived.classification": "derived_classification"}, "dataset_description": " The COrpus of Urdu News TExt Reuse (COUNTER) corpus contains 1200 documents with real examples of text reuse from the field of journalism. It has been manually annotated at document level with three levels of reuse: wholly derived, partially derived and non derived.\n", "dataset_name": "counter"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:semantic-similarity-scoring", "task_ids:topic-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ur"], "is_gated": false}, "covid_qa_castorini": {"dataset_name": "covid_qa_castorini", "description": "CovidQA is the beginnings of a question answering dataset specifically designed for COVID-19, built by hand from knowledge gathered from Kaggle's COVID-19 Open Research Dataset Challenge.", "downloads": 359, "configs": {"covid_qa_castorini": {"config_name": "covid_qa_castorini", "sample_row": "{\"category_name\": \"\\\"Incubation period\\\"\", \"question_query\": \"\\\"What is the incubation period of the virus?\\\"\", \"keyword_query\": \"\\\"Incubation period of the virus\\\"\", \"answers.id\": 
\"[\\\"wuclekt6\\\", \\\"e3t1f0rt\\\", \\\"ragcpbl6\\\", \\\"n0uwy77g\\\", \\\"...\", \"answers.title\": \"[\\\"Longitudinal analysis of laboratory findings dur...\", \"answers.exact_answer\": \"[\\\"4 days (IQR, 2-7)\\\", \\\"5.84 (99% CI: 4.83, 6.85) d...\"}", "columns": ["category_name", "question_query", "keyword_query", "answers_id", "answers_title", "answers_exact_answer"], "columns_mapping": {"category_name": "category_name", "question_query": "question_query", "keyword_query": "keyword_query", "answers.id": "answers_id", "answers.title": "answers_title", "answers.exact_answer": "answers_exact_answer"}, "dataset_description": "CovidQA is the beginnings of a question answering dataset specifically designed for COVID-19, built by hand from knowledge gathered from Kaggle's COVID-19 Open Research Dataset Challenge.\n", "dataset_name": "covid_qa_castorini"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "crawl_domain": {"dataset_name": "crawl_domain", "description": "Corpus of domain names scraped from Common Crawl and manually annotated to add word boundaries (e.g. \"commoncrawl\" to \"common crawl\"). Breaking domain names such as \"openresearch\" into component words \"open\" and \"research\" is important for applications such as Text-to-Speech synthesis and web search. Common Crawl is an open repository of web crawl data that can be accessed and analyzed by anyone. Specifically, we scraped the plaintext (WET) extracts for domain names from URLs that contained diverse letter casing (e.g. \"OpenBSD\"). Although in the previous example, segmentation is trivial using letter casing, this was not always the case (e.g. \"NASA\"), so we had to manually annotate the data. 
The dataset is stored as plaintext file where each line is an example of space separated segments of a domain name. The examples are stored in their original letter casing, but harder and more interesting examples can be generated by lowercasing the input first.", "downloads": 321, "configs": {"default": {"config_name": "default", "sample_row": "{\"example\": \"\\\"Insign is Interactive\\\"\"}", "columns": ["example"], "columns_mapping": {"example": "example"}, "dataset_description": "Corpus of domain names scraped from Common Crawl and manually annotated to add word boundaries (e.g. \"commoncrawl\" to \"common crawl\"). Breaking domain names such as \"openresearch\" into component words \"open\" and \"research\" is important for applications such as Text-to-Speech synthesis and web search. Common Crawl is an open repository of web crawl data that can be accessed and analyzed by anyone. Specifically, we scraped the plaintext (WET) extracts for domain names from URLs that contained diverse letter casing (e.g. \"OpenBSD\"). Although in the previous example, segmentation is trivial using letter casing, this was not always the case (e.g. \"NASA\"), so we had to manually annotate the data. The dataset is stored as plaintext file where each line is an example of space separated segments of a domain name. 
The examples are stored in their original letter casing, but harder and more interesting examples can be generated by lowercasing the input first.", "dataset_name": "crawl_domain"}}, "tags": ["task_categories:other", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|other-Common-Crawl", "source_datasets:original", "language:en", "web-search", "text-to-speech"], "is_gated": false}, "crd3": {"dataset_name": "crd3", "description": "Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.\nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding\nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player\ncollaboration and spoken interaction. 
For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,\nand semantic ties to the previous dialogues.", "downloads": 313, "configs": {"default": {"config_name": "default", "sample_row": "{\"chunk\": \"\\\"Matthew Mercer introduces himself and the concept...\", \"chunk_id\": \"0\", \"turn_start\": \"0\", \"turn_end\": \"0\", \"alignment_score\": \"0.0\", \"turns\": \"[{\\\"names\\\": [\\\"MATT\\\"], \\\"utterances\\\": [\\\"Hello everyon...\"}", "columns": ["chunk", "chunk_id", "turn_start", "turn_end", "alignment_score", "turns"], "columns_mapping": {"chunk": "chunk", "chunk_id": "chunk_id", "turn_start": "turn_start", "turn_end": "turn_end", "alignment_score": "alignment_score", "turns": "turns"}, "dataset_description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.\nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding\nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player\ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,\nand semantic ties to the previous dialogues.\n", "dataset_name": "crd3"}}, "tags": ["task_categories:summarization", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "cs_restaurants": {"dataset_name": "cs_restaurants", "description": "This is a dataset for NLG in task-oriented spoken dialogue systems with Czech as the target language. 
It originated as\na translation of the English San Francisco Restaurants dataset by Wen et al. (2015).", "downloads": 721, "configs": {"CSRestaurants": {"config_name": "CSRestaurants", "sample_row": "{\"da\": \"\\\"inform(food=Indian,good_for_meal='lunch or dinner...\", \"delex_da\": \"\\\"inform(food=X-food,good_for_meal=X-good_for_meal,...\", \"text\": \"\\\"Ko\\\\u010d\\\\u00e1r z V\\\\u00eddn\\\\u011b pod\\\\u00e1v\\\\u00e...\", \"delex_text\": \"\\\"X-name pod\\\\u00e1v\\\\u00e1 X-food pokrmy a d\\\\u00e1 s...\"}", "columns": ["da", "delex_da", "text", "delex_text"], "columns_mapping": {"da": "da", "delex_da": "delex_da", "text": "text", "delex_text": "delex_text"}, "dataset_description": "This is a dataset for NLG in task-oriented spoken dialogue systems with Czech as the target language. It originated as\na translation of the English San Francisco Restaurants dataset by Wen et al. (2015).\n", "dataset_name": "cs_restaurants"}}, "tags": ["task_categories:text2text-generation", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:extended|other-san-francisco-restaurants", "language:cs", "intent-to-text"], "is_gated": false}, "cuad": {"dataset_name": "cuad", "description": "Contract Understanding Atticus Dataset (CUAD) v1 is a corpus of more than 13,000 labels in 510\ncommercial legal contracts that have been manually labeled to identify 41 categories of important\nclauses that lawyers look for when reviewing contracts in connection with corporate transactions.", "downloads": 1065, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEME...\", \"title\": \"\\\"LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEME...\", \"context\": \"\\\"EXHIBIT 10.6\\\\n\\\\n DIS...\", \"question\": \"\\\"Highlight the parts 
(if any) of this contract rel...\", \"answers.text\": \"[\\\"DISTRIBUTOR AGREEMENT\\\"]\", \"answers.answer_start\": \"[44]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "Contract Understanding Atticus Dataset (CUAD) v1 is a corpus of more than 13,000 labels in 510\ncommercial legal contracts that have been manually labeled to identify 41 categories of important\nclauses that lawyers look for when reviewing contracts in connection with corporate transactions.\n", "dataset_name": "cuad"}}, "tags": ["task_categories:question-answering", "task_ids:closed-domain-qa", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "curiosity_dialogs": {"dataset_name": "curiosity_dialogs", "description": "This dataset contains 14K dialogs (181K utterances) where users and assistants converse about geographic topics like\ngeopolitical entities and locations. This dataset is annotated with pre-existing user knowledge, message-level dialog\nacts, grounding to Wikipedia, and user reactions to messages.", "downloads": 312, "configs": {"curiosity_dialogs": {"config_name": "curiosity_dialogs", "sample_row": "{\"messages.message\": \"[\\\"Hi. 
I want information about Namibia.\\\", \\\"Nmbia i...\", \"messages.liked\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\", \"messages.sender\": \"[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]\", \"messages.facts\": \"[{\\\"fid\\\": [], \\\"used\\\": [], \\\"source\\\": []}, {\\\"fid\\\": [7...\", \"messages.message_id\": \"[\\\"617343895\\\", \\\"2842515356\\\", \\\"4240816985\\\", \\\"5207110...\", \"messages.dialog_acts\": \"[[\\\"request_topic\\\"], [\\\"inform_response\\\"], [\\\"request...\", \"known_entities\": \"[\\\"South Africa\\\", \\\"United Kingdom\\\", \\\"Portugal\\\"]\", \"focus_entity\": \"\\\"Namibia\\\"\", \"dialog_id\": \"21922\", \"inferred_steps\": \"1\", \"created_time\": \"1571783665\", \"aspects\": \"[\\\"Media\\\", \\\"Politics and government\\\"]\", \"first_aspect\": \"\\\"Media\\\"\", \"second_aspect\": \"\\\"Politics and government\\\"\", \"shuffle_facts\": \"1\", \"related_entities\": \"[\\\"Western Roman Empire\\\", \\\"United Kingdom\\\", \\\"Portug...\", \"tag\": \"\\\"round_2\\\"\", \"user_id\": \"207\", \"assistant_id\": \"341\", \"is_annotated\": \"0\", \"user_dialog_rating\": \"5\", \"user_other_agent_rating\": \"5\", \"assistant_dialog_rating\": \"5\", \"assistant_other_agent_rating\": \"5\", \"reported\": \"0\", \"annotated\": \"1\"}", "columns": ["messages_message", "messages_liked", "messages_sender", "messages_facts", "messages_message_id", "messages_dialog_acts", "known_entities", "focus_entity", "dialog_id", "inferred_steps", "created_time", "aspects", "first_aspect", "second_aspect", "shuffle_facts", "related_entities", "tag", "user_id", "assistant_id", "is_annotated", "user_dialog_rating", "user_other_agent_rating", "assistant_dialog_rating", "assistant_other_agent_rating", "reported", "annotated"], "columns_mapping": {"messages.message": "messages_message", "messages.liked": "messages_liked", "messages.sender": "messages_sender", "messages.facts": "messages_facts", "messages.message_id": "messages_message_id", 
"messages.dialog_acts": "messages_dialog_acts", "known_entities": "known_entities", "focus_entity": "focus_entity", "dialog_id": "dialog_id", "inferred_steps": "inferred_steps", "created_time": "created_time", "aspects": "aspects", "first_aspect": "first_aspect", "second_aspect": "second_aspect", "shuffle_facts": "shuffle_facts", "related_entities": "related_entities", "tag": "tag", "user_id": "user_id", "assistant_id": "assistant_id", "is_annotated": "is_annotated", "user_dialog_rating": "user_dialog_rating", "user_other_agent_rating": "user_other_agent_rating", "assistant_dialog_rating": "assistant_dialog_rating", "assistant_other_agent_rating": "assistant_other_agent_rating", "reported": "reported", "annotated": "annotated"}, "dataset_description": "This dataset contains 14K dialogs (181K utterances) where users and assistants converse about geographic topics like\ngeopolitical entities and locations. This dataset is annotated with pre-existing user knowledge, message-level dialog\nacts, grounding to Wikipedia, and user reactions to messages.\n", "dataset_name": "curiosity_dialogs"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "conversational-curiosity"], "is_gated": false}, "dane": {"dataset_name": "dane", "description": "The DaNE dataset has been annotated with Named Entities for PER, ORG and LOC\nby the Alexandra Institute.\nIt is a reannotation of the UD-DDT (Universal Dependency - Danish Dependency Treebank)\nwhich has annotations for dependency parsing and part-of-speech (POS) tagging.\nThe Danish UD treebank (Johannsen et al., 2015, UD-DDT) is a conversion of\nthe Danish Dependency Treebank (Buch-Kromann et al. 
2003) based on texts\nfrom Parole (Britt, 1998).", "downloads": 857, "configs": {"default": {"config_name": "default", "sample_row": "{\"sent_id\": \"\\\"train-v2-0\\\\n\\\"\", \"text\": \"\\\"P\\\\u00e5 fredag har SID inviteret til reception i ...\", \"tok_ids\": \"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...\", \"tokens\": \"[\\\"P\\\\u00e5\\\", \\\"fredag\\\", \\\"har\\\", \\\"SID\\\", \\\"inviteret\\\", \\\"...\", \"lemmas\": \"[\\\"p\\\\u00e5\\\", \\\"fredag\\\", \\\"have\\\", \\\"SiD\\\", \\\"invitere\\\", \\\"...\", \"pos_tags\": \"[11, 12, 5, 7, 3, 11, 12, 11, 12, 11, 12, 11, 16, ...\", \"morph_tags\": \"[\\\"AdpType=Prep\\\", \\\"Definite=Ind|Gender=Com|Number=S...\", \"dep_ids\": \"[2, 5, 5, 5, 0, 7, 5, 9, 7, 11, 7, 17, 17, 17, 14,...\", \"dep_labels\": \"[35, 16, 28, 33, 19, 35, 16, 35, 18, 35, 18, 1, 1,...\", \"ner_tags\": \"[0, 0, 0, 3, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 1, 2, 0...\"}", "columns": ["sent_id", "text", "tok_ids", "tokens", "lemmas", "pos_tags", "morph_tags", "dep_ids", "dep_labels", "ner_tags"], "columns_mapping": {"sent_id": "sent_id", "text": "text", "tok_ids": "tok_ids", "tokens": "tokens", "lemmas": "lemmas", "pos_tags": "pos_tags", "morph_tags": "morph_tags", "dep_ids": "dep_ids", "dep_labels": "dep_labels", "ner_tags": "ner_tags"}, "dataset_description": "The DaNE dataset has been annotated with Named Entities for PER, ORG and LOC\nby the Alexandra Institute.\nIt is a reannotation of the UD-DDT (Universal Dependency - Danish Dependency Treebank)\nwhich has annotations for dependency parsing and part-of-speech (POS) tagging.\nThe Danish UD treebank (Johannsen et al., 2015, UD-DDT) is a conversion of\nthe Danish Dependency Treebank (Buch-Kromann et al. 
2003) based on texts\nfrom Parole (Britt, 1998).\n", "dataset_name": "dane"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "task_ids:part-of-speech", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|other-Danish-Universal-Dependencies-treebank", "language:da"], "is_gated": false}, "danish_political_comments": {"dataset_name": "danish_political_comments", "description": "The dataset consists of 9008 sentences that are labelled with fine-grained polarity in the range from -2 to 2 (negative to postive). The quality of the fine-grained is not cross validated and is therefore subject to uncertainties; however, the simple polarity has been cross validated and therefore is considered to be more correct.", "downloads": 362, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"sentence\": \"\\\"Synes i ikke det er synd for hende ja undskyld mi...\", \"target\": \"3\"}", "columns": ["id", "sentence", "target"], "columns_mapping": {"id": "id", "sentence": "sentence", "target": "target"}, "dataset_description": "The dataset consists of 9008 sentences that are labelled with fine-grained polarity in the range from -2 to 2 (negative to postive). 
The quality of the fine-grained is not cross validated and is therefore subject to uncertainties; however, the simple polarity has been cross validated and therefore is considered to be more correct.\n", "dataset_name": "danish_political_comments"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:da"], "is_gated": false}, "dart": {"dataset_name": "dart", "description": "DART is a large and open-domain structured DAta Record to Text generation corpus with high-quality\nsentence annotations with each input being a set of entity-relation triples following a tree-structured ontology.\nIt consists of 82191 examples across different domains with each input being a semantic RDF triple set derived\nfrom data records in tables and the tree ontology of table schema, annotated with sentence description that\ncovers all facts in the triple set.\n\nDART is released in the following paper where you can find more details and baseline results:\nhttps://arxiv.org/abs/2007.02871", "downloads": 1029, "configs": {"default": {"config_name": "default", "sample_row": "{\"tripleset\": \"[[\\\"First Clearing\\\", \\\"LOCATION\\\", \\\"On NYS 52 1 Mi. 
Y...\", \"subtree_was_extended\": \"false\", \"annotations.source\": \"[\\\"WikiTableQuestions_mturk\\\"]\", \"annotations.text\": \"[\\\"First Clearing\\\\tbased on Callicoon, New York and...\"}", "columns": ["tripleset", "subtree_was_extended", "annotations_source", "annotations_text"], "columns_mapping": {"tripleset": "tripleset", "subtree_was_extended": "subtree_was_extended", "annotations.source": "annotations_source", "annotations.text": "annotations_text"}, "dataset_description": "DART is a large and open-domain structured DAta Record to Text generation corpus with high-quality\nsentence annotations with each input being a set of entity-relation triples following a tree-structured ontology.\nIt consists of 82191 examples across different domains with each input being a semantic RDF triple set derived\nfrom data records in tables and the tree ontology of table schema, annotated with sentence description that\ncovers all facts in the triple set.\n\nDART is released in the following paper where you can find more details and baseline results:\nhttps://arxiv.org/abs/2007.02871\n", "dataset_name": "dart"}}, "tags": ["task_categories:tabular-to-text", "task_ids:rdf-to-text", "annotations_creators:crowdsourced", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:extended|wikitable_questions", "source_datasets:extended|wikisql", "source_datasets:extended|web_nlg", "source_datasets:extended|cleaned_e2e", "language:en"], "is_gated": false}, "dbpedia_14": {"dataset_name": "dbpedia_14", "description": "The DBpedia ontology classification dataset is constructed by picking 14 non-overlapping classes\nfrom DBpedia 2014. They are listed in classes.txt. From each of thse 14 ontology classes, we\nrandomly choose 40,000 training samples and 5,000 testing samples. 
Therefore, the total size\nof the training dataset is 560,000 and testing dataset 70,000.\nThere are 3 columns in the dataset (same for train and test splits), corresponding to class index\n(1 to 14), title and content. The title and content are escaped using double quotes (\"), and any\ninternal double quote is escaped by 2 double quotes (\"\"). There are no new lines in title or content.", "downloads": 6357, "configs": {"dbpedia_14": {"config_name": "dbpedia_14", "sample_row": "{\"label\": \"0\", \"title\": \"\\\"E. D. Abbott Ltd\\\"\", \"content\": \"\\\" Abbott of Farnham E D Abbott Limited was a Briti...\"}", "columns": ["label", "title", "content"], "columns_mapping": {"label": "label", "title": "title", "content": "content"}, "dataset_description": "The DBpedia ontology classification dataset is constructed by picking 14 non-overlapping classes\nfrom DBpedia 2014. They are listed in classes.txt. From each of thse 14 ontology classes, we\nrandomly choose 40,000 training samples and 5,000 testing samples. Therefore, the total size\nof the training dataset is 560,000 and testing dataset 70,000.\nThere are 3 columns in the dataset (same for train and test splits), corresponding to class index\n(1 to 14), title and content. The title and content are escaped using double quotes (\"), and any\ninternal double quote is escaped by 2 double quotes (\"\"). There are no new lines in title or content.\n", "dataset_name": "dbpedia_14"}}, "tags": ["task_categories:text-classification", "task_ids:topic-classification", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "dbrd": {"dataset_name": "dbrd", "description": "The Dutch Book Review Dataset (DBRD) contains over 110k book reviews of which 22k have associated binary sentiment polarity labels. 
It is intended as a benchmark for sentiment classification in Dutch and created due to a lack of annotated datasets in Dutch that are suitable for this task.", "downloads": 1256, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\"Na alle voorgaande boeken van Dan Brown gelezen t...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The Dutch Book Review Dataset (DBRD) contains over 110k book reviews of which 22k have associated binary sentiment polarity labels. It is intended as a benchmark for sentiment classification in Dutch and created due to a lack of annotated datasets in Dutch that are suitable for this task.\n", "dataset_name": "dbrd"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_categories:text-classification", "task_ids:language-modeling", "task_ids:masked-language-modeling", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:nl"], "is_gated": false}, "deal_or_no_dialog": {"dataset_name": "deal_or_no_dialog", "description": "A large dataset of human-human negotiations on a multi-issue bargaining task, where agents who cannot observe each other\u2019s reward functions must reach anagreement (o a deal) via natural language dialogue.", "downloads": 875, "configs": {"dialogues": {"config_name": "dialogues", "sample_row": "{\"input.count\": \"[1, 4, 1]\", \"input.value\": \"[4, 1, 2]\", \"dialogue\": \"\\\"THEM: i would like 4 hats and you can have the re...\", \"output\": \"\\\"item0=1 item1=0 item2=1 item0=0 item1=4 item2=0\\\"...\", \"partner_input.count\": \"[1, 4, 1]\", \"partner_input.value\": \"[0, 2, 2]\"}", "columns": ["input_count", "input_value", "dialogue", "output", "partner_input_count", "partner_input_value"], "columns_mapping": {"input.count": "input_count", "input.value": "input_value", "dialogue": 
"dialogue", "output": "output", "partner_input.count": "partner_input_count", "partner_input.value": "partner_input_value"}, "dataset_description": "A large dataset of human-human negotiations on a multi-issue bargaining task, where agents who cannot observe each other\u2019s reward functions must reach anagreement (o a deal) via natural language dialogue.\n", "dataset_name": "deal_or_no_dialog"}, "self_play": {"config_name": "self_play", "sample_row": "{\"input.count\": \"[1, 1, 3]\", \"input.value\": \"[0, 1, 3]\"}", "columns": ["input_count", "input_value"], "columns_mapping": {"input.count": "input_count", "input.value": "input_value"}, "dataset_description": "A large dataset of human-human negotiations on a multi-issue bargaining task, where agents who cannot observe each other\u2019s reward functions must reach anagreement (o a deal) via natural language dialogue.\n", "dataset_name": "deal_or_no_dialog"}}, "tags": ["task_categories:conversational", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "definite_pronoun_resolution": {"dataset_name": "definite_pronoun_resolution", "description": "Composed by 30 students from one of the author's undergraduate classes. These\nsentence pairs cover topics ranging from real events (e.g., Iran's plan to\nattack the Saudi ambassador to the U.S.) to events/characters in movies (e.g.,\nBatman) and purely imaginary situations, largely reflecting the pop culture as\nperceived by the American kids born in the early 90s. Each annotated example\nspans four lines: the first line contains the sentence, the second line contains\nthe target pronoun, the third line contains the two candidate antecedents, and\nthe fourth line contains the correct antecedent. 
If the target pronoun appears\nmore than once in the sentence, its first occurrence is the one to be resolved.", "downloads": 338, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"sentence\": \"\\\"The bee landed on the flower because it had polle...\", \"pronoun\": \"\\\"it\\\"\", \"candidates\": \"[\\\"The bee\\\", \\\"the flower\\\"]\", \"label\": \"1\"}", "columns": ["sentence", "pronoun", "candidates", "label"], "columns_mapping": {"sentence": "sentence", "pronoun": "pronoun", "candidates": "candidates", "label": "label"}, "dataset_description": "Composed by 30 students from one of the author's undergraduate classes. These\nsentence pairs cover topics ranging from real events (e.g., Iran's plan to\nattack the Saudi ambassador to the U.S.) to events/characters in movies (e.g.,\nBatman) and purely imaginary situations, largely reflecting the pop culture as\nperceived by the American kids born in the early 90s. Each annotated example\nspans four lines: the first line contains the sentence, the second line contains\nthe target pronoun, the third line contains the two candidate antecedents, and\nthe fourth line contains the correct antecedent. If the target pronoun appears\nmore than once in the sentence, its first occurrence is the one to be resolved.\n", "dataset_name": "definite_pronoun_resolution"}}, "tags": ["task_categories:token-classification", "task_ids:word-sense-disambiguation", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "dengue_filipino": {"dataset_name": "dengue_filipino", "description": " Benchmark dataset for low-resource multiclass classification, with 4,015 training, 500 testing, and 500 validation examples, each labeled as part of five classes. Each sample can be a part of multiple classes. 
Collected as tweets.", "downloads": 307, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"Not a good time to get sick.\\\"\", \"absent\": \"0\", \"dengue\": \"0\", \"health\": \"1\", \"mosquito\": \"0\", \"sick\": \"1\"}", "columns": ["text", "absent", "dengue", "health", "mosquito", "sick"], "columns_mapping": {"text": "text", "absent": "absent", "dengue": "dengue", "health": "health", "mosquito": "mosquito", "sick": "sick"}, "dataset_description": " Benchmark dataset for low-resource multiclass classification, with 4,015 training, 500 testing, and 500 validation examples, each labeled as part of five classes. Each sample can be a part of multiple classes. Collected as tweets.\n", "dataset_name": "dengue_filipino"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:crowdsourced", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:tl"], "is_gated": false}, "dialog_re": {"dataset_name": "dialog_re", "description": "DialogRE is the first human-annotated dialogue based relation extraction (RE) dataset aiming\nto support the prediction of relation(s) between two arguments that appear in a dialogue.\nThe dataset annotates all occurrences of 36 possible relation types that exist between pairs\nof arguments in the 1,788 dialogues originating from the complete transcripts of Friends.", "downloads": 293, "configs": {"dialog_re": {"config_name": "dialog_re", "sample_row": "{\"dialog\": \"[\\\"Speaker 1: It's been an hour and not one of my c...\", \"relation_data.x\": \"[\\\"Speaker 2\\\", \\\"Speaker 2\\\", \\\"Speaker 4\\\", \\\"Speaker 4...\", \"relation_data.y\": \"[\\\"Chandler Bing\\\", \\\"Speaker 4\\\", \\\"Tom Gordon\\\", \\\"Spea...\", \"relation_data.x_type\": \"[\\\"PER\\\", \\\"PER\\\", \\\"PER\\\", \\\"PER\\\", \\\"PER\\\", \\\"PER\\\"]\", \"relation_data.y_type\": \"[\\\"PER\\\", \\\"PER\\\", \\\"PER\\\", 
\\\"PER\\\", \\\"PER\\\", \\\"PER\\\"]\", \"relation_data.r\": \"[[\\\"per:alternate_names\\\"], [\\\"per:alumni\\\"], [\\\"per:al...\", \"relation_data.rid\": \"[[30], [4], [30], [4, 1], [30], [37]]\", \"relation_data.t\": \"[[\\\"\\\"], [\\\"\\\"], [\\\"\\\"], [\\\"\\\", \\\"call me\\\"], [\\\"\\\"], [\\\"\\\"]]\"}", "columns": ["dialog", "relation_data_x", "relation_data_y", "relation_data_x_type", "relation_data_y_type", "relation_data_r", "relation_data_rid", "relation_data_t"], "columns_mapping": {"dialog": "dialog", "relation_data.x": "relation_data_x", "relation_data.y": "relation_data_y", "relation_data.x_type": "relation_data_x_type", "relation_data.y_type": "relation_data_y_type", "relation_data.r": "relation_data_r", "relation_data.rid": "relation_data_rid", "relation_data.t": "relation_data_t"}, "dataset_description": "DialogRE is the first human-annotated dialogue based relation extraction (RE) dataset aiming\nto support the prediction of relation(s) between two arguments that appear in a dialogue.\nThe dataset annotates all occurrences of 36 possible relation types that exist between pairs\nof arguments in the 1,788 dialogues originating from the complete transcripts of Friends.\n", "dataset_name": "dialog_re"}}, "tags": ["task_categories:other", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "relation-extraction"], "is_gated": false}, "disaster_response_messages": {"dataset_name": "disaster_response_messages", "description": "This dataset contains 30,000 messages drawn from events including an earthquake in Haiti in 2010, an earthquake in Chile in 2010, floods in Pakistan in 2010, super-storm Sandy in the U.S.A. 
in 2012, and news articles spanning a large number of years and 100s of different disasters.\nThe data has been encoded with 36 different categories related to disaster response and has been stripped of messages with sensitive information in their entirety.\nUpon release, this is the featured dataset of a new Udacity course on Data Science and the AI4ALL summer school and is especially utile for text analytics and natural language processing (NLP) tasks and models.\nThe input data in this job contains thousands of untranslated disaster-related messages and their English translations.", "downloads": 356, "configs": {"default": {"config_name": "default", "sample_row": "{\"split\": \"\\\"train\\\"\", \"message\": \"\\\"Weather update - a cold front from Cuba that coul...\", \"original\": \"\\\"Un front froid se retrouve sur Cuba ce matin. Il ...\", \"genre\": \"\\\"direct\\\"\", \"related\": \"1\", \"PII\": \"0\", \"request\": \"0\", \"offer\": \"0\", \"aid_related\": \"0\", \"medical_help\": \"0\", \"medical_products\": \"0\", \"search_and_rescue\": \"0\", \"security\": \"0\", \"military\": \"0\", \"child_alone\": \"0\", \"water\": \"0\", \"food\": \"0\", \"shelter\": \"0\", \"clothing\": \"0\", \"money\": \"0\", \"missing_people\": \"0\", \"refugees\": \"0\", \"death\": \"0\", \"other_aid\": \"0\", \"infrastructure_related\": \"0\", \"transport\": \"0\", \"buildings\": \"0\", \"electricity\": \"0\", \"tools\": \"0\", \"hospitals\": \"0\", \"shops\": \"0\", \"aid_centers\": \"0\", \"other_infrastructure\": \"0\", \"weather_related\": \"0\", \"floods\": \"0\", \"storm\": \"0\", \"fire\": \"0\", \"earthquake\": \"0\", \"cold\": \"0\", \"other_weather\": \"0\", \"direct_report\": \"0\"}", "columns": ["split", "message", "original", "genre", "related", "PII", "request", "offer", "aid_related", "medical_help", "medical_products", "search_and_rescue", "security", "military", "child_alone", "water", "food", "shelter", "clothing", "money", "missing_people", "refugees", 
"death", "other_aid", "infrastructure_related", "transport", "buildings", "electricity", "tools", "hospitals", "shops", "aid_centers", "other_infrastructure", "weather_related", "floods", "storm", "fire", "earthquake", "cold", "other_weather", "direct_report"], "columns_mapping": {"split": "split", "message": "message", "original": "original", "genre": "genre", "related": "related", "PII": "PII", "request": "request", "offer": "offer", "aid_related": "aid_related", "medical_help": "medical_help", "medical_products": "medical_products", "search_and_rescue": "search_and_rescue", "security": "security", "military": "military", "child_alone": "child_alone", "water": "water", "food": "food", "shelter": "shelter", "clothing": "clothing", "money": "money", "missing_people": "missing_people", "refugees": "refugees", "death": "death", "other_aid": "other_aid", "infrastructure_related": "infrastructure_related", "transport": "transport", "buildings": "buildings", "electricity": "electricity", "tools": "tools", "hospitals": "hospitals", "shops": "shops", "aid_centers": "aid_centers", "other_infrastructure": "other_infrastructure", "weather_related": "weather_related", "floods": "floods", "storm": "storm", "fire": "fire", "earthquake": "earthquake", "cold": "cold", "other_weather": "other_weather", "direct_report": "direct_report"}, "dataset_description": "This dataset contains 30,000 messages drawn from events including an earthquake in Haiti in 2010, an earthquake in Chile in 2010, floods in Pakistan in 2010, super-storm Sandy in the U.S.A. 
in 2012, and news articles spanning a large number of years and 100s of different disasters.\nThe data has been encoded with 36 different categories related to disaster response and has been stripped of messages with sensitive information in their entirety.\nUpon release, this is the featured dataset of a new Udacity course on Data Science and the AI4ALL summer school and is especially utile for text analytics and natural language processing (NLP) tasks and models.\nThe input data in this job contains thousands of untranslated disaster-related messages and their English translations.\n", "dataset_name": "disaster_response_messages"}}, "tags": ["task_categories:text2text-generation", "task_categories:text-classification", "task_ids:intent-classification", "task_ids:sentiment-classification", "task_ids:text-simplification", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:es", "language:fr", "language:ht", "language:ur"], "is_gated": false}, "discofuse": {"dataset_name": "discofuse", "description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.", "downloads": 546, "configs": {"discofuse-sport": {"config_name": "discofuse-sport", "sample_row": "{\"connective_string\": \"\\\"\\\"\", \"discourse_type\": \"\\\"PAIR_ANAPHORA\\\"\", \"coherent_second_sentence\": \"\\\"They have such things as video tapes , coaching s...\", \"has_coref_type_pronoun\": \"1.0\", \"incoherent_first_sentence\": \"\\\"For hockey resouces , please contact the ODCO .\\\"...\", \"incoherent_second_sentence\": \"\\\"ODCO have such things as video tapes , coaching s...\", \"has_coref_type_nominal\": \"0.0\", \"coherent_first_sentence\": \"\\\"For hockey resouces , please contact the ODCO .\\\"...\"}", "columns": ["connective_string", "discourse_type", "coherent_second_sentence", "has_coref_type_pronoun", "incoherent_first_sentence", "incoherent_second_sentence", "has_coref_type_nominal", 
"coherent_first_sentence"], "columns_mapping": {"connective_string": "connective_string", "discourse_type": "discourse_type", "coherent_second_sentence": "coherent_second_sentence", "has_coref_type_pronoun": "has_coref_type_pronoun", "incoherent_first_sentence": "incoherent_first_sentence", "incoherent_second_sentence": "incoherent_second_sentence", "has_coref_type_nominal": "has_coref_type_nominal", "coherent_first_sentence": "coherent_first_sentence"}, "dataset_description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "dataset_name": "discofuse"}, "discofuse-wikipedia": {"config_name": "discofuse-wikipedia", "sample_row": "{\"connective_string\": \"\\\"\\\"\", \"discourse_type\": \"\\\"PAIR_ANAPHORA\\\"\", \"coherent_second_sentence\": \"\\\"It is located in Nodaway Township .\\\"\", \"has_coref_type_pronoun\": \"1.0\", \"incoherent_first_sentence\": \"\\\"Clarinda is a city in and the county seat of Page...\", \"incoherent_second_sentence\": \"\\\"Clarinda is located in Nodaway Township .\\\"\", \"has_coref_type_nominal\": \"0.0\", \"coherent_first_sentence\": \"\\\"Clarinda is a city in and the county seat of Page...\"}", "columns": ["connective_string", "discourse_type", "coherent_second_sentence", "has_coref_type_pronoun", "incoherent_first_sentence", "incoherent_second_sentence", "has_coref_type_nominal", "coherent_first_sentence"], "columns_mapping": {"connective_string": "connective_string", "discourse_type": "discourse_type", "coherent_second_sentence": "coherent_second_sentence", "has_coref_type_pronoun": "has_coref_type_pronoun", "incoherent_first_sentence": "incoherent_first_sentence", "incoherent_second_sentence": "incoherent_second_sentence", "has_coref_type_nominal": "has_coref_type_nominal", "coherent_first_sentence": "coherent_first_sentence"}, "dataset_description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "dataset_name": "discofuse"}}, "tags": 
["task_categories:text2text-generation", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "sentence-fusion"], "is_gated": false}, "disfl_qa": {"dataset_name": "disfl_qa", "description": "Disfl-QA is a targeted dataset for contextual disfluencies in an information seeking setting,\nnamely question answering over Wikipedia passages. Disfl-QA builds upon the SQuAD-v2 (Rajpurkar et al., 2018)\ndataset, where each question in the dev set is annotated to add a contextual disfluency using the paragraph as\na source of distractors.\n\nThe final dataset consists of ~12k (disfluent question, answer) pairs. Over 90% of the disfluencies are\ncorrections or restarts, making it a much harder test set for disfluency correction. Disfl-QA aims to fill a\nmajor gap between speech and NLP research community. We hope the dataset can serve as a benchmark dataset for\ntesting robustness of models against disfluent inputs.\n\nOur expriments reveal that the state-of-the-art models are brittle when subjected to disfluent inputs from\nDisfl-QA. 
Detailed experiments and analyses can be found in our paper.", "downloads": 310, "configs": {"default": {"config_name": "default", "sample_row": "{\"squad_v2_id\": \"\\\"5a5918ff3e1742001a15cf7e\\\"\", \"original question\": \"\\\"What do unstable isotope studies indicate?\\\"\", \"disfluent question\": \"\\\"What do petrologists no what do unstable isotope ...\", \"title\": \"\\\"Geology\\\"\", \"context\": \"\\\"In addition to identifying rocks in the field, pe...\", \"answers.text\": \"[]\", \"answers.answer_start\": \"[]\"}", "columns": ["squad_v2_id", "original question", "disfluent question", "title", "context", "answers_text", "answers_answer_start"], "columns_mapping": {"squad_v2_id": "squad_v2_id", "original question": "original question", "disfluent question": "disfluent question", "title": "title", "context": "context", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "Disfl-QA is a targeted dataset for contextual disfluencies in an information seeking setting,\nnamely question answering over Wikipedia passages. Disfl-QA builds upon the SQuAD-v2 (Rajpurkar et al., 2018)\ndataset, where each question in the dev set is annotated to add a contextual disfluency using the paragraph as\na source of distractors.\n\nThe final dataset consists of ~12k (disfluent question, answer) pairs. Over 90% of the disfluencies are\ncorrections or restarts, making it a much harder test set for disfluency correction. Disfl-QA aims to fill a\nmajor gap between speech and NLP research community. We hope the dataset can serve as a benchmark dataset for\ntesting robustness of models against disfluent inputs.\n\nOur expriments reveal that the state-of-the-art models are brittle when subjected to disfluent inputs from\nDisfl-QA. 
Detailed experiments and analyses can be found in our paper.\n", "dataset_name": "disfl_qa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "task_ids:open-domain-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "dream": {"dataset_name": "dream", "description": "DREAM is a multiple-choice Dialogue-based REAding comprehension exaMination dataset. In contrast to existing reading comprehension datasets, DREAM is the first to focus on in-depth multi-turn multi-party dialogue understanding.", "downloads": 7604, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"0\", \"dialogue_id\": \"\\\"5-510\\\"\", \"dialogue\": \"[\\\"M: I am considering dropping my dancing class. I...\", \"question\": \"\\\"What does the man suggest the woman do?\\\"\", \"choice\": \"[\\\"Consult her dancing teacher.\\\", \\\"Take a more inte...\", \"answer\": \"\\\"Continue her dancing class.\\\"\"}", "columns": ["id", "dialogue_id", "dialogue", "question", "choice", "answer"], "columns_mapping": {"id": "id", "dialogue_id": "dialogue_id", "dialogue": "dialogue", "question": "question", "choice": "choice", "answer": "answer"}, "dataset_description": "DREAM is a multiple-choice Dialogue-based REAding comprehension exaMination dataset. In contrast to existing reading comprehension datasets, DREAM is the first to focus on in-depth multi-turn multi-party dialogue understanding.\n", "dataset_name": "dream"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "drop": {"dataset_name": "drop", "description": "DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs.\n. 
DROP is a crowdsourced, adversarially-created, 96k-question benchmark, in which a system must resolve references in a\nquestion, perhaps to multiple input positions, and perform discrete operations over them (such as addition, counting, or\n sorting). These operations require a much more comprehensive understanding of the content of paragraphs than what was\n necessary for prior datasets.", "downloads": 2571, "configs": {"default": {"config_name": "default", "sample_row": "{\"section_id\": \"\\\"nfl_2201\\\"\", \"query_id\": \"\\\"f16c0ee7-f131-4a8b-a6ac-4d275ea68066\\\"\", \"passage\": \"\\\"To start the season, the Lions traveled south to ...\", \"question\": \"\\\"How many points did the buccaneers need to tie in...\", \"answers_spans.spans\": \"[\\\"3\\\"]\", \"answers_spans.types\": \"[\\\"number\\\"]\"}", "columns": ["section_id", "query_id", "passage", "question", "answers_spans_spans", "answers_spans_types"], "columns_mapping": {"section_id": "section_id", "query_id": "query_id", "passage": "passage", "question": "question", "answers_spans.spans": "answers_spans_spans", "answers_spans.types": "answers_spans_types"}, "dataset_description": "DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs.\n. DROP is a crowdsourced, adversarially-created, 96k-question benchmark, in which a system must resolve references in a\nquestion, perhaps to multiple input positions, and perform discrete operations over them (such as addition, counting, or\n sorting). 
These operations require a much more comprehensive understanding of the content of paragraphs than what was\n necessary for prior datasets.\n", "dataset_name": "drop"}}, "tags": ["task_categories:question-answering", "task_categories:text2text-generation", "task_ids:extractive-qa", "task_ids:abstractive-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "dutch_social": {"dataset_name": "dutch_social", "description": "The dataset contains around 271,342 tweets. The tweets are filtered via the official Twitter API to\ncontain tweets in Dutch language or by users who have specified their location information within Netherlands\ngeographical boundaries. Using natural language processing we have classified the tweets for their HISCO codes.\nIf the user has provided their location within Dutch boundaries, we have also classified them to their respective\nprovinces The objective of this dataset is to make research data available publicly in a FAIR (Findable, Accessible,\nInteroperable, Reusable) way. Twitter's Terms of Service Licensed under Attribution-NonCommercial 4.0 International\n(CC BY-NC 4.0) (2020-10-27)", "downloads": 319, "configs": {"dutch_social": {"config_name": "dutch_social", "sample_row": "{\"full_text\": \"\\\"Maar , er iets nuttigs mee doen ? 
Zie jij 'm vert...\", \"text_translation\": \"\\\"However, there is something useful to do with it?...\", \"screen_name\": \"\\\"RonaldMeeuwis\\\"\", \"description\": \"\\\"None\\\"\", \"desc_translation\": \"\\\"None\\\"\", \"location\": \"\\\"None\\\"\", \"weekofyear\": \"21\", \"weekday\": \"3\", \"month\": \"5\", \"year\": \"2020\", \"day\": \"21\", \"point_info\": \"\\\"\\\"\", \"point\": \"\\\"None\\\"\", \"latitude\": \"0.0\", \"longitude\": \"0.0\", \"altitude\": \"0.0\", \"province\": \"\\\"False\\\"\", \"hisco_standard\": \"\\\"None\\\"\", \"hisco_code\": \"\\\"None\\\"\", \"industry\": \"false\", \"sentiment_pattern\": \"0.0\", \"subjective_pattern\": \"0.0\", \"label\": \"1\"}", "columns": ["full_text", "text_translation", "screen_name", "description", "desc_translation", "location", "weekofyear", "weekday", "month", "year", "day", "point_info", "point", "latitude", "longitude", "altitude", "province", "hisco_standard", "hisco_code", "industry", "sentiment_pattern", "subjective_pattern", "label"], "columns_mapping": {"full_text": "full_text", "text_translation": "text_translation", "screen_name": "screen_name", "description": "description", "desc_translation": "desc_translation", "location": "location", "weekofyear": "weekofyear", "weekday": "weekday", "month": "month", "year": "year", "day": "day", "point_info": "point_info", "point": "point", "latitude": "latitude", "longitude": "longitude", "altitude": "altitude", "province": "province", "hisco_standard": "hisco_standard", "hisco_code": "hisco_code", "industry": "industry", "sentiment_pattern": "sentiment_pattern", "subjective_pattern": "subjective_pattern", "label": "label"}, "dataset_description": "The dataset contains around 271,342 tweets. The tweets are filtered via the official Twitter API to\ncontain tweets in Dutch language or by users who have specified their location information within Netherlands\ngeographical boundaries. 
Using natural language processing we have classified the tweets for their HISCO codes.\nIf the user has provided their location within Dutch boundaries, we have also classified them to their respective\nprovinces The objective of this dataset is to make research data available publicly in a FAIR (Findable, Accessible,\nInteroperable, Reusable) way. Twitter's Terms of Service Licensed under Attribution-NonCommercial 4.0 International\n(CC BY-NC 4.0) (2020-10-27)\n", "dataset_name": "dutch_social"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "task_ids:multi-label-classification", "annotations_creators:machine-generated", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:nl"], "is_gated": false}, "dyk": {"dataset_name": "dyk", "description": "The Did You Know (pol. Czy wiesz?) dataset consists of human-annotated question-answer pairs. The task is to predict if the answer is correct. We chose the negatives which have the largest token overlap with a question.", "downloads": 382, "configs": {"default": {"config_name": "default", "sample_row": "{\"q_id\": \"\\\"czywiesz4068\\\"\", \"question\": \"\\\"z jakiego powodu zwo\\\\u0142ano synod w Whitby?\\\"\", \"answer\": \"\\\"W\\\\u015br\\\\u00f3d mnich\\\\u00f3w i mniszek mieszkaj\\\\u...\", \"target\": \"0\"}", "columns": ["q_id", "question", "answer", "target"], "columns_mapping": {"q_id": "q_id", "question": "question", "answer": "answer", "target": "target"}, "dataset_description": "The Did You Know (pol. Czy wiesz?) dataset consists of human-annotated question-answer pairs. The task is to predict if the answer is correct. 
We chose the negatives which have the largest token overlap with a question.\n", "dataset_name": "dyk"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl"], "is_gated": false}, "e2e_nlg": {"dataset_name": "e2e_nlg", "description": "The E2E dataset is used for training end-to-end, data-driven natural language generation systems in the restaurant domain, which is ten times bigger than existing, frequently used datasets in this area.\nThe E2E dataset poses new challenges:\n(1) its human reference texts show more lexical richness and syntactic variation, including discourse phenomena;\n(2) generating from this set requires content selection. As such, learning from this dataset promises more natural, varied and less template-like system utterances.\n\nE2E is released in the following paper where you can find more details and baseline results:\nhttps://arxiv.org/abs/1706.09254", "downloads": 1908, "configs": {"default": {"config_name": "default", "sample_row": "{\"meaning_representation\": \"\\\"name[The Vaults], eatType[pub], priceRange[more t...\", \"human_reference\": \"\\\"The Vaults pub near Caf\\\\u00e9 Adriatic has a 5 st...\"}", "columns": ["meaning_representation", "human_reference"], "columns_mapping": {"meaning_representation": "meaning_representation", "human_reference": "human_reference"}, "dataset_description": "The E2E dataset is used for training end-to-end, data-driven natural language generation systems in the restaurant domain, which is ten times bigger than existing, frequently used datasets in this area.\nThe E2E dataset poses new challenges:\n(1) its human reference texts show more lexical richness and syntactic variation, including discourse phenomena;\n(2) generating from this set requires content selection. 
As such, learning from this dataset promises more natural, varied and less template-like system utterances.\n\nE2E is released in the following paper where you can find more details and baseline results:\nhttps://arxiv.org/abs/1706.09254\n", "dataset_name": "e2e_nlg"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "meaning-representation-to-text"], "is_gated": false}, "e2e_nlg_cleaned": {"dataset_name": "e2e_nlg_cleaned", "description": "An update release of E2E NLG Challenge data with cleaned MRs and scripts, accompanying the following paper:\n\nOnd\u0159ej Du\u0161ek, David M. Howcroft, and Verena Rieser (2019): Semantic Noise Matters for Neural Natural Language Generation. In INLG, Tokyo, Japan.", "downloads": 830, "configs": {"default": {"config_name": "default", "sample_row": "{\"meaning_representation\": \"\\\"name[The Eagle], eatType[coffee shop], food[Japan...\", \"human_reference\": \"\\\"The Eagle is a low rated coffee shop near Burger ...\"}", "columns": ["meaning_representation", "human_reference"], "columns_mapping": {"meaning_representation": "meaning_representation", "human_reference": "human_reference"}, "dataset_description": "An update release of E2E NLG Challenge data with cleaned MRs and scripts, accompanying the following paper:\n\nOnd\u0159ej Du\u0161ek, David M. Howcroft, and Verena Rieser (2019): Semantic Noise Matters for Neural Natural Language Generation. 
In INLG, Tokyo, Japan.\n", "dataset_name": "e2e_nlg_cleaned"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "meaning-representation-to-text"], "is_gated": false}, "ecb": {"dataset_name": "ecb", "description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M", "downloads": 991, "configs": {"de-fr": {"config_name": "de-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Navigation Path : Home > The European Central ...\", \"translation.fr\": \"\\\"Navigation Path : Home > The European Central ...\"}", "columns": ["id", "translation_de", "translation_fr"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.fr": "translation_fr"}, "dataset_description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M\n", "dataset_name": "ecb"}, "cs-en": {"config_name": "cs-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.cs\": \"\\\"Navigation Path : Home > The European Central ...\", \"translation.en\": \"\\\"Navigation Path : Home > The European Central ...\"}", "columns": ["id", "translation_cs", "translation_en"], "columns_mapping": {"id": "id", "translation.cs": "translation_cs", "translation.en": "translation_en"}, "dataset_description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 
757.37M\ntotal number of sentence fragments: 30.55M\n", "dataset_name": "ecb"}, "el-it": {"config_name": "el-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"EL\\\"\", \"translation.it\": \"\\\"IT\\\"\"}", "columns": ["id", "translation_el", "translation_it"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.it": "translation_it"}, "dataset_description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M\n", "dataset_name": "ecb"}, "en-nl": {"config_name": "en-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"This message is formulated in collaboration with ...\", \"translation.nl\": \"\\\"Bijgaand bericht is opgesteld in overleg met Chri...\"}", "columns": ["id", "translation_en", "translation_nl"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.nl": "translation_nl"}, "dataset_description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M\n", "dataset_name": "ecb"}, "fi-pl": {"config_name": "fi-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"Py\\\\u00f6ristyksist\\\\u00e4 johtuen yhteenlaskut eiv...\", \"translation.pl\": \"\\\"Poszczeg\\\\u00f3lne pozycje mog\\\\u0105 nie sumowa\\\\u0...\"}", "columns": ["id", "translation_fi", "translation_pl"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.pl": "translation_pl"}, "dataset_description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto 
Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M\n", "dataset_name": "ecb"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:hu", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:pl", "language:pt", "language:sk", "language:sl"], "is_gated": false}, "ecthr_cases": {"dataset_name": "ecthr_cases", "description": "The ECtHR Cases dataset is designed for experimentation of neural judgment prediction and rationale extraction considering ECtHR cases.", "downloads": 1024, "configs": {"alleged-violation-prediction": {"config_name": "alleged-violation-prediction", "sample_row": "{\"facts\": \"[\\\"11. At the beginning of the events relevant to ...\", \"labels\": \"[\\\"13\\\", \\\"8\\\"]\", \"silver_rationales\": \"[1, 13, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31...\", \"gold_rationales\": \"[]\"}", "columns": ["facts", "labels", "silver_rationales", "gold_rationales"], "columns_mapping": {"facts": "facts", "labels": "labels", "silver_rationales": "silver_rationales", "gold_rationales": "gold_rationales"}, "dataset_description": "The ECtHR Cases dataset is designed for experimentation of neural judgment prediction and rationale extraction considering ECtHR cases.\n", "dataset_name": "ecthr_cases"}, "violation-prediction": {"config_name": "violation-prediction", "sample_row": "{\"facts\": \"[\\\"11. 
At the beginning of the events relevant to ...\", \"labels\": \"[\\\"8\\\"]\", \"silver_rationales\": \"[1, 13, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31...\"}", "columns": ["facts", "labels", "silver_rationales"], "columns_mapping": {"facts": "facts", "labels": "labels", "silver_rationales": "silver_rationales"}, "dataset_description": "The ECtHR Cases dataset is designed for experimentation of neural judgment prediction and rationale extraction considering ECtHR cases.\n", "dataset_name": "ecthr_cases"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "rationale-extraction", "legal-judgment-prediction"], "is_gated": false}, "ehealth_kd": {"dataset_name": "ehealth_kd", "description": "Dataset of the eHealth Knowledge Discovery Challenge at IberLEF 2020. It is designed for\nthe identification of semantic entities and relations in Spanish health documents.", "downloads": 290, "configs": {"ehealth_kd": {"config_name": "ehealth_kd", "sample_row": "{\"sentence\": \"\\\"En la leucemia linfoc\\\\u00edtica cr\\\\u00f3nica, hay...\", \"entities\": \"[{\\\"ent_id\\\": \\\"T1\\\", \\\"ent_text\\\": \\\"leucemia linfoc\\\\u00...\", \"relations\": \"[{\\\"rel_id\\\": \\\"R0\\\", \\\"rel_label\\\": 0, \\\"arg1\\\": \\\"T2\\\", \\\"a...\"}", "columns": ["sentence", "entities", "relations"], "columns_mapping": {"sentence": "sentence", "entities": "entities", "relations": "relations"}, "dataset_description": "Dataset of the eHealth Knowledge Discovery Challenge at IberLEF 2020. 
It is designed for\nthe identification of semantic entities and relations in Spanish health documents.\n", "dataset_name": "ehealth_kd"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:es", "relation-prediction"], "is_gated": false}, "eli5_category": {"dataset_name": "eli5_category", "description": "The ELI5-Category dataset is a smaller but newer and categorized version of the original ELI5 dataset. After 2017, a tagging system was introduced to this subreddit so that the questions can be categorized into different topics according to their tags. Since the training and validation set is built by questions in different topics, the dataset is expected to alleviate the train/validation overlapping issue in the original ELI5 dataset.", "downloads": 411, "configs": {"default": {"config_name": "default", "sample_row": "{\"q_id\": \"\\\"5lchat\\\"\", \"title\": \"\\\"Why there was a 'leap second' added to the end of...\", \"selftext\": \"\\\"\\\"\", \"category\": \"\\\"Other\\\"\", \"subreddit\": \"\\\"explainlikeimfive\\\"\", \"answers.a_id\": \"[\\\"dbuoyxl\\\", \\\"dbur7gi\\\", \\\"dbuotht\\\"]\", \"answers.text\": \"[\\\"the rotation of the earth is not a constant. 
in ...\", \"answers.score\": \"[44, 5, 4]\", \"answers.text_urls\": \"[[], [\\\"http://adminhacks.com/leap-second-bugs.html...\", \"title_urls\": \"[\\\"url\\\"]\", \"selftext_urls\": \"[\\\"url\\\"]\"}", "columns": ["q_id", "title", "selftext", "category", "subreddit", "answers_a_id", "answers_text", "answers_score", "answers_text_urls", "title_urls", "selftext_urls"], "columns_mapping": {"q_id": "q_id", "title": "title", "selftext": "selftext", "category": "category", "subreddit": "subreddit", "answers.a_id": "answers_a_id", "answers.text": "answers_text", "answers.score": "answers_score", "answers.text_urls": "answers_text_urls", "title_urls": "title_urls", "selftext_urls": "selftext_urls"}, "dataset_description": "The ELI5-Category dataset is a smaller but newer and categorized version of the original ELI5 dataset. After 2017, a tagging system was introduced to this subreddit so that the questions can be categorized into different topics according to their tags. Since the training and validation set is built by questions in different topics, the dataset is expected to alleviate the train/validation overlapping issue in the original ELI5 dataset.\n", "dataset_name": "eli5_category"}}, "tags": ["task_categories:text2text-generation", "task_ids:abstractive-qa", "task_ids:open-domain-abstractive-qa", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:extended|eli5", "language:en"], "is_gated": false}, "emea": {"dataset_name": "emea", "description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. 
There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M", "downloads": 858, "configs": {"bg-el": {"config_name": "bg-el", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"European Medicines Agency\\\"\", \"translation.el\": \"\\\"European Medicines Agency\\\"\"}", "columns": ["id", "translation_bg", "translation_el"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.el": "translation_el"}, "dataset_description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M\n", "dataset_name": "emea"}, "cs-et": {"config_name": "cs-et", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.cs\": \"\\\"European Medicines Agency\\\"\", \"translation.et\": \"\\\"European Medicines Agency\\\"\"}", "columns": ["id", "translation_cs", "translation_et"], "columns_mapping": {"id": "id", "translation.cs": "translation_cs", "translation.et": "translation_et"}, "dataset_description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. 
There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M\n", "dataset_name": "emea"}, "de-mt": {"config_name": "de-mt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"European Medicines Agency\\\"\", \"translation.mt\": \"\\\"European Medicines Agency\\\"\"}", "columns": ["id", "translation_de", "translation_mt"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.mt": "translation_mt"}, "dataset_description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M\n", "dataset_name": "emea"}, "fr-sk": {"config_name": "fr-sk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"European Medicines Agency\\\"\", \"translation.sk\": \"\\\"European Medicines Agency\\\"\"}", "columns": ["id", "translation_fr", "translation_sk"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.sk": "translation_sk"}, "dataset_description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. 
There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M\n", "dataset_name": "emea"}, "es-lt": {"config_name": "es-lt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"European Medicines Agency\\\"\", \"translation.lt\": \"\\\"European Medicines Agency\\\"\"}", "columns": ["id", "translation_es", "translation_lt"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.lt": "translation_lt"}, "dataset_description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M\n", "dataset_name": "emea"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:hu", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:pl", "language:pt", "language:ro", "language:sk", "language:sl", "language:sv"], "is_gated": false}, "emo": {"dataset_name": "emo", "description": "In this dataset, given a textual dialogue i.e. 
an utterance along with two previous turns of context, the goal was to infer the underlying emotion of the utterance by choosing from four emotion classes - Happy, Sad, Angry and Others.", "downloads": 657, "configs": {"emo2019": {"config_name": "emo2019", "sample_row": "{\"text\": \"\\\"don't worry i'm girl hmm how do i know if you ar...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "In this dataset, given a textual dialogue i.e. an utterance along with two previous turns of context, the goal was to infer the underlying emotion of the utterance by choosing from four emotion classes - Happy, Sad, Angry and Others.\n", "dataset_name": "emo"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "dair-ai/emotion": {"dataset_name": "dair-ai/emotion", "description": "Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.", "downloads": 22353, "configs": {"split": {"config_name": "split", "sample_row": "{\"text\": \"\\\"i didnt feel humiliated\\\"\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. 
For more detailed information please refer to the paper.\n", "dataset_name": "dair-ai/emotion"}, "unsplit": {"config_name": "unsplit", "sample_row": "{\"text\": \"\\\"i feel awful about it too because it s my job to ...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.\n", "dataset_name": "dair-ai/emotion"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "emotion-classification"], "is_gated": false}, "emotone_ar": {"dataset_name": "emotone_ar", "description": "Dataset of 10065 tweets in Arabic for Emotion detection in Arabic text", "downloads": 399, "configs": {"default": {"config_name": "default", "sample_row": "{\"tweet\": \"\\\"\\\\u0627\\\\u0644\\\\u0627\\\\u0648\\\\u0644\\\\u064a\\\\u0645\\\\u0628\\\\...\", \"label\": \"0\"}", "columns": ["tweet", "label"], "columns_mapping": {"tweet": "tweet", "label": "label"}, "dataset_description": "Dataset of 10065 tweets in Arabic for Emotion detection in Arabic text", "dataset_name": "emotone_ar"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "empathetic_dialogues": {"dataset_name": "empathetic_dialogues", "description": "PyTorch original implementation of Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset", "downloads": 1076, "configs": {"default": {"config_name": "default", "sample_row": "{\"conv_id\": \"\\\"hit:0_conv:1\\\"\", \"utterance_idx\": \"1\", \"context\": \"\\\"sentimental\\\"\", \"prompt\": \"\\\"I 
remember going to the fireworks with my best fr...\", \"speaker_idx\": \"1\", \"utterance\": \"\\\"I remember going to see the fireworks with my bes...\", \"selfeval\": \"\\\"5|5|5_2|2|5\\\"\", \"tags\": \"\\\"\\\"\"}", "columns": ["conv_id", "utterance_idx", "context", "prompt", "speaker_idx", "utterance", "selfeval", "tags"], "columns_mapping": {"conv_id": "conv_id", "utterance_idx": "utterance_idx", "context": "context", "prompt": "prompt", "speaker_idx": "speaker_idx", "utterance": "utterance", "selfeval": "selfeval", "tags": "tags"}, "dataset_description": "PyTorch original implementation of Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset\n", "dataset_name": "empathetic_dialogues"}}, "tags": ["task_categories:conversational", "task_categories:question-answering", "task_ids:dialogue-generation", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "eraser_multi_rc": {"dataset_name": "eraser_multi_rc", "description": "Eraser Multi RC is a dataset for queries over multi-line passages, along with\nanswers and a rationalte. Each example in this dataset has the following 5 parts\n1. A Mutli-line Passage\n2. A Query about the passage\n3. An Answer to the query\n4. A Classification as to whether the answer is right or wrong\n5. An Explanation justifying the classification", "downloads": 694, "configs": {"default": {"config_name": "default", "sample_row": "{\"passage\": \"\\\"As his car slid downtown on Tuesday morning the m...\", \"query_and_answer\": \"\\\"How does Mr. Thorndike act upon his impulse ? 
|| ...\", \"label\": \"0\", \"evidences\": \"[\\\"It was these same impulses , leading so invariab...\"}", "columns": ["passage", "query_and_answer", "label", "evidences"], "columns_mapping": {"passage": "passage", "query_and_answer": "query_and_answer", "label": "label", "evidences": "evidences"}, "dataset_description": "\nEraser Multi RC is a dataset for queries over multi-line passages, along with\nanswers and a rationalte. Each example in this dataset has the following 5 parts\n1. A Mutli-line Passage\n2. A Query about the passage\n3. An Answer to the query\n4. A Classification as to whether the answer is right or wrong\n5. An Explanation justifying the classification\n", "dataset_name": "eraser_multi_rc"}}, "tags": ["task_categories:multiple-choice", "task_ids:multiple-choice-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "eth_py150_open": {"dataset_name": "eth_py150_open", "description": "A redistributable subset of the ETH Py150 corpus, introduced in the ICML 2020 paper 'Learning and Evaluating Contextual Embedding of Source Code'", "downloads": 296, "configs": {"eth_py150_open": {"config_name": "eth_py150_open", "sample_row": "{\"filepath\": \"\\\"05bit/django-smarter/example/example/settings.py\\\"...\", \"license\": \"\\\"bsd-3-clause\\\"\"}", "columns": ["filepath", "license"], "columns_mapping": {"filepath": "filepath", "license": "license"}, "dataset_description": "A redistributable subset of the ETH Py150 corpus, introduced in the ICML 2020 paper 'Learning and Evaluating Contextual Embedding of Source Code'\n", "dataset_name": "eth_py150_open"}}, "tags": ["task_categories:other", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en", "contextual-embeddings"], "is_gated": false}, "ethos": {"dataset_name": "ethos", "description": "ETHOS: onlinE haTe speecH detectiOn dataSet. 
This repository contains a dataset for hate speech\ndetection on social media platforms, called Ethos. There are two variations of the dataset:\n\nEthos_Dataset_Binary: contains 998 comments in the dataset alongside with a label\nabout hate speech presence or absence. 565 of them do not contain hate speech,\nwhile the rest of them, 433, contain.\n\nEthos_Dataset_Multi_Label: which contains 8 labels for the 433 comments with hate speech content.\nThese labels are violence (if it incites (1) or not (0) violence), directed_vs_general (if it is\ndirected to a person (1) or a group (0)), and 6 labels about the category of hate speech like,\ngender, race, national_origin, disability, religion and sexual_orientation.", "downloads": 7060, "configs": {"binary": {"config_name": "binary", "sample_row": "{\"text\": \"\\\"You should know women's sports are a joke\\\"\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "", "dataset_name": "ethos"}, "multilabel": {"config_name": "multilabel", "sample_row": "{\"text\": \"\\\"You should know women's sports are a joke\\\"\", \"violence\": \"0\", \"directed_vs_generalized\": \"0\", \"gender\": \"1\", \"race\": \"0\", \"national_origin\": \"0\", \"disability\": \"0\", \"religion\": \"0\", \"sexual_orientation\": \"0\"}", "columns": ["text", "violence", "directed_vs_generalized", "gender", "race", "national_origin", "disability", "religion", "sexual_orientation"], "columns_mapping": {"text": "text", "violence": "violence", "directed_vs_generalized": "directed_vs_generalized", "gender": "gender", "race": "race", "national_origin": "national_origin", "disability": "disability", "religion": "religion", "sexual_orientation": "sexual_orientation"}, "dataset_description": "", "dataset_name": "ethos"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", 
"annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "Hate Speech Detection"], "is_gated": false}, "eu_regulatory_ir": {"dataset_name": "eu_regulatory_ir", "description": "EURegIR: Regulatory Compliance IR (EU/UK)", "downloads": 444, "configs": {"eu2uk": {"config_name": "eu2uk", "sample_row": "{\"document_id\": \"\\\"31977L0539\\\"\", \"publication_year\": \"\\\"1977\\\"\", \"text\": \"\\\"Council Directive 77/539/EEC of 28 June 1977 on t...\", \"relevant_documents\": \"[\\\"UKSI19801182\\\"]\"}", "columns": ["document_id", "publication_year", "text", "relevant_documents"], "columns_mapping": {"document_id": "document_id", "publication_year": "publication_year", "text": "text", "relevant_documents": "relevant_documents"}, "dataset_description": "EURegIR: Regulatory Compliance IR (EU/UK)\n", "dataset_name": "eu_regulatory_ir"}, "uk2eu": {"config_name": "uk2eu", "sample_row": "{\"document_id\": \"\\\"UKPGA19700044\\\"\", \"publication_year\": \"\\\"1970\\\"\", \"text\": \"\\\"Chronically Sick and Disabled Persons Act 1970\\\\n\\\\...\", \"relevant_documents\": \"[\\\"32001L0055\\\"]\"}", "columns": ["document_id", "publication_year", "text", "relevant_documents"], "columns_mapping": {"document_id": "document_id", "publication_year": "publication_year", "text": "text", "relevant_documents": "relevant_documents"}, "dataset_description": "EURegIR: Regulatory Compliance IR (EU/UK)\n", "dataset_name": "eu_regulatory_ir"}}, "tags": ["task_categories:text-retrieval", "task_ids:document-retrieval", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "document-to-document-retrieval"], "is_gated": false}, "eurlex": {"dataset_name": "eurlex", "description": "EURLEX57K contains 57k legislative documents in English from EUR-Lex portal, annotated with EUROVOC concepts.", "downloads": 417, "configs": {"eurlex57k": {"config_name": "eurlex57k", "sample_row": 
"{\"celex_id\": \"\\\"32014R0727\\\"\", \"title\": \"\\\"Commission Implementing Regulation (EU) No 727/20...\", \"text\": \"\\\"1.7.2014 EN Official Journal of the European Unio...\", \"eurovoc_concepts\": \"[\\\"1402\\\", \\\"2771\\\", \\\"3191\\\", \\\"5055\\\", \\\"519\\\", \\\"5969\\\", \\\"5...\"}", "columns": ["celex_id", "title", "text", "eurovoc_concepts"], "columns_mapping": {"celex_id": "celex_id", "title": "title", "text": "text", "eurovoc_concepts": "eurovoc_concepts"}, "dataset_description": "EURLEX57K contains 57k legislative documents in English from EUR-Lex portal, annotated with EUROVOC concepts.\n", "dataset_name": "eurlex"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "legal-topic-classification"], "is_gated": false}, "euronews": {"dataset_name": "euronews", "description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). 
O (outside of named entity) tags are used to mark tokens that are not a named entity.", "downloads": 965, "configs": {"fr-bnf": {"config_name": "fr-bnf", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Emmanuel\\\", \\\"DESOLES\\\", \\\"de\\\", \\\"LOU\\\", \\\"Directeur\\\", ...\", \"ner_tags\": \"[2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 0, 0, 6...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.\n", "dataset_name": "euronews"}, "nl-kb": {"config_name": "nl-kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Indien\\\", \\\"men\\\", \\\"Itali\\\\u00eb\\\", \\\"in\\\", \\\"zijn\\\", \\\"ge...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). 
To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.\n", "dataset_name": "euronews"}, "de-sbb": {"config_name": "de-sbb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Donnerstag\\\", \\\",\\\", \\\"1\\\", \\\".\\\", \\\"Januar\\\", \\\".\\\", \\\"Kam/...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.\n", "dataset_name": "euronews"}, "de-onb": {"config_name": "de-onb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"November\\\", \\\"Heute\\\", \\\"/\\\", \\\"als\\\", \\\"am\\\", \\\"Fest\\\", \\\"V...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 1...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. 
The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.\n", "dataset_name": "euronews"}, "de-lft": {"config_name": "de-lft", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Eintracht\\\", \\\",\\\", \\\"die\\\", \\\"nicht\\\", \\\"nur\\\", \\\"ideal\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.\n", "dataset_name": "euronews"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:de", "language:fr", "language:nl"], "is_gated": false}, "europa_eac_tm": {"dataset_name": "europa_eac_tm", "description": "In October 2012, the European Union's (EU) Directorate General for Education and Culture ( DG EAC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-six languages. 
This resource bears the name EAC Translation Memory, short EAC-TM.\n\nEAC-TM covers up to 26 languages: 22 official languages of the EU (all except Irish) plus Icelandic, Croatian, Norwegian and Turkish. EAC-TM thus contains translations from English into the following 25 languages: Bulgarian, Czech, Danish, Dutch, Estonian, German, Greek, Finnish, French, Croatian, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish and Turkish.\n\nAll documents and sentences were originally written in English (source language is English) and then translated into the other languages. The texts were translated by staff of the National Agencies of the Lifelong Learning and Youth in Action programmes. They are typically professionals in the field of education/youth and EU programmes. They are thus not professional translators, but they are normally native speakers of the target language.", "downloads": 584, "configs": {"en2bg": {"config_name": "en2bg", "sample_row": "{\"translation.en\": \"\\\"APPLICANT\\\"\", \"translation.bg\": \"\\\"\\\\u041a\\\\u0410\\\\u041d\\\\u0414\\\\u0418\\\\u0414\\\\u0410\\\\u0422\\\"...\", \"sentence_type\": \"0\"}", "columns": ["translation_en", "translation_bg", "sentence_type"], "columns_mapping": {"translation.en": "translation_en", "translation.bg": "translation_bg", "sentence_type": "sentence_type"}, "dataset_description": "In October 2012, the European Union's (EU) Directorate General for Education and Culture ( DG EAC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-six languages. This resource bears the name EAC Translation Memory, short EAC-TM.\n\nEAC-TM covers up to 26 languages: 22 official languages of the EU (all except Irish) plus Icelandic, Croatian, Norwegian and Turkish. 
EAC-TM thus contains translations from English into the following 25 languages: Bulgarian, Czech, Danish, Dutch, Estonian, German, Greek, Finnish, French, Croatian, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish and Turkish.\n\nAll documents and sentences were originally written in English (source language is English) and then translated into the other languages. The texts were translated by staff of the National Agencies of the Lifelong Learning and Youth in Action programmes. They are typically professionals in the field of education/youth and EU programmes. They are thus not professional translators, but they are normally native speakers of the target language.\n", "dataset_name": "europa_eac_tm"}, "en2es": {"config_name": "en2es", "sample_row": "{\"translation.en\": \"\\\"Nr. teachers/trainers\\\"\", \"translation.es\": \"\\\"N\\\\u00famero de profesores/formadores\\\"\", \"sentence_type\": \"0\"}", "columns": ["translation_en", "translation_es", "sentence_type"], "columns_mapping": {"translation.en": "translation_en", "translation.es": "translation_es", "sentence_type": "sentence_type"}, "dataset_description": "In October 2012, the European Union's (EU) Directorate General for Education and Culture ( DG EAC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-six languages. This resource bears the name EAC Translation Memory, short EAC-TM.\n\nEAC-TM covers up to 26 languages: 22 official languages of the EU (all except Irish) plus Icelandic, Croatian, Norwegian and Turkish. 
EAC-TM thus contains translations from English into the following 25 languages: Bulgarian, Czech, Danish, Dutch, Estonian, German, Greek, Finnish, French, Croatian, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish and Turkish.\n\nAll documents and sentences were originally written in English (source language is English) and then translated into the other languages. The texts were translated by staff of the National Agencies of the Lifelong Learning and Youth in Action programmes. They are typically professionals in the field of education/youth and EU programmes. They are thus not professional translators, but they are normally native speakers of the target language.\n", "dataset_name": "europa_eac_tm"}, "en2fr": {"config_name": "en2fr", "sample_row": "{\"translation.en\": \"\\\"Nr. teachers/trainers\\\"\", \"translation.fr\": \"\\\"Nb enseignants/formateurs\\\"\", \"sentence_type\": \"0\"}", "columns": ["translation_en", "translation_fr", "sentence_type"], "columns_mapping": {"translation.en": "translation_en", "translation.fr": "translation_fr", "sentence_type": "sentence_type"}, "dataset_description": "In October 2012, the European Union's (EU) Directorate General for Education and Culture ( DG EAC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-six languages. This resource bears the name EAC Translation Memory, short EAC-TM.\n\nEAC-TM covers up to 26 languages: 22 official languages of the EU (all except Irish) plus Icelandic, Croatian, Norwegian and Turkish. 
EAC-TM thus contains translations from English into the following 25 languages: Bulgarian, Czech, Danish, Dutch, Estonian, German, Greek, Finnish, French, Croatian, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish and Turkish.\n\nAll documents and sentences were originally written in English (source language is English) and then translated into the other languages. The texts were translated by staff of the National Agencies of the Lifelong Learning and Youth in Action programmes. They are typically professionals in the field of education/youth and EU programmes. They are thus not professional translators, but they are normally native speakers of the target language.\n", "dataset_name": "europa_eac_tm"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:hr", "language:hu", "language:is", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:no", "language:pl", "language:pt", "language:ro", "language:sk", "language:sl", "language:sv", "language:tr"], "is_gated": false}, "europa_ecdc_tm": {"dataset_name": "europa_ecdc_tm", "description": "In October 2012, the European Union (EU) agency 'European Centre for Disease Prevention and Control' (ECDC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-five languages. This resource bears the name EAC Translation Memory, short EAC-TM.\nECDC-TM covers 25 languages: the 23 official languages of the EU plus Norwegian (Norsk) and Icelandic. 
ECDC-TM was created by translating from English into the following 24 languages: Bulgarian, Czech, Danish, Dutch, English, Estonian, Gaelige (Irish), German, Greek, Finnish, French, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian (NOrsk), Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish and Swedish.\nAll documents and sentences were thus originally written in English. They were then translated into the other languages by professional translators from the Translation Centre CdT in Luxembourg.", "downloads": 571, "configs": {"en2bg": {"config_name": "en2bg", "sample_row": "{\"translation.en\": \"\\\"Vaccination against hepatitis C is not yet availa...\", \"translation.bg\": \"\\\"\\\\u0417\\\\u0430\\\\u0441\\\\u0435\\\\u0433\\\\u0430 \\\\u043d\\\\u044f...\"}", "columns": ["translation_en", "translation_bg"], "columns_mapping": {"translation.en": "translation_en", "translation.bg": "translation_bg"}, "dataset_description": "In October 2012, the European Union (EU) agency 'European Centre for Disease Prevention and Control' (ECDC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-five languages. This resource bears the name EAC Translation Memory, short EAC-TM.\nECDC-TM covers 25 languages: the 23 official languages of the EU plus Norwegian (Norsk) and Icelandic. ECDC-TM was created by translating from English into the following 24 languages: Bulgarian, Czech, Danish, Dutch, English, Estonian, Gaelige (Irish), German, Greek, Finnish, French, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian (NOrsk), Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish and Swedish.\nAll documents and sentences were thus originally written in English. 
They were then translated into the other languages by professional translators from the Translation Centre CdT in Luxembourg.", "dataset_name": "europa_ecdc_tm"}, "en2fr": {"config_name": "en2fr", "sample_row": "{\"translation.en\": \"\\\"Vaccination against hepatitis C is not yet availa...\", \"translation.fr\": \"\\\"Aucune vaccination contre l\\\\u2019h\\\\u00e9patite C ...\"}", "columns": ["translation_en", "translation_fr"], "columns_mapping": {"translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "In October 2012, the European Union (EU) agency 'European Centre for Disease Prevention and Control' (ECDC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-five languages. This resource bears the name EAC Translation Memory, short EAC-TM.\nECDC-TM covers 25 languages: the 23 official languages of the EU plus Norwegian (Norsk) and Icelandic. ECDC-TM was created by translating from English into the following 24 languages: Bulgarian, Czech, Danish, Dutch, English, Estonian, Gaelige (Irish), German, Greek, Finnish, French, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian (NOrsk), Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish and Swedish.\nAll documents and sentences were thus originally written in English. 
They were then translated into the other languages by professional translators from the Translation Centre CdT in Luxembourg.", "dataset_name": "europa_ecdc_tm"}, "en2sl": {"config_name": "en2sl", "sample_row": "{\"translation.en\": \"\\\"Vaccination against hepatitis C is not yet availa...\", \"translation.sl\": \"\\\"Cepiva proti hepatitisu C \\\\u0161e ni.\\\"\"}", "columns": ["translation_en", "translation_sl"], "columns_mapping": {"translation.en": "translation_en", "translation.sl": "translation_sl"}, "dataset_description": "In October 2012, the European Union (EU) agency 'European Centre for Disease Prevention and Control' (ECDC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-five languages. This resource bears the name EAC Translation Memory, short EAC-TM.\nECDC-TM covers 25 languages: the 23 official languages of the EU plus Norwegian (Norsk) and Icelandic. ECDC-TM was created by translating from English into the following 24 languages: Bulgarian, Czech, Danish, Dutch, English, Estonian, Gaelige (Irish), German, Greek, Finnish, French, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian (NOrsk), Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish and Swedish.\nAll documents and sentences were thus originally written in English. 
They were then translated into the other languages by professional translators from the Translation Centre CdT in Luxembourg.", "dataset_name": "europa_ecdc_tm"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:ga", "language:hu", "language:is", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:no", "language:pl", "language:pt", "language:ro", "language:sk", "language:sl", "language:sv"], "is_gated": false}, "europarl_bilingual": {"dataset_name": "europarl_bilingual", "description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.", "downloads": 990, "configs": {"bg-cs": {"config_name": "bg-cs", "sample_row": "{\"translation.bg\": \"\\\"\\\\u0421\\\\u044a\\\\u0441\\\\u0442\\\\u0430\\\\u0432 \\\\u043d\\\\u0430...\", \"translation.cs\": \"\\\"Slo\\\\u017een\\\\u00ed Parlamentu: viz z\\\\u00e1pis\\\"\"}", "columns": ["translation_bg", "translation_cs"], "columns_mapping": {"translation.bg": "translation_bg", "translation.cs": "translation_cs"}, "dataset_description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). 
The main intended use is to aid statistical machine translation research.\n", "dataset_name": "europarl_bilingual"}, "bg-da": {"config_name": "bg-da", "sample_row": "{\"translation.bg\": \"\\\"\\\\u0421\\\\u044a\\\\u0441\\\\u0442\\\\u0430\\\\u0432 \\\\u043d\\\\u0430...\", \"translation.da\": \"\\\"Parlamentets sammens\\\\u00e6tning: se protokollen\\\"...\"}", "columns": ["translation_bg", "translation_da"], "columns_mapping": {"translation.bg": "translation_bg", "translation.da": "translation_da"}, "dataset_description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.\n", "dataset_name": "europarl_bilingual"}, "bg-de": {"config_name": "bg-de", "sample_row": "{\"translation.bg\": \"\\\"\\\\u0421\\\\u044a\\\\u0441\\\\u0442\\\\u0430\\\\u0432 \\\\u043d\\\\u0430...\", \"translation.de\": \"\\\"Zusammensetzung des Parlaments: siehe Protokoll\\\"...\"}", "columns": ["translation_bg", "translation_de"], "columns_mapping": {"translation.bg": "translation_bg", "translation.de": "translation_de"}, "dataset_description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.\n", "dataset_name": "europarl_bilingual"}, "bg-el": {"config_name": "bg-el", "sample_row": "{\"translation.bg\": \"\\\"\\\\u0421\\\\u044a\\\\u0441\\\\u0442\\\\u0430\\\\u0432 \\\\u043d\\\\u0430...\", \"translation.el\": \"\\\"\\\\u03a3\\\\u03cd\\\\u03bd\\\\u03b8\\\\u03b5\\\\u03c3\\\\u03b7 \\\\u03c4...\"}", "columns": ["translation_bg", "translation_el"], "columns_mapping": {"translation.bg": "translation_bg", "translation.el": "translation_el"}, "dataset_description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). 
The main intended use is to aid statistical machine translation research.\n", "dataset_name": "europarl_bilingual"}, "bg-en": {"config_name": "bg-en", "sample_row": "{\"translation.bg\": \"\\\"\\\\u0421\\\\u044a\\\\u0441\\\\u0442\\\\u0430\\\\u0432 \\\\u043d\\\\u0430...\", \"translation.en\": \"\\\"Membership of Parliament: see Minutes\\\"\"}", "columns": ["translation_bg", "translation_en"], "columns_mapping": {"translation.bg": "translation_bg", "translation.en": "translation_en"}, "dataset_description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.\n", "dataset_name": "europarl_bilingual"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:hu", "language:it", "language:lt", "language:lv", "language:nl", "language:pl", "language:pt", "language:ro", "language:sk", "language:sl", "language:sv"], "is_gated": false}, "event2Mind": {"dataset_name": "event2Mind", "description": "In Event2Mind, we explore the task of understanding stereotypical intents and reactions to events. 
Through crowdsourcing, we create a large corpus with 25,000 events and free-form descriptions of their intents and reactions, both of the event's subject and (potentially implied) other participants.", "downloads": 316, "configs": {"default": {"config_name": "default", "sample_row": "{\"Source\": \"\\\"it_events\\\"\", \"Event\": \"\\\"It is PersonY's favorite color\\\"\", \"Xintent\": \"\\\"[\\\\\\\"none\\\\\\\"]\\\"\", \"Xemotion\": \"\\\"[\\\\\\\"none\\\\\\\"]\\\"\", \"Otheremotion\": \"\\\"[\\\\\\\"happy\\\\\\\"]\\\"\", \"Xsent\": \"\\\"\\\"\", \"Osent\": \"\\\"4.0\\\"\"}", "columns": ["Source", "Event", "Xintent", "Xemotion", "Otheremotion", "Xsent", "Osent"], "columns_mapping": {"Source": "Source", "Event": "Event", "Xintent": "Xintent", "Xemotion": "Xemotion", "Otheremotion": "Otheremotion", "Xsent": "Xsent", "Osent": "Osent"}, "dataset_description": "In Event2Mind, we explore the task of understanding stereotypical intents and reactions to events. Through crowdsourcing, we create a large corpus with 25,000 events and free-form descriptions of their intents and reactions, both of the event's subject and (potentially implied) other participants.\n", "dataset_name": "event2Mind"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "common-sense-inference"], "is_gated": false}, "factckbr": {"dataset_name": "factckbr", "description": "A dataset to study Fake News in Portuguese, presenting a supposedly false News along with their respective fact check and classification.\nThe data is collected from the ClaimReview, a structured data schema used by fact check agencies to share their results in search engines, enabling data collect in real time.\nThe FACTCK.BR dataset contains 1309 claims with its corresponding label.", "downloads": 292, "configs": {"default": {"config_name": "default", "sample_row": "{\"url\": 
\"\\\"https://aosfatos.org/noticias/governo-bolsonaro-n...\", \"author\": \"\\\"https:www.aosfatos.org\\\"\", \"date\": \"\\\"2019-07-22\\\"\", \"claim\": \"\\\"Espa\\\\u00e7o dedicado para os eleitores do Bolsona...\", \"review\": \"\\\"Publica\\\\u00e7\\\\u00f5es que circulam nas redes soci...\", \"title\": \"\\\"Governo Bolsonaro n\\\\u00e3o suspendeu distribui\\\\u0...\", \"rating\": \"1.0\", \"best_rating\": \"5.0\", \"label\": \"0\"}", "columns": ["url", "author", "date", "claim", "review", "title", "rating", "best_rating", "label"], "columns_mapping": {"url": "url", "author": "author", "date": "date", "claim": "claim", "review": "review", "title": "title", "rating": "rating", "best_rating": "best_rating", "label": "label"}, "dataset_description": "A dataset to study Fake News in Portuguese, presenting a supposedly false News along with their respective fact check and classification.\nThe data is collected from the ClaimReview, a structured data schema used by fact check agencies to share their results in search engines, enabling data collect in real time.\nThe FACTCK.BR dataset contains 1309 claims with its corresponding label.\n", "dataset_name": "factckbr"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt"], "is_gated": false}, "fake_news_english": {"dataset_name": "fake_news_english", "description": "Fake news has become a major societal issue and a technical challenge for social media companies to identify. This content is difficult to identify because the term \"fake news\" covers intentionally false, deceptive stories as well as factual errors, satire, and sometimes, stories that a person just does not like. Addressing the problem requires clear definitions and examples. 
In this work, we present a dataset of fake news and satire stories that are hand coded, verified, and, in the case of fake news, include rebutting stories. We also include a thematic content analysis of the articles, identifying major themes that include hyperbolic support or condemnation of a gure, conspiracy theories, racist themes, and discrediting of reliable sources. In addition to releasing this dataset for research use, we analyze it and show results based on language that are promising for classification purposes. Overall, our contribution of a dataset and initial analysis are designed to support future work by fake news researchers.", "downloads": 346, "configs": {"default": {"config_name": "default", "sample_row": "{\"article_number\": \"375\", \"url_of_article\": \"\\\"http://www.redflagnews.com/headlines-2016/cdc-pro...\", \"fake_or_satire\": \"1\", \"url_of_rebutting_article\": \"\\\"http://www.snopes.com/cdc-forced-vaccinations/\\\"...\"}", "columns": ["article_number", "url_of_article", "fake_or_satire", "url_of_rebutting_article"], "columns_mapping": {"article_number": "article_number", "url_of_article": "url_of_article", "fake_or_satire": "fake_or_satire", "url_of_rebutting_article": "url_of_rebutting_article"}, "dataset_description": "\nFake news has become a major societal issue and a technical challenge for social media companies to identify. This content is difficult to identify because the term \"fake news\" covers intentionally false, deceptive stories as well as factual errors, satire, and sometimes, stories that a person just does not like. Addressing the problem requires clear definitions and examples. In this work, we present a dataset of fake news and satire stories that are hand coded, verified, and, in the case of fake news, include rebutting stories. 
We also include a thematic content analysis of the articles, identifying major themes that include hyperbolic support or condemnation of a gure, conspiracy theories, racist themes, and discrediting of reliable sources. In addition to releasing this dataset for research use, we analyze it and show results based on language that are promising for classification purposes. Overall, our contribution of a dataset and initial analysis are designed to support future work by fake news researchers.\n", "dataset_name": "fake_news_english"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "fake_news_filipino": {"dataset_name": "fake_news_filipino", "description": " Low-Resource Fake News Detection Corpora in Filipino. The first of its kind. Contains 3,206 expertly-labeled news samples, half of which are real and half of which are fake.", "downloads": 396, "configs": {"default": {"config_name": "default", "sample_row": "{\"label\": \"0\", \"article\": \"\\\"Ayon sa TheWrap.com, naghain ng kaso si Krupa, 35...\"}", "columns": ["label", "article"], "columns_mapping": {"label": "label", "article": "article"}, "dataset_description": " Low-Resource Fake News Detection Corpora in Filipino. The first of its kind. 
Contains 3,206 expertly-labeled news samples, half of which are real and half of which are fake.\n", "dataset_name": "fake_news_filipino"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:tl"], "is_gated": false}, "financial_phrasebank": {"dataset_name": "financial_phrasebank", "description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. 
Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.", "downloads": 9147, "configs": {"sentences_allagree": {"config_name": "sentences_allagree", "sample_row": "{\"sentence\": \"\\\"According to Gran , the company has no plans to m...\", \"label\": \"1\"}", "columns": ["sentence", "label"], "columns_mapping": {"sentence": "sentence", "label": "label"}, "dataset_description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. 
whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.\n", "dataset_name": "financial_phrasebank"}, "sentences_75agree": {"config_name": "sentences_75agree", "sample_row": "{\"sentence\": \"\\\"According to Gran , the company has no plans to m...\", \"label\": \"1\"}", "columns": ["sentence", "label"], "columns_mapping": {"sentence": "sentence", "label": "label"}, "dataset_description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. 
To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. 
To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.\n", "dataset_name": "financial_phrasebank"}, "sentences_66agree": {"config_name": "sentences_66agree", "sample_row": "{\"sentence\": \"\\\"According to Gran , the company has no plans to m...\", \"label\": \"1\"}", "columns": ["sentence", "label"], "columns_mapping": {"sentence": "sentence", "label": "label"}, "dataset_description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. 
Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.\n", "dataset_name": "financial_phrasebank"}, "sentences_50agree": {"config_name": "sentences_50agree", "sample_row": "{\"sentence\": \"\\\"According to Gran , the company has no plans to m...\", \"label\": \"1\"}", "columns": ["sentence", "label"], "columns_mapping": {"sentence": "sentence", "label": "label"}, "dataset_description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. 
whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.\n", "dataset_name": "financial_phrasebank"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "finance"], "is_gated": false}, "finer": {"dataset_name": "finer", "description": "The directory data contains a corpus of Finnish technology related news articles with a manually prepared\nnamed entity annotation (digitoday.2014.csv). The text material was extracted from the archives of Digitoday,\na Finnish online technology news source (www.digitoday.fi). 
The corpus consists of 953 articles\n(193,742 word tokens) with six named entity classes (organization, location, person, product, event, and date).\nThe corpus is available for research purposes and can be readily used for development of NER systems for Finnish.", "downloads": 289, "configs": {"finer": {"config_name": "finer", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Imperiumi\\\", \\\"laajenee\\\", \\\":\\\", \\\"Maailman\\\", \\\"suurin...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0]\", \"nested_ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "nested_ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "nested_ner_tags": "nested_ner_tags"}, "dataset_description": "The directory data contains a corpus of Finnish technology related news articles with a manually prepared\nnamed entity annotation (digitoday.2014.csv). The text material was extracted from the archives of Digitoday,\na Finnish online technology news source (www.digitoday.fi). 
The corpus consists of 953 articles\n(193,742 word tokens) with six named entity classes (organization, location, person, product, event, and date).\nThe corpus is available for research purposes and can be readily used for development of NER systems for Finnish.\n", "dataset_name": "finer"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:fi"], "is_gated": false}, "freebase_qa": {"dataset_name": "freebase_qa", "description": "FreebaseQA is for open-domain factoid question answering (QA) tasks over structured knowledge bases, like Freebase The data set is generated by matching trivia-type question-answer pairs with subject-predicateobject triples in Freebase.", "downloads": 487, "configs": {"default": {"config_name": "default", "sample_row": "{\"Question-ID\": \"\\\"FreebaseQA-train-0\\\"\", \"RawQuestion\": \"\\\"What was Pierce Brosnan's first outing as 007?\\\"...\", \"ProcessedQuestion\": \"\\\"what was pierce brosnan's first outing as 007\\\"\", \"Parses.Parse-Id\": \"[\\\"FreebaseQA-train-0.P0\\\", \\\"FreebaseQA-train-0.P1\\\"]...\", \"Parses.PotentialTopicEntityMention\": \"[\\\"007\\\", \\\"pierce brosnan\\\"]\", \"Parses.TopicEntityName\": \"[\\\"james bond\\\", \\\"pierce brosnan\\\"]\", \"Parses.TopicEntityMid\": \"[\\\"m.0clpml\\\", \\\"m.018p4y\\\"]\", \"Parses.InferentialChain\": \"[\\\"film.film_character.portrayed_in_films..film.per...\", \"Parses.Answers\": \"[{\\\"AnswersMid\\\": [\\\"m.01npcx\\\"], \\\"AnswersName\\\": [[\\\"go...\"}", "columns": ["Question-ID", "RawQuestion", "ProcessedQuestion", "Parses_Parse-Id", "Parses_PotentialTopicEntityMention", "Parses_TopicEntityName", "Parses_TopicEntityMid", "Parses_InferentialChain", "Parses_Answers"], "columns_mapping": {"Question-ID": "Question-ID", "RawQuestion": "RawQuestion", "ProcessedQuestion": "ProcessedQuestion", "Parses.Parse-Id": 
"Parses_Parse-Id", "Parses.PotentialTopicEntityMention": "Parses_PotentialTopicEntityMention", "Parses.TopicEntityName": "Parses_TopicEntityName", "Parses.TopicEntityMid": "Parses_TopicEntityMid", "Parses.InferentialChain": "Parses_InferentialChain", "Parses.Answers": "Parses_Answers"}, "dataset_description": "FreebaseQA is for open-domain factoid question answering (QA) tasks over structured knowledge bases, like Freebase The data set is generated by matching trivia-type question-answer pairs with subject-predicateobject triples in Freebase.\n", "dataset_name": "freebase_qa"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|trivia_qa", "language:en"], "is_gated": false}, "gap": {"dataset_name": "gap", "description": "GAP is a gender-balanced dataset containing 8,908 coreference-labeled pairs of\n(ambiguous pronoun, antecedent name), sampled from Wikipedia and released by\nGoogle AI Language for the evaluation of coreference resolution in practical\napplications.", "downloads": 328, "configs": {"default": {"config_name": "default", "sample_row": "{\"ID\": \"\\\"development-1\\\"\", \"Text\": \"\\\"Zoe Telford -- played the police officer girlfrie...\", \"Pronoun\": \"\\\"her\\\"\", \"Pronoun-offset\": \"274\", \"A\": \"\\\"Cheryl Cassidy\\\"\", \"A-offset\": \"191\", \"A-coref\": \"true\", \"B\": \"\\\"Pauline\\\"\", \"B-offset\": \"207\", \"B-coref\": \"false\", \"URL\": \"\\\"http://en.wikipedia.org/wiki/List_of_Teachers_(UK...\"}", "columns": ["ID", "Text", "Pronoun", "Pronoun-offset", "A", "A-offset", "A-coref", "B", "B-offset", "B-coref", "URL"], "columns_mapping": {"ID": "ID", "Text": "Text", "Pronoun": "Pronoun", "Pronoun-offset": "Pronoun-offset", "A": "A", "A-offset": "A-offset", "A-coref": "A-coref", "B": "B", "B-offset": "B-offset", "B-coref": "B-coref", "URL": "URL"}, "dataset_description": "\nGAP is a gender-balanced dataset 
containing 8,908 coreference-labeled pairs of\n(ambiguous pronoun, antecedent name), sampled from Wikipedia and released by\nGoogle AI Language for the evaluation of coreference resolution in practical\napplications.\n", "dataset_name": "gap"}}, "tags": ["task_categories:token-classification", "task_ids:coreference-resolution", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "generics_kb": {"dataset_name": "generics_kb", "description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. 
We recommend you start with GenericsKB-Best.", "downloads": 945, "configs": {"generics_kb_best": {"config_name": "generics_kb_best", "sample_row": "{\"source\": \"\\\"Waterloo\\\"\", \"term\": \"\\\"aa battery\\\"\", \"quantifier_frequency\": \"\\\"\\\"\", \"quantifier_number\": \"\\\"\\\"\", \"generic_sentence\": \"\\\"AA batteries maintain the settings if the power e...\", \"score\": \"0.35092294216156006\"}", "columns": ["source", "term", "quantifier_frequency", "quantifier_number", "generic_sentence", "score"], "columns_mapping": {"source": "source", "term": "term", "quantifier_frequency": "quantifier_frequency", "quantifier_number": "quantifier_number", "generic_sentence": "generic_sentence", "score": "score"}, "dataset_description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. 
We recommend you start with GenericsKB-Best.\n", "dataset_name": "generics_kb"}, "generics_kb": {"config_name": "generics_kb", "sample_row": "{\"source\": \"\\\"Waterloo\\\"\", \"term\": \"\\\"a.active replication\\\"\", \"quantifier_frequency\": \"\\\"\\\"\", \"quantifier_number\": \"\\\"\\\"\", \"generic_sentence\": \"\\\"A.Active replication requires all members to exec...\", \"score\": \"0.024261321872472763\"}", "columns": ["source", "term", "quantifier_frequency", "quantifier_number", "generic_sentence", "score"], "columns_mapping": {"source": "source", "term": "term", "quantifier_frequency": "quantifier_frequency", "quantifier_number": "quantifier_number", "generic_sentence": "generic_sentence", "score": "score"}, "dataset_description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. 
We recommend you start with GenericsKB-Best.\n", "dataset_name": "generics_kb"}, "generics_kb_simplewiki": {"config_name": "generics_kb_simplewiki", "sample_row": "{\"source_name\": \"\\\"SimpleWikipedia\\\"\", \"sentence\": \"\\\"Sepsis happens when the bacterium enters the bloo...\", \"sentences_before\": \"[]\", \"sentences_after\": \"[]\", \"concept_name\": \"\\\"sepsis\\\"\", \"quantifiers\": \"[]\", \"id\": \"\\\"SimpleWikipedia--tmp-sw-rs1-with-bug-fixes-initia...\", \"bert_score\": \"0.8396177887916565\", \"headings\": \"[\\\"Bubonic plague\\\", \\\"Different kinds of the same di...\", \"categories\": \"[\\\"Diseases caused by bacteria\\\", \\\"Pulmonology\\\"]\"}", "columns": ["source_name", "sentence", "sentences_before", "sentences_after", "concept_name", "quantifiers", "id", "bert_score", "headings", "categories"], "columns_mapping": {"source_name": "source_name", "sentence": "sentence", "sentences_before": "sentences_before", "sentences_after": "sentences_after", "concept_name": "concept_name", "quantifiers": "quantifiers", "id": "id", "bert_score": "bert_score", "headings": "headings", "categories": "categories"}, "dataset_description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. 
We recommend you start with GenericsKB-Best.\n", "dataset_name": "generics_kb"}, "generics_kb_waterloo": {"config_name": "generics_kb_waterloo", "sample_row": "{\"source_name\": \"\\\"Waterloo\\\"\", \"sentence\": \"\\\"Businesses can also survive by marketing to non-l...\", \"sentences_before\": \"[\\\"The low population also means that there are not...\", \"sentences_after\": \"[\\\"Our town covers an area about 8 blocks long, by ...\", \"concept_name\": \"\\\"business\\\"\", \"quantifiers\": \"[]\", \"id\": \"\\\"Waterloo-sbhaktha-waterloo-clean-node10-of-38-par...\", \"bert_score\": \"0.1443023681640625\"}", "columns": ["source_name", "sentence", "sentences_before", "sentences_after", "concept_name", "quantifiers", "id", "bert_score"], "columns_mapping": {"source_name": "source_name", "sentence": "sentence", "sentences_before": "sentences_before", "sentences_after": "sentences_after", "concept_name": "concept_name", "quantifiers": "quantifiers", "id": "id", "bert_score": "bert_score"}, "dataset_description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. 
We recommend you start with GenericsKB-Best.\n", "dataset_name": "generics_kb"}}, "tags": ["task_categories:other", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "knowledge-base"], "is_gated": false}, "germaner": {"dataset_name": "germaner", "description": "GermaNER is a freely available statistical German Named Entity Tagger based on conditional random fields(CRF). The tagger is trained and evaluated on the NoSta-D Named Entity dataset, which was used in the GermEval 2014 for named entity recognition. The tagger comes close to the performance of the best (proprietary) system in the competition with 77% F-measure (this is the latest result; the one reported in the paper is 76%) test set performance on the four standard NER classes (PERson, LOCation, ORGanisation and OTHer).\n\nWe describe a range of features and their influence on German NER classification and provide a comparative evaluation and some analysis of the results. The software components, the training data and all data used for feature generation are distributed under permissive licenses, thus this tagger can be used in academic and commercial settings without restrictions or fees. The tagger is available as a command-line tool and as an Apache UIMA component.", "downloads": 316, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Schartau\\\", \\\"sagte\\\", \\\"dem\\\", \\\"\\\\\\\"\\\", \\\"Tagesspiegel\\\",...\", \"ner_tags\": \"[3, 8, 8, 8, 1, 8, 8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "GermaNER is a freely available statistical German Named Entity Tagger based on conditional random fields(CRF). The tagger is trained and evaluated on the NoSta-D Named Entity dataset, which was used in the GermEval 2014 for named entity recognition. 
The tagger comes close to the performance of the best (proprietary) system in the competition with 77% F-measure (this is the latest result; the one reported in the paper is 76%) test set performance on the four standard NER classes (PERson, LOCation, ORGanisation and OTHer).\n\nWe describe a range of features and their influence on German NER classification and provide a comparative evaluation and some analysis of the results. The software components, the training data and all data used for feature generation are distributed under permissive licenses, thus this tagger can be used in academic and commercial settings without restrictions or fees. The tagger is available as a command-line tool and as an Apache UIMA component.\n", "dataset_name": "germaner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:de"], "is_gated": false}, "giga_fren": {"dataset_name": "giga_fren", "description": "Giga-word corpus for French-English from WMT2010 collected by Chris Callison-Burch\n2 languages, total number of files: 452\ntotal number of tokens: 1.43G\ntotal number of sentence fragments: 47.55M", "downloads": 291, "configs": {"en-fr": {"config_name": "en-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Changing Lives _BAR_ Changing Society _BAR_ How I...\", \"translation.fr\": \"\\\"Il a transform\\\\u00e9 notre vie _BAR_ Il a transfo...\"}", "columns": ["id", "translation_en", "translation_fr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "Giga-word corpus for French-English from WMT2010 collected by Chris Callison-Burch\n2 languages, total number of files: 452\ntotal number of tokens: 1.43G\ntotal number of sentence fragments: 47.55M\n", "dataset_name": "giga_fren"}}, "tags": ["task_categories:translation", 
"annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:fr"], "is_gated": false}, "gnad10": {"dataset_name": "gnad10", "description": "This dataset is intended to advance topic classification for German texts. A classifier that is efffective in\nEnglish may not be effective in German dataset because it has a higher inflection and longer compound words.\nThe 10kGNAD dataset contains 10273 German news articles from an Austrian online newspaper categorized into\n9 categories. Article titles and text are concatenated together and authors are removed to avoid a keyword-like\nclassification on authors that write frequently about one category. This dataset can be used as a benchmark\nfor German topic classification.", "downloads": 532, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"21-J\\\\u00e4hriger f\\\\u00e4llt wohl bis Saisonende a...\", \"label\": \"4\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "This dataset is intended to advance topic classification for German texts. A classifier that is efffective in\nEnglish may not be effective in German dataset because it has a higher inflection and longer compound words.\nThe 10kGNAD dataset contains 10273 German news articles from an Austrian online newspaper categorized into\n9 categories. Article titles and text are concatenated together and authors are removed to avoid a keyword-like\nclassification on authors that write frequently about one category. 
This dataset can be used as a benchmark\nfor German topic classification.\n", "dataset_name": "gnad10"}}, "tags": ["task_categories:text-classification", "task_ids:topic-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|other-from-One-Million-Posts-Corpus", "language:de"], "is_gated": false}, "go_emotions": {"dataset_name": "go_emotions", "description": "The GoEmotions dataset contains 58k carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\nThe emotion categories are admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire,\ndisappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness,\noptimism, pride, realization, relief, remorse, sadness, surprise.", "downloads": 7166, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"text\": \"\\\"That game hurt.\\\"\", \"id\": \"\\\"eew5j0j\\\"\", \"author\": \"\\\"Brdd9\\\"\", \"subreddit\": \"\\\"nrl\\\"\", \"link_id\": \"\\\"t3_ajis4z\\\"\", \"parent_id\": \"\\\"t1_eew18eq\\\"\", \"created_utc\": \"1548381039.0\", \"rater_id\": \"1\", \"example_very_unclear\": \"false\", \"admiration\": \"0\", \"amusement\": \"0\", \"anger\": \"0\", \"annoyance\": \"0\", \"approval\": \"0\", \"caring\": \"0\", \"confusion\": \"0\", \"curiosity\": \"0\", \"desire\": \"0\", \"disappointment\": \"0\", \"disapproval\": \"0\", \"disgust\": \"0\", \"embarrassment\": \"0\", \"excitement\": \"0\", \"fear\": \"0\", \"gratitude\": \"0\", \"grief\": \"0\", \"joy\": \"0\", \"love\": \"0\", \"nervousness\": \"0\", \"optimism\": \"0\", \"pride\": \"0\", \"realization\": \"0\", \"relief\": \"0\", \"remorse\": \"0\", \"sadness\": \"1\", \"surprise\": \"0\", \"neutral\": \"0\"}", "columns": ["text", "id", "author", "subreddit", "link_id", "parent_id", "created_utc", "rater_id", "example_very_unclear", "admiration", "amusement", "anger", "annoyance", "approval", "caring", 
"confusion", "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism", "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"], "columns_mapping": {"text": "text", "id": "id", "author": "author", "subreddit": "subreddit", "link_id": "link_id", "parent_id": "parent_id", "created_utc": "created_utc", "rater_id": "rater_id", "example_very_unclear": "example_very_unclear", "admiration": "admiration", "amusement": "amusement", "anger": "anger", "annoyance": "annoyance", "approval": "approval", "caring": "caring", "confusion": "confusion", "curiosity": "curiosity", "desire": "desire", "disappointment": "disappointment", "disapproval": "disapproval", "disgust": "disgust", "embarrassment": "embarrassment", "excitement": "excitement", "fear": "fear", "gratitude": "gratitude", "grief": "grief", "joy": "joy", "love": "love", "nervousness": "nervousness", "optimism": "optimism", "pride": "pride", "realization": "realization", "relief": "relief", "remorse": "remorse", "sadness": "sadness", "surprise": "surprise", "neutral": "neutral"}, "dataset_description": "The GoEmotions dataset contains 58k carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\nThe emotion categories are admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire,\ndisappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness,\noptimism, pride, realization, relief, remorse, sadness, surprise.\n", "dataset_name": "go_emotions"}, "simplified": {"config_name": "simplified", "sample_row": "{\"text\": \"\\\"My favourite food is anything I didn't have to co...\", \"labels\": \"[27]\", \"id\": \"\\\"eebbqej\\\"\"}", "columns": ["text", "labels", "id"], "columns_mapping": {"text": "text", "labels": "labels", "id": "id"}, "dataset_description": "The GoEmotions dataset contains 58k 
carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\nThe emotion categories are admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire,\ndisappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness,\noptimism, pride, realization, relief, remorse, sadness, surprise.\n", "dataset_name": "go_emotions"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "emotion"], "is_gated": false}, "google_wellformed_query": {"dataset_name": "google_wellformed_query", "description": "Google's query wellformedness dataset was created by crowdsourcing well-formedness annotations for 25,100 queries from the Paralex corpus. Every query was annotated by five raters each with 1/0 rating of whether or not the query is well-formed.", "downloads": 486, "configs": {"default": {"config_name": "default", "sample_row": "{\"rating\": \"0.2\", \"content\": \"\\\"The European Union includes how many ?\\\"\"}", "columns": ["rating", "content"], "columns_mapping": {"rating": "rating", "content": "content"}, "dataset_description": "Google's query wellformedness dataset was created by crowdsourcing well-formedness annotations for 25,100 queries from the Paralex corpus. 
Every query was annotated by five raters each with 1/0 rating of whether or not the query is well-formed.\n", "dataset_name": "google_wellformed_query"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended", "language:en"], "is_gated": false}, "grail_qa": {"dataset_name": "grail_qa", "description": "Strongly Generalizable Question Answering (GrailQA) is a new large-scale, high-quality dataset for question answering on knowledge bases (KBQA) on Freebase with 64,331 questions annotated with both answers and corresponding logical forms in different syntax (i.e., SPARQL, S-expression, etc.). It can be used to test three levels of generalization in KBQA: i.i.d., compositional, and zero-shot.", "downloads": 378, "configs": {"default": {"config_name": "default", "sample_row": "{\"qid\": \"\\\"2101535001000\\\"\", \"question\": \"\\\"oxybutynin chloride 5 extended release film coate...\", \"answer.answer_type\": \"[\\\"Entity\\\", \\\"Entity\\\"]\", \"answer.answer_argument\": \"[\\\"m.0z3xfvs\\\", \\\"m.0z3xm0m\\\"]\", \"answer.entity_name\": \"[\\\"Oxybutynin Oral\\\", \\\"Oxybutynin Chloride Oral\\\"]\", \"function\": \"\\\"none\\\"\", \"num_node\": \"2\", \"num_edge\": \"1\", \"graph_query.nodes.nid\": \"[0, 1]\", \"graph_query.nodes.node_type\": \"[\\\"class\\\", \\\"entity\\\"]\", \"graph_query.nodes.id\": \"[\\\"medicine.routed_drug\\\", \\\"m.0hqs1x_\\\"]\", \"graph_query.nodes.class\": \"[\\\"medicine.routed_drug\\\", \\\"medicine.drug_formulatio...\", \"graph_query.nodes.friendly_name\": \"[\\\"Routed drug\\\", \\\"Oxybutynin chloride 5 extended re...\", \"graph_query.nodes.question_node\": \"[1, 0]\", \"graph_query.nodes.function\": \"[\\\"none\\\", \\\"none\\\"]\", \"graph_query.edges.start\": \"[0]\", \"graph_query.edges.end\": \"[1]\", \"graph_query.edges.relation\": \"[\\\"medicine.routed_drug.marketed_formulations\\\"]\", 
\"graph_query.edges.friendly_name\": \"[\\\"Marketed formulations\\\"]\", \"sparql_query\": \"\\\"PREFIX rdf: 894be9b4...\", \"unique_id\": \"\\\"1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be9b4...\", \"excerpt_index\": \"11\"}", "columns": ["source", "citeStart", "sectionName", "string", "citeEnd", "label", "label_confidence", "label2", "label2_confidence", "citingPaperId", "citedPaperId", "isKeyCitation", "id", "unique_id", "excerpt_index"], "columns_mapping": {"source": "source", "citeStart": "citeStart", "sectionName": "sectionName", "string": "string", "citeEnd": "citeEnd", "label": "label", "label_confidence": "label_confidence", "label2": "label2", "label2_confidence": "label2_confidence", "citingPaperId": "citingPaperId", "citedPaperId": "citedPaperId", "isKeyCitation": "isKeyCitation", "id": "id", "unique_id": "unique_id", "excerpt_index": "excerpt_index"}, "dataset_description": "SciCite is a dataset of 11K manually annotated citation intents based on\ncitation context in the computer science and biomedical domains.\n", "dataset_name": "bigbio/scicite"}, "scicite_bigbio_text": {"config_name": "scicite_bigbio_text", "sample_row": "{\"id\": \"\\\"1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be9b4...\", \"document_id\": \"\\\"1872080baa7d30ec8fb87be9a65358cd3a7fb649\\\"\", \"text\": \"\\\"However, how frataxin interacts with the Fe-S clu...\", \"labels\": \"[\\\"background\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "SciCite is a dataset of 11K manually annotated citation intents based on\ncitation context in the computer science and biomedical domains.\n", "dataset_name": "bigbio/scicite"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/tmvar_v1": {"dataset_name": "bigbio/tmvar_v1", "description": "This dataset contains 500 PubMed articles manually annotated with mutation mentions 
of various kinds. It can be used for NER tasks only.\nThe dataset is split into train(334) and test(166) splits", "downloads": 50, "configs": {"tmvar_v1_source": {"config_name": "tmvar_v1_source", "sample_row": "{\"pmid\": \"\\\"22016685\\\"\", \"passages\": \"[{\\\"type\\\": \\\"title\\\", \\\"text\\\": \\\"A novel missense mutat...\", \"entities\": \"[{\\\"offsets\\\": [26, 35], \\\"text\\\": \\\"Asp506Gly\\\", \\\"seman...\"}", "columns": ["pmid", "passages", "entities"], "columns_mapping": {"pmid": "pmid", "passages": "passages", "entities": "entities"}, "dataset_description": "This dataset contains 500 PubMed articles manually annotated with mutation mentions of various kinds. It can be used for NER tasks only.\nThe dataset is split into train(334) and test(166) splits", "dataset_name": "bigbio/tmvar_v1"}, "tmvar_v1_bigbio_kb": {"config_name": "tmvar_v1_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"22016685\\\"\", \"passages\": \"[{\\\"id\\\": \\\"5\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"A novel mi...\", \"entities\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"ProteinMutation\\\", \\\"text\\\": [\\\"...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "This dataset contains 500 PubMed articles manually annotated with mutation mentions of various kinds. 
It can be used for NER tasks only.\nThe dataset is split into train(334) and test(166) splits", "dataset_name": "bigbio/tmvar_v1"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/tmvar_v2": {"dataset_name": "bigbio/tmvar_v2", "description": "This dataset contains 158 PubMed articles manually annotated with mutation mentions of various kinds and dbsnp normalizations for each of them.\nIt can be used for NER tasks and NED tasks, This dataset has a single split", "downloads": 63, "configs": {"tmvar_v2_source": {"config_name": "tmvar_v2_source", "sample_row": "{\"pmid\": \"\\\"22051099\\\"\", \"passages\": \"[{\\\"type\\\": \\\"title\\\", \\\"text\\\": \\\"Variation in the CXCR1...\", \"entities\": \"[{\\\"offsets\\\": [327, 336], \\\"text\\\": \\\"rs2234671\\\", \\\"sem...\"}", "columns": ["pmid", "passages", "entities"], "columns_mapping": {"pmid": "pmid", "passages": "passages", "entities": "entities"}, "dataset_description": "This dataset contains 158 PubMed articles manually annotated with mutation mentions of various kinds and dbsnp normalizations for each of them.\nIt can be used for NER tasks and NED tasks, This dataset has a single split", "dataset_name": "bigbio/tmvar_v2"}, "tmvar_v2_bigbio_kb": {"config_name": "tmvar_v2_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"22051099\\\"\", \"passages\": \"[{\\\"id\\\": \\\"6\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"Variation ...\", \"entities\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"SNP\\\", \\\"text\\\": [\\\"rs2234671\\\"],...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "This dataset contains 158 
PubMed articles manually annotated with mutation mentions of various kinds and dbsnp normalizations for each of them.\nIt can be used for NER tasks and NED tasks, This dataset has a single split", "dataset_name": "bigbio/tmvar_v2"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "PlanTL-GOB-ES/sts-es": {"dataset_name": "PlanTL-GOB-ES/sts-es", "description": "For Semantic Text Similarity, we collected the Spanish test sets from SemEval-2014 (Agirre et al., 2014) and SemEval-2015 (Agirre et al., 2015). Since no training data was provided for the Spanish subtask, we randomly sampled both datasets into 1,321 sentences for the train set, 78 sentences for the development set, and 156 sentences for the test set. To make the task harder for the models, we purposely made the development set smaller than the test set.", "downloads": 63, "configs": {"STS": {"config_name": "STS", "sample_row": "{\"id\": \"\\\"0\\\"\", \"sentence1\": \"\\\"Seg\\\\u00fan el sondeo, 87% de los cat\\\\u00f3licos c...\", \"sentence2\": \"\\\"El 87% de los cat\\\\u00f3licos del mundo aprobaron ...\", \"label\": \"3.75\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "\nFor Semantic Text Similarity, we collected the Spanish test sets from SemEval-2014 (Agirre et al., 2014) and SemEval-2015 (Agirre et al., 2015). Since no training data was provided for the Spanish subtask, we randomly sampled both datasets into 1,321 sentences for the train set, 78 sentences for the development set, and 156 sentences for the test set. 
To make the task harder for the models, we purposely made the development set smaller than the test set.\n", "dataset_name": "PlanTL-GOB-ES/sts-es"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-scoring", "task_ids:text-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "language:es"], "is_gated": false}, "PlanTL-GOB-ES/WikiCAT_esv2": {"dataset_name": "PlanTL-GOB-ES/WikiCAT_esv2", "description": "WikiCAT: Text Classification Spanish dataset from the Viquipedia", "downloads": 47, "configs": {"wikiCAT_es": {"config_name": "wikiCAT_es", "sample_row": "{\"text\": \"\\\"En estad\\\\u00edstica, un modelo probit es un tipo ...\", \"label\": \"5\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "\n WikiCAT: Text Classification Spanish dataset from the Viquipedia\n\n ", "dataset_name": "PlanTL-GOB-ES/WikiCAT_esv2"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:automatically-generated", "multilinguality:monolingual", "language:es"], "is_gated": false}, "jeanlee/kmhas_korean_hate_speech": {"dataset_name": "jeanlee/kmhas_korean_hate_speech", "description": "The K-MHaS (Korean Multi-label Hate Speech) dataset contains 109k utterances from Korean online news comments labeled with 8 fine-grained hate speech classes or Not Hate Speech class.\nThe fine-grained hate speech classes are politics, origin, physical, age, gender, religion, race, and profanity and these categories are selected in order to reflect the social and historical context.", "downloads": 555, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"\\\\\\\"\\\\uc790\\\\ud55c\\\\ub2f9\\\\ud2c0\\\\ub531\\\\ub4e4.. 
\\\\uc545\\\\u...\", \"label\": \"[2, 4]\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The K-MHaS (Korean Multi-label Hate Speech) dataset contains 109k utterances from Korean online news comments labeled with 8 fine-grained hate speech classes or Not Hate Speech class.\nThe fine-grained hate speech classes are politics, origin, physical, age, gender, religion, race, and profanity and these categories are selected in order to reflect the social and historical context.\n", "dataset_name": "jeanlee/kmhas_korean_hate_speech"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "task_ids:hate-speech-detection", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ko", "K-MHaS", "Korean NLP", "Hate Speech Detection", "Dataset", "Coling2022"], "is_gated": false}, "gsarti/mt_geneval": {"dataset_name": "gsarti/mt_geneval", "description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. 
The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.", "downloads": 41, "configs": {"sentences_en_ar": {"config_name": "sentences_en_ar", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"At 12 years old, she became an assistant stick gi...\", \"reference_feminine\": \"\\\"\\\\u0641\\\\u064a \\\\u0633\\\\u0646 \\\\u0627\\\\u0644\\\\u062b\\\\u062...\", \"source_masculine\": \"\\\"At 12 years old, he became an assistant stick boy...\", \"reference_masculine\": \"\\\"\\\\u0641\\\\u064a \\\\u0633\\\\u0646 \\\\u0627\\\\u0644\\\\u062b\\\\u062...\", \"source_feminine_annotated\": \"\\\"At 12 years old, she became an assistant s...\", \"reference_feminine_annotated\": \"\\\"\\\\u0641\\\\u064a \\\\u0633\\\\u0646 \\\\u0627\\\\u0644\\\\u062b\\\\u062...\", \"source_masculine_annotated\": \"\\\"At 12 years old, he became an assistant st...\", \"reference_masculine_annotated\": \"\\\"\\\\u0641\\\\u064a \\\\u0633\\\\u0646 \\\\u0627\\\\u0644\\\\u062b\\\\u062...\", \"source_feminine_keywords\": \"\\\"she;girl\\\"\", \"reference_feminine_keywords\": \"\\\"\\\\u0623\\\\u0635\\\\u0628\\\\u062d\\\\u062a \\\\u062d\\\\u0627\\\\u0645...\", \"source_masculine_keywords\": \"\\\"he;boy\\\"\", \"reference_masculine_keywords\": \"\\\"\\\\u0623\\\\u0635\\\\u0628\\\\u062d \\\\u062d\\\\u0627\\\\u0645\\\\u0644...\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": 
"reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_ar": {"config_name": "context_en_ar", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"He continued to gain recognition as an applied ar...\", \"source\": \"\\\"After these wins, Brodovitch's career as an appli...\", \"reference_original\": \"\\\"\\\\u0628\\\\u0639\\\\u062f \\\\u0647\\\\u0630\\\\u0647 \\\\u0627\\\\u064...\", \"reference_flipped\": \"\\\"\\\\u0628\\\\u0639\\\\u062f \\\\u0647\\\\u0630\\\\u0647 \\\\u0627\\\\u064...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. 
The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_fr": {"config_name": "sentences_en_fr", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"Career She started her singing career in 2003 wit...\", \"reference_feminine\": \"\\\"Carri\\\\u00e8re Elle d\\\\u00e9buta sa carri\\\\u00e8re d...\", \"source_masculine\": \"\\\"Career He started his singing career in 2003 with...\", \"reference_masculine\": \"\\\"Carri\\\\u00e8re Il d\\\\u00e9buta sa carri\\\\u00e8re de ...\", \"source_feminine_annotated\": \"\\\"Career She started her singing care...\", \"reference_feminine_annotated\": \"\\\"Carri\\\\u00e8re Elle d\\\\u00e9buta sa carri\\\\u0...\", \"source_masculine_annotated\": \"\\\"Career He started his singing caree...\", \"reference_masculine_annotated\": \"\\\"Carri\\\\u00e8re Il d\\\\u00e9buta sa carri\\\\u00e...\", \"source_feminine_keywords\": \"\\\"She;her;her;her\\\"\", \"reference_feminine_keywords\": \"\\\"Elle;chanteuse\\\"\", \"source_masculine_keywords\": \"\\\"He;his;his;his\\\"\", \"reference_masculine_keywords\": \"\\\"Il;chanteur\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": 
"source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_fr": {"config_name": "context_en_fr", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"He then went to Sydney and then Melbourne holding...\", \"source\": \"\\\"Evergood was a capable artist, who mostly painted...\", \"reference_original\": \"\\\"Evergood \\\\u00e9tait un artiste comp\\\\u00e9tent, qu...\", \"reference_flipped\": \"\\\"Evergood \\\\u00e9tait une artiste comp\\\\u00e9tente, ...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. 
The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_de": {"config_name": "sentences_en_de", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"After some wrangling Blacket accepted \\\\u00a350 in...\", \"reference_feminine\": \"\\\"Nach einigem Hin und Her akzeptierte Blacket 50 P...\", \"source_masculine\": \"\\\"After some wrangling Blacket accepted \\\\u00a350 in...\", \"reference_masculine\": \"\\\"Nach einigem Hin und Her akzeptierte Blacket 50 P...\", \"source_feminine_annotated\": \"\\\"After some wrangling Blacket accepted \\\\u00a350 in...\", \"reference_feminine_annotated\": \"\\\"Nach einigem Hin und Her akzeptierte Blacket 50 P...\", \"source_masculine_annotated\": \"\\\"After some wrangling Blacket accepted \\\\u00a350 in...\", \"reference_masculine_annotated\": \"\\\"Nach einigem Hin und Her akzeptierte Blacket 50 P...\", \"source_feminine_keywords\": \"\\\"her\\\"\", \"reference_feminine_keywords\": \"\\\"ihr\\\"\", \"source_masculine_keywords\": \"\\\"him\\\"\", \"reference_masculine_keywords\": \"\\\"ihm\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", 
"reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_de": {"config_name": "context_en_de", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"As Professor of Painting at the Royal Academy he ...\", \"source\": \"\\\"Clausen was an official war artist during World W...\", \"reference_original\": \"\\\"Clausen war ein offizieller Kriegsk\\\\u00fcnstler w...\", \"reference_flipped\": \"\\\"Clausen war eine offizielle Kriegsk\\\\u00fcnstlerin...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. 
The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_hi": {"config_name": "sentences_en_hi", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"In 1898, the empress authorized the creation of a...\", \"reference_feminine\": \"\\\"1898 \\\\u092e\\\\u0947\\\\u0902, \\\\u0938\\\\u093e\\\\u092e\\\\u094d...\", \"source_masculine\": \"\\\"In 1898, the emperor authorized the creation of a...\", \"reference_masculine\": \"\\\"1898 \\\\u092e\\\\u0947\\\\u0902, \\\\u0938\\\\u092e\\\\u094d\\\\u0930...\", \"source_feminine_annotated\": \"\\\"In 1898, the empress authorized the creati...\", \"reference_feminine_annotated\": \"\\\"1898 \\\\u092e\\\\u0947\\\\u0902, \\\\u0938\\\\u093e\\\\u092e\\\\u0...\", \"source_masculine_annotated\": \"\\\"In 1898, the emperor authorized the creati...\", \"reference_masculine_annotated\": \"\\\"1898 \\\\u092e\\\\u0947\\\\u0902, \\\\u0938\\\\u092e\\\\u094d\\\\u0...\", \"source_feminine_keywords\": \"\\\"empress\\\"\", \"reference_feminine_keywords\": \"\\\"\\\\u0938\\\\u093e\\\\u092e\\\\u094d\\\\u0930\\\\u093e\\\\u091c\\\\u094d\\\\...\", \"source_masculine_keywords\": \"\\\"emperor\\\"\", \"reference_masculine_keywords\": \"\\\"\\\\u0938\\\\u092e\\\\u094d\\\\u0930\\\\u093e\\\\u091f\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", 
"source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_hi": {"config_name": "context_en_hi", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"The story of GPA's downfall is told by Christophe...\", \"source\": \"\\\"It is based on a contemporaneous diary of events ...\", \"reference_original\": \"\\\"\\\\u092f\\\\u0939 1990 \\\\u0938\\\\u0947 1996 \\\\u0924\\\\u0915 ...\", \"reference_flipped\": \"\\\"\\\\u092f\\\\u0939 1990 \\\\u0938\\\\u0947 1996 \\\\u0924\\\\u0915 ...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. 
The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_it": {"config_name": "sentences_en_it", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"Pagratidis quickly recanted her confession, claim...\", \"reference_feminine\": \"\\\"Pagratidis subito ritratt\\\\u00f2 la sua confession...\", \"source_masculine\": \"\\\"Pagratidis quickly recanted his confession, claim...\", \"reference_masculine\": \"\\\"Pagratidis subito ritratt\\\\u00f2 la sua confession...\", \"source_feminine_annotated\": \"\\\"Pagratidis quickly recanted her confession...\", \"reference_feminine_annotated\": \"\\\"Pagratidis subito ritratt\\\\u00f2 la sua confession...\", \"source_masculine_annotated\": \"\\\"Pagratidis quickly recanted his confession...\", \"reference_masculine_annotated\": \"\\\"Pagratidis subito ritratt\\\\u00f2 la sua confession...\", \"source_feminine_keywords\": \"\\\"her;she;her;she;her\\\"\", \"reference_feminine_keywords\": \"\\\"stata picchiata;ferma\\\"\", \"source_masculine_keywords\": \"\\\"his;he;his;he;his\\\"\", \"reference_masculine_keywords\": \"\\\"stato picchiato;fermo\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", 
"source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_it": {"config_name": "context_en_it", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"Pierpont told of entering and holding up the bank...\", \"source\": \"\\\"However, Pierpont stated that Skeer was the plann...\", \"reference_original\": \"\\\"Comunque, Pierpont disse che Skeer era il pianifi...\", \"reference_flipped\": \"\\\"Comunque, Pierpont disse che Skeer era la pianifi...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. 
The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_pt": {"config_name": "sentences_en_pt", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"In variants of Bluebeard, the wife's curiosity is...\", \"reference_feminine\": \"\\\"Em varia\\\\u00e7\\\\u00f5es da hist\\\\u00f3ria do Barba ...\", \"source_masculine\": \"\\\"In variants of Bluebeard, the husband's curiosity...\", \"reference_masculine\": \"\\\"Em varia\\\\u00e7\\\\u00f5es da hist\\\\u00f3ria do Barba ...\", \"source_feminine_annotated\": \"\\\"In variants of Bluebeard, the wife's curio...\", \"reference_feminine_annotated\": \"\\\"Em varia\\\\u00e7\\\\u00f5es da hist\\\\u00f3ria do Barba ...\", \"source_masculine_annotated\": \"\\\"In variants of Bluebeard, the husband's cu...\", \"reference_masculine_annotated\": \"\\\"Em varia\\\\u00e7\\\\u00f5es da hist\\\\u00f3ria do Barba ...\", \"source_feminine_keywords\": \"\\\"wife's;she\\\"\", \"reference_feminine_keywords\": \"\\\"da esposa;ela\\\"\", \"source_masculine_keywords\": \"\\\"husband's;he\\\"\", \"reference_masculine_keywords\": \"\\\"do marido;ele\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": 
"source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_pt": {"config_name": "context_en_pt", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"He wrote a retrospective of his work and its cont...\", \"source\": \"\\\"Goguen was a practitioner of Tibetan Buddhism.\\\"...\", \"reference_original\": \"\\\"Goguen era um praticante do Budismo Tibetano.\\\"\", \"reference_flipped\": \"\\\"Goguen era uma praticante do Budismo Tibetano.\\\"...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. 
The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_ru": {"config_name": "sentences_en_ru", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"Mrs Duncan supported the idea, and government bod...\", \"reference_feminine\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0436\\\\u0430 \\\\u0414...\", \"source_masculine\": \"\\\"Mr Duncan supported the idea, and government bodi...\", \"reference_masculine\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0434\\\\u0438\\\\u043d ...\", \"source_feminine_annotated\": \"\\\"Mrs Duncan supported the idea, and governm...\", \"reference_feminine_annotated\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0436\\\\u0430...\", \"source_masculine_annotated\": \"\\\"Mr Duncan supported the idea, and governme...\", \"reference_masculine_annotated\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0434\\\\u0438\\\\u04...\", \"source_feminine_keywords\": \"\\\"Mrs\\\"\", \"reference_feminine_keywords\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0436\\\\u0430;\\\\u043f...\", \"source_masculine_keywords\": \"\\\"Mr\\\"\", \"reference_masculine_keywords\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0434\\\\u0438\\\\u043d;...\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", 
"source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_ru": {"config_name": "context_en_ru", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"\\\\u201cDictators are stupid,\\\\u201d he noted, \\\\u201...\", \"source\": \"\\\"As a young artist in Baghdad in the 1980s, Alfraj...\", \"reference_original\": \"\\\"\\\\u0412 1980-\\\\u0445 \\\\u0433\\\\u043e\\\\u0434\\\\u0430\\\\u0445...\", \"reference_flipped\": \"\\\"\\\\u0412 1980-\\\\u0445 \\\\u0433\\\\u043e\\\\u0434\\\\u0430\\\\u0445...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. 
The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_es": {"config_name": "sentences_en_es", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"Morgan was asked by Fey to play the role, and she...\", \"reference_feminine\": \"\\\"Fey le pidi\\\\u00f3 a Morgan que interpretara el pa...\", \"source_masculine\": \"\\\"Morgan was asked by Fey to play the role, and he ...\", \"reference_masculine\": \"\\\"Fey le pidi\\\\u00f3 a Morgan que interpretara el pa...\", \"source_feminine_annotated\": \"\\\"Morgan was asked by Fey to play the role, and ...\", \"reference_feminine_annotated\": \"\\\"Fey le pidi\\\\u00f3 a Morgan que interpretara el pa...\", \"source_masculine_annotated\": \"\\\"Morgan was asked by Fey to play the role, and ...\", \"reference_masculine_annotated\": \"\\\"Fey le pidi\\\\u00f3 a Morgan que interpretara el pa...\", \"source_feminine_keywords\": \"\\\"she;her;her\\\"\", \"reference_feminine_keywords\": \"\\\"ella\\\"\", \"source_masculine_keywords\": \"\\\"he;his;him\\\"\", \"reference_masculine_keywords\": \"\\\"\\\\u00e9l\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", 
"reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_es": {"config_name": "context_en_es", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"In 1994\\\\u201395, he conducted a research project ...\", \"source\": \"\\\"Ritchin is a prolific author and curator, focusin...\", \"reference_original\": \"\\\"Ritchin es un autor y conservador prol\\\\u00edfico,...\", \"reference_flipped\": \"\\\"Ritchin es una autora y conservadora prol\\\\u00edfi...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. 
The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:en", "language:it", "language:fr", "language:ar", "language:de", "language:hi", "language:pt", "language:ru", "language:es", "gender", "constrained mt"], "is_gated": false}, "sagnikrayc/snli-cf-kaushik": {"dataset_name": "sagnikrayc/snli-cf-kaushik", "description": "The SNLI corpus (version 1.0) is a collection of 570k human-written English sentence pairs manually labeled for balanced classification with the labels entailment, contradiction, and neutral, supporting the task of natural language inference (NLI), also known as recognizing textual entailment (RTE). In the ICLR 2020 paper [Learning the Difference that Makes a Difference with Counterfactually-Augmented Data](https://openreview.net/forum?id=Sklgs0NFvr), Kaushik et. al. provided a dataset with counterfactual perturbations on the SNLI and IMDB data. 
This repository contains the original and counterfactual perturbations for the SNLI data, which was generated after processing the original data from [here](https://github.com/acmi-lab/counterfactually-augmented-data).", "downloads": 15, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"idx\": \"\\\"3021531305.jpg#0r1n-orig\\\"\", \"premise\": \"\\\"A man is riding a red motorcycle with a small chi...\", \"hypothesis\": \"\\\"A man rides his motorcyle with his won.\\\"\", \"label\": \"\\\"neutral\\\"\", \"type\": \"\\\"original\\\"\"}", "columns": ["idx", "premise", "hypothesis", "label", "type"], "columns_mapping": {"idx": "idx", "premise": "premise", "hypothesis": "hypothesis", "label": "label", "type": "type"}, "dataset_description": "The SNLI corpus (version 1.0) is a collection of 570k human-written English sentence pairs manually labeled for balanced classification with the labels entailment, contradiction, and neutral, supporting the task of natural language inference (NLI), also known as recognizing textual entailment (RTE). In the ICLR 2020 paper [Learning the Difference that Makes a Difference with Counterfactually-Augmented Data](https://openreview.net/forum?id=Sklgs0NFvr), Kaushik et. al. provided a dataset with counterfactual perturbations on the SNLI and IMDB data. This repository contains the original and counterfactual perturbations for the SNLI data, which was generated after processing the original data from [here](https://github.com/acmi-lab/counterfactually-augmented-data).", "dataset_name": "sagnikrayc/snli-cf-kaushik"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "task_ids:multi-input-text-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|snli", "language:en"], "is_gated": false}, "vesteinn/swe-nerc": {"dataset_name": "vesteinn/swe-nerc", "description": "The corpus consists of ca. 
150.000 words of text.", "downloads": 199, "configs": {"swe-nerc": {"config_name": "swe-nerc", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Det\\\", \\\"har\\\", \\\"iaf\\\", \\\"jag\\\", \\\"gjort\\\", \\\"men\\\", \\\"ska\\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpus consists of ca. 150.000 words of text.\n", "dataset_name": "vesteinn/swe-nerc"}}, "tags": [], "is_gated": false}, "shunk031/jsnli": {"dataset_name": "shunk031/jsnli", "description": "== \u65e5\u672c\u8a9eSNLI(JSNLI)\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 ==\n\nSNLI \u30b3\u30fc\u30d1\u30b9\u3092\u65e5\u672c\u8a9e\u306b\u7ffb\u8a33\u3057\u305f\u81ea\u7136\u8a00\u8a9e\u63a8\u8ad6\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\n\u5b66\u7fd2\u30c7\u30fc\u30bf\u306f\u5143\u30c7\u30fc\u30bf\u3092\u7ffb\u8a33\u3057\u3001\u8a08\u7b97\u6a5f\u306b\u3088\u308b\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u306b\u3088\u3063\u3066\u4f5c\u6210\n\u8a55\u4fa1\u30c7\u30fc\u30bf\u306f\u65e5\u672c\u8a9e\u3068\u3057\u3066\u610f\u5473\u304c\u901a\u308b\u304b\u3001\u7ffb\u8a33\u5f8c\u306e\u30e9\u30d9\u30eb\u304c\u5143\u306e\u30e9\u30d9\u30eb\u3068\u4e00\u81f4\u3057\u3066\u3044\u308b\u304b\u3069\u3046\u304b\u306e2\u6bb5\u968e\u306e\u30af\u30e9\u30a6\u30c9\u30bd\u30fc\u30b7\u30f3\u30b0\u306b\u3088\u308a\u30c7\u30fc\u30bf\u3092\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0", "downloads": 52, "configs": {"with-filtering": {"config_name": "with-filtering", "sample_row": "{\"premise\": \"\\\"\\\\u30ac\\\\u30ec\\\\u30fc\\\\u30b8 \\\\u3067 \\\\u3001 \\\\u58c1 \\\\u3...\", \"hypothesis\": \"\\\"\\\\u7537 \\\\u306f \\\\u9b54\\\\u6cd5 \\\\u306e \\\\u30b7\\\\u30e7\\\\u3...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": 
"== \u65e5\u672c\u8a9eSNLI(JSNLI)\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 ==\n\nSNLI \u30b3\u30fc\u30d1\u30b9\u3092\u65e5\u672c\u8a9e\u306b\u7ffb\u8a33\u3057\u305f\u81ea\u7136\u8a00\u8a9e\u63a8\u8ad6\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\n\u5b66\u7fd2\u30c7\u30fc\u30bf\u306f\u5143\u30c7\u30fc\u30bf\u3092\u7ffb\u8a33\u3057\u3001\u8a08\u7b97\u6a5f\u306b\u3088\u308b\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u306b\u3088\u3063\u3066\u4f5c\u6210\n\u8a55\u4fa1\u30c7\u30fc\u30bf\u306f\u65e5\u672c\u8a9e\u3068\u3057\u3066\u610f\u5473\u304c\u901a\u308b\u304b\u3001\u7ffb\u8a33\u5f8c\u306e\u30e9\u30d9\u30eb\u304c\u5143\u306e\u30e9\u30d9\u30eb\u3068\u4e00\u81f4\u3057\u3066\u3044\u308b\u304b\u3069\u3046\u304b\u306e2\u6bb5\u968e\u306e\u30af\u30e9\u30a6\u30c9\u30bd\u30fc\u30b7\u30f3\u30b0\u306b\u3088\u308a\u30c7\u30fc\u30bf\u3092\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\n", "dataset_name": "shunk031/jsnli"}, "without-filtering": {"config_name": "without-filtering", "sample_row": "{\"premise\": \"\\\"\\\\u30ac\\\\u30ec\\\\u30fc\\\\u30b8 \\\\u3067 \\\\u3001 \\\\u58c1 \\\\u3...\", \"hypothesis\": \"\\\"\\\\u7537 \\\\u306f \\\\u9b54\\\\u6cd5 \\\\u306e \\\\u30b7\\\\u30e7\\\\u3...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "== \u65e5\u672c\u8a9eSNLI(JSNLI)\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 ==\n\nSNLI 
\u30b3\u30fc\u30d1\u30b9\u3092\u65e5\u672c\u8a9e\u306b\u7ffb\u8a33\u3057\u305f\u81ea\u7136\u8a00\u8a9e\u63a8\u8ad6\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\n\u5b66\u7fd2\u30c7\u30fc\u30bf\u306f\u5143\u30c7\u30fc\u30bf\u3092\u7ffb\u8a33\u3057\u3001\u8a08\u7b97\u6a5f\u306b\u3088\u308b\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u306b\u3088\u3063\u3066\u4f5c\u6210\n\u8a55\u4fa1\u30c7\u30fc\u30bf\u306f\u65e5\u672c\u8a9e\u3068\u3057\u3066\u610f\u5473\u304c\u901a\u308b\u304b\u3001\u7ffb\u8a33\u5f8c\u306e\u30e9\u30d9\u30eb\u304c\u5143\u306e\u30e9\u30d9\u30eb\u3068\u4e00\u81f4\u3057\u3066\u3044\u308b\u304b\u3069\u3046\u304b\u306e2\u6bb5\u968e\u306e\u30af\u30e9\u30a6\u30c9\u30bd\u30fc\u30b7\u30f3\u30b0\u306b\u3088\u308a\u30c7\u30fc\u30bf\u3092\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\n", "dataset_name": "shunk031/jsnli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "task_ids:multi-input-text-classification", "multilinguality:monolingual", "language:ja", "natural-language-inference", "nli", "jsnli"], "is_gated": false}, "its5Q/yandex-q": {"dataset_name": "its5Q/yandex-q", "description": "This is a dataset of questions and answers scraped from Yandex.Q.", "downloads": 43, "configs": {"default": {"config_name": "default", "sample_row": "{\"description\": \"\\\"\\\"\", \"question\": \"\\\"\\\\u041a\\\\u0430\\\\u043a \\\\u0432\\\\u043e\\\\u0439\\\\u0442\\\\u0438...\", \"answer\": \"\\\"\\\\u041d\\\\u0438\\\\u043a\\\\u0430\\\\u043a \\\\u043d\\\\u043e \\\\u043...\"}", "columns": ["description", "question", "answer"], "columns_mapping": {"description": "description", "question": "question", "answer": "answer"}, "dataset_description": "This is a dataset of questions and answers scraped from Yandex.Q.\n", "dataset_name": "its5Q/yandex-q"}}, "tags": ["task_categories:text-generation", "task_categories:question-answering", "task_ids:language-modeling", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", 
"source_datasets:original", "language:ru"], "is_gated": false}, "RobotsMaliAI/bayelemabaga": {"dataset_name": "RobotsMaliAI/bayelemabaga", "description": "The Bayelemabaga dataset is a collection of 44160 aligned machine translation ready Bambara-French lines, \noriginating from Corpus Bambara de Reference. The dataset is constitued of text extracted from 231 source files, \nvaring from periodicals, books, short stories, blog posts, part of the Bible and the Quran.", "downloads": 58, "configs": {"bam-fr": {"config_name": "bam-fr", "sample_row": "{\"translation.bam\": \"\\\"Mieru Baa ka maana. Ayiwa!\\\"\", \"translation.fr\": \"\\\"Le recit de Mieru Baa Eh bien! Fanta Maa.\\\"\"}", "columns": ["translation_bam", "translation_fr"], "columns_mapping": {"translation.bam": "translation_bam", "translation.fr": "translation_fr"}, "dataset_description": "The Bayelemabaga dataset is a collection of 44160 aligned machine translation ready Bambara-French lines, \noriginating from Corpus Bambara de Reference. The dataset is constitued of text extracted from 231 source files, \nvaring from periodicals, books, short stories, blog posts, part of the Bible and the Quran.\n", "dataset_name": "RobotsMaliAI/bayelemabaga"}, "fr-bam": {"config_name": "fr-bam", "sample_row": "{\"translation.fr\": \"\\\"Le recit de Mieru Baa Eh bien! Fanta Maa.\\\"\", \"translation.bam\": \"\\\"Mieru Baa ka maana. Ayiwa!\\\"\"}", "columns": ["translation_fr", "translation_bam"], "columns_mapping": {"translation.fr": "translation_fr", "translation.bam": "translation_bam"}, "dataset_description": "The Bayelemabaga dataset is a collection of 44160 aligned machine translation ready Bambara-French lines, \noriginating from Corpus Bambara de Reference. 
The dataset is constitued of text extracted from 231 source files, \nvaring from periodicals, books, short stories, blog posts, part of the Bible and the Quran.\n", "dataset_name": "RobotsMaliAI/bayelemabaga"}}, "tags": ["task_categories:translation", "task_categories:text-generation", "language:bm", "language:fr"], "is_gated": false}, "ipipan/nkjp1m": {"dataset_name": "ipipan/nkjp1m", "description": "This is the official dataset for NKJP1M \u2013 the 1-million token subcorpus of the\nNational Corpus of Polish (Narodowy Korpus J\u0119zyka Polskiego)\n\nBesides the text (divided into paragraphs/samples and sentences) the\nset contains lemmas and morpho-syntactic tags for all tokens in the corpus.\n\nThis release corresponds to the version 1.2 of the corpus with\nfollowing corrections and improvements. In particular the\nmorpho-syntactic annotation has been aligned with the present version\nof Morfeusz2 morphological analyser.", "downloads": 14, "configs": {"nkjp1m": {"config_name": "nkjp1m", "sample_row": "{\"nkjp_text\": \"\\\"NKJP_1M_0102000000001\\\"\", \"nkjp_par\": \"\\\"morph_1-p\\\"\", \"nkjp_sent\": \"\\\"morph_1.57-s\\\"\", \"tokens\": \"[\\\"Zatrzasn\\\\u0105\\\\u0142\\\", \\\"drzwi\\\", \\\"od\\\", \\\"mieszkani...\", \"lemmas\": \"[\\\"zatrzasn\\\\u0105\\\\u0107\\\", \\\"drzwi\\\", \\\"od\\\", \\\"mieszkani...\", \"cposes\": \"[11, 6, 9, 6, 10, 7, 6, 11, 6, 10, 11, 6, 10, 2, 1...\", \"poses\": \"[30, 35, 32, 35, 19, 20, 35, 30, 35, 19, 30, 35, 1...\", \"tags\": \"[869, 910, 888, 975, 266, 277, 907, 869, 961, 266,...\", \"nps\": \"[false, false, false, false, true, false, false, f...\", \"nkjp_ids\": \"[\\\"morph_1.1-seg\\\", \\\"morph_1.2-seg\\\", \\\"morph_1.3-seg\\\"...\"}", "columns": ["nkjp_text", "nkjp_par", "nkjp_sent", "tokens", "lemmas", "cposes", "poses", "tags", "nps", "nkjp_ids"], "columns_mapping": {"nkjp_text": "nkjp_text", "nkjp_par": "nkjp_par", "nkjp_sent": "nkjp_sent", "tokens": "tokens", "lemmas": "lemmas", "cposes": "cposes", "poses": 
"poses", "tags": "tags", "nps": "nps", "nkjp_ids": "nkjp_ids"}, "dataset_description": "This is the official dataset for NKJP1M \u2013 the 1-million token subcorpus of the\nNational Corpus of Polish (Narodowy Korpus J\u0119zyka Polskiego)\n\nBesides the text (divided into paragraphs/samples and sentences) the\nset contains lemmas and morpho-syntactic tags for all tokens in the corpus.\n\nThis release corresponds to the version 1.2 of the corpus with\nfollowing corrections and improvements. In particular the\nmorpho-syntactic annotation has been aligned with the present version\nof Morfeusz2 morphological analyser.\n\n", "dataset_name": "ipipan/nkjp1m"}}, "tags": ["task_categories:token-classification", "task_ids:part-of-speech", "task_ids:lemmatization", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl", "National Corpus of Polish", "Narodowy Korpus J\u0119zyka Polskiego"], "is_gated": false}, "masakhane/masakhaner2": {"dataset_name": "masakhane/masakhaner2", "description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten 
languages.\n\nFor more details see https://arxiv.org/abs/2103.11811", "downloads": 1145, "configs": {"bam": {"config_name": "bam", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Nin\\\", \\\"waati\\\", \\\"in\\\", \\\"na\\\", \\\",\\\", \\\"a\\\", \\\"ka\\\", \\\"g\\\\u0...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "bbj": {"config_name": "bbj", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Msa\\\\u02bcny\\\\u0259\\\\u0302\\\", \\\"g\\\\u0254ti\\\\u0301\\\", \\\"cy...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for 
named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "ewe": {"config_name": "ewe", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Le\\\", \\\"kwasi\\\\u0256a\\\", \\\"si\\\", \\\"va\\\", \\\"yi\\\", \\\"me\\\", \\\"la...\", \"ner_tags\": \"[0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- 
Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "fon": {"config_name": "fon", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Dot\\\\u00f3oxw\\\\u00e9\\\", \\\"\\\\u0254\\\\u0301\\\", \\\"\\\\u0256\\\\u00...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "hau": {"config_name": "hau", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Olurukoba\\\", 
\\\"ya\\\", \\\"ce\\\", \\\"hukumar\\\", \\\"ta\\\", \\\"kwasta...\", \"ner_tags\": \"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "ibo": {"config_name": "ibo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u1ecc\\\", \\\"d\\\\u1ecb\\\", \\\"\\\\u1ecdt\\\\u1ee5t\\\\u1ee5\\\", \\\"ihe...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , 
currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "kin": {"config_name": "kin", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Miss\\\", \\\"Nimwiza\\\", \\\"yavuze\\\", \\\"ko\\\", \\\"hari\\\", \\\"imish...\", \"ner_tags\": \"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- 
isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "lug": {"config_name": "lug", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Obude\\\", \\\"bwali\\\", \\\"bunnyogovu\\\", \\\"nnyo\\\", \\\"we\\\", \\\"tw...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "luo": {"config_name": "luo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Nyoriwoni\\\", \\\"ne\\\", \\\"oketo\\\", \\\"apisgi\\\", \\\"maduong'\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": 
"id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "mos": {"config_name": "mos", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"R\\\\u1ebd\\\", \\\"f\\\\u00e3a\\\", \\\"ne\\\", \\\"no\\\", \\\"-\\\", \\\"r\\\\u0269k...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, 
LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "nya": {"config_name": "nya", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ukwati\\\", \\\"ndiye\\\", \\\"adamanga\\\", \\\"pa\\\", \\\"4\\\", \\\"Octobe...\", \"ner_tags\": \"[0, 0, 0, 0, 7, 8, 8, 0, 0, 3, 4, 4, 0, 5, 0, 0]...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", 
"dataset_name": "masakhane/masakhaner2"}, "pcm": {"config_name": "pcm", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Di\\\", \\\"man\\\", \\\"go\\\", \\\"Twitter\\\", \\\"go\\\", \\\"reveal\\\", \\\"hi...\", \"ner_tags\": \"[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "sna": {"config_name": "sna", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Mumatunhu\\\", \\\"mapfumbamwe\\\", \\\"akavhiringwa\\\", \\\"neCy...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African 
languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "swa": {"config_name": "swa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Maafisa\\\", \\\"wa\\\", \\\"serikali\\\", \\\"ya\\\", \\\"Yemen\\\", \\\"wame...\", \"ner_tags\": \"[0, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- 
Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "tsn": {"config_name": "tsn", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"E\\\", \\\"ne\\\", \\\"e\\\", \\\"le\\\", \\\"motlotli\\\", \\\"wa\\\", \\\"dikgang\\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "twi": {"config_name": "twi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Mmom\\\", \\\"obi\\\", \\\"a\\\", \\\"\\\\u0254w\\\\u0254\\\", 
\\\"ahobr\\\\u025b...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "wol": {"config_name": "wol", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u00d1\\\\u00ebwoon\\\", \\\"teewee\\\", \\\"daraap\\\\u00f3o\\\", \\\"n...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] 
, played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "xho": {"config_name": "xho", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Konakala\\\", \\\"izinto\\\", \\\"emsebenzini\\\", \\\"emva\\\", \\\"kok...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe 
train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "yor": {"config_name": "yor", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ba\\\\u0300ba\\\\u0301\\\", \\\"to\\\\u0301\\\", \\\"bi\\\\u0301\\\", \\\"Ba\\\\u...\", \"ner_tags\": \"[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "zul": {"config_name": "zul", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ngesizini\\\", \\\"edlule\\\", \\\"baphelela\\\", \\\"endaweni\\\", \\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 
2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:bm", "language:bbj", "language:ee", "language:fon", "language:ha", "language:ig", "language:rw", "language:lg", "language:luo", "language:mos", "language:ny", "language:pcm", "language:sn", "language:sw", "language:tn", "language:tw", "language:wo", "language:xh", "language:yo", "language:zu", "ner", "masakhaner", "masakhane"], "is_gated": false}, "ipipan/polqa": {"dataset_name": "ipipan/polqa", "description": "PolQA is the first Polish dataset for OpenQA. 
It consists of 7,000 questions, 87,525 manually labeled evidence passages, and a corpus of over 7 million candidate passages.", "downloads": 79, "configs": {"pairs": {"config_name": "pairs", "sample_row": "{\"question_id\": \"1\", \"passage_title\": \"\\\"Alfa\\\"\", \"passage_text\": \"\\\"Alfa (\\\\u1f04\\\\u03bb\\\\u03c6\\\\u03b1, pisana \\\\u0391\\\\u03...\", \"passage_wiki\": \"\\\"Alfa (\\\\u1f04\\\\u03bb\\\\u03c6\\\\u03b1, pisana \\\\u0391\\\\u03...\", \"passage_id\": \"\\\"19291-0\\\"\", \"duplicate\": \"true\", \"question\": \"\\\"Jak nazywa si\\\\u0119 pierwsza litera alfabetu grec...\", \"relevant\": \"true\", \"annotated_by\": \"\\\"Igor\\\"\", \"answers\": \"\\\"['alfa']\\\"\", \"question_formulation\": \"\\\"QUESTION\\\"\", \"question_type\": \"\\\"SINGLE ENTITY\\\"\", \"entity_type\": \"\\\"UNNAMED\\\"\", \"entity_subtype\": \"\\\"-\\\"\", \"split\": \"\\\"train\\\"\", \"passage_source\": \"\\\"zero-shot\\\"\"}", "columns": ["question_id", "passage_title", "passage_text", "passage_wiki", "passage_id", "duplicate", "question", "relevant", "annotated_by", "answers", "question_formulation", "question_type", "entity_type", "entity_subtype", "split", "passage_source"], "columns_mapping": {"question_id": "question_id", "passage_title": "passage_title", "passage_text": "passage_text", "passage_wiki": "passage_wiki", "passage_id": "passage_id", "duplicate": "duplicate", "question": "question", "relevant": "relevant", "annotated_by": "annotated_by", "answers": "answers", "question_formulation": "question_formulation", "question_type": "question_type", "entity_type": "entity_type", "entity_subtype": "entity_subtype", "split": "split", "passage_source": "passage_source"}, "dataset_description": "PolQA is the first Polish dataset for OpenQA. 
It consists of 7,000 questions, 87,525 manually labeled evidence passages, and a corpus of over 7 million candidate passages.\n", "dataset_name": "ipipan/polqa"}, "passages": {"config_name": "passages", "sample_row": "{\"id\": \"\\\"2-0\\\"\", \"title\": \"\\\"AWK\\\"\", \"text\": \"\\\"AWK \\\\u2013 interpretowany j\\\\u0119zyk programowani...\"}", "columns": ["id", "title", "text"], "columns_mapping": {"id": "id", "title": "title", "text": "text"}, "dataset_description": "PolQA is the first Polish dataset for OpenQA. It consists of 7,000 questions, 87,525 manually labeled evidence passages, and a corpus of over 7 million candidate passages.\n", "dataset_name": "ipipan/polqa"}}, "tags": ["task_categories:question-answering", "task_categories:text-retrieval", "task_categories:text2text-generation", "task_ids:open-domain-qa", "task_ids:document-retrieval", "task_ids:abstractive-qa", "annotations_creators:expert-generated", "language:pl"], "is_gated": false}, "orai-nlp/basqueGLUE": {"dataset_name": "orai-nlp/basqueGLUE", "description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.", "downloads": 55, "configs": {"bec": {"config_name": "bec", "sample_row": "{\"text\": \"\\\"Retweeted EH Bildu Bizkaia (@ehbildubizkaia):\\\\\\\\n\\\\...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. 
BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "bhtc": {"config_name": "bhtc", "sample_row": "{\"text\": \"\\\"Diru-Sarrerak Bermatzeko Errenta (DSBE, gaztelera...\", \"label\": \"3\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "coref": {"config_name": "coref", "sample_row": "{\"text\": \"\\\"HIROMU NONAKA Japonian Gobernuan dagoen LDP Alder...\", \"span1_text\": \"\\\"HIROMU NONAKA Japonian Gobernuan dagoen LDP Alder...\", \"span2_text\": \"\\\"oposizioak\\\"\", \"label\": \"0\", \"span1_index\": \"0\", \"span2_index\": \"36\", \"idx\": \"0\"}", "columns": ["text", "span1_text", "span2_text", "label", "span1_index", "span2_index", "idx"], "columns_mapping": {"text": "text", "span1_text": "span1_text", "span2_text": "span2_text", "label": "label", "span1_index": "span1_index", "span2_index": "span2_index", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. 
BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "intent": {"config_name": "intent", "sample_row": "{\"text\": \"\\\"aldatu alarma 7am-tik 7pm-ra , mesedez\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "nerc_id": {"config_name": "nerc_id", "sample_row": "{\"tokens\": \"[\\\"Greba\\\", \\\"orokorrera\\\", \\\"deitu\\\", \\\"du\\\", \\\"EHk\\\", \\\"27r...\", \"tags\": \"[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\", \"idx\": \"0\"}", "columns": ["tokens", "tags", "idx"], "columns_mapping": {"tokens": "tokens", "tags": "tags", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "nerc_od": {"config_name": "nerc_od", "sample_row": "{\"tokens\": \"[\\\"Greba\\\", \\\"orokorrera\\\", \\\"deitu\\\", \\\"du\\\", \\\"EHk\\\", \\\"27r...\", \"tags\": \"[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\", \"idx\": \"0\"}", "columns": ["tokens", "tags", "idx"], "columns_mapping": {"tokens": "tokens", "tags": "tags", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. 
BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "qnli": {"config_name": "qnli", "sample_row": "{\"question\": \"\\\"Orain zer ikertzen ari da?\\\"\", \"sentence\": \"\\\"Hedabide askotan kolaboratu du, gehienak parapsik...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["question", "sentence", "label", "idx"], "columns_mapping": {"question": "question", "sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "slot": {"config_name": "slot", "sample_row": "{\"tokens\": \"[\\\"aldatu\\\", \\\"alarma\\\", \\\"7am-tik\\\", \\\"7pm-ra\\\", \\\",\\\", \\\"me...\", \"tags\": \"[0, 0, 1, 12, 0, 0]\", \"idx\": \"0\"}", "columns": ["tokens", "tags", "idx"], "columns_mapping": {"tokens": "tokens", "tags": "tags", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "vaxx": {"config_name": "vaxx", "sample_row": "{\"text\": \"\\\"\\\\\\\"#COVID19 Oraingo datuak, izurriaren dinamika, t...\", \"label\": \"2\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. 
BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "wic": {"config_name": "wic", "sample_row": "{\"sentence1\": \"\\\"Egun hauetan Atlantako zilarraz galdetu diogun ba...\", \"sentence2\": \"\\\"Lance Armstrong eta Jan Ullrich ziren guztian aho...\", \"word\": \"\\\"itxaropen\\\"\", \"label\": \"0\", \"start1\": \"149\", \"start2\": \"89\", \"end1\": \"159\", \"end2\": \"100\", \"idx\": \"0\"}", "columns": ["sentence1", "sentence2", "word", "label", "start1", "start2", "end1", "end2", "idx"], "columns_mapping": {"sentence1": "sentence1", "sentence2": "sentence2", "word": "word", "label": "label", "start1": "start1", "start2": "start2", "end1": "end1", "end2": "end2", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}}, "tags": ["task_categories:text-classification", "task_categories:token-classification", "task_ids:intent-classification", "task_ids:natural-language-inference", "task_ids:sentiment-classification", "task_ids:topic-classification", "task_ids:named-entity-recognition", "task_ids:coreference-resolution", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:eu"], "is_gated": false}, "neulab/docprompting-conala": {"dataset_name": "neulab/docprompting-conala", "description": "This is the re-split of CoNaLa dataset. For each code snippet in the dev and test set, at least one function is held out from the training set. 
This split aims at testing a code generation model's capacity in generating unseen functions.\nWe further make sure that examples from the same StackOverflow post (same question_id before -) are in the same split.", "downloads": 1014, "configs": {"data": {"config_name": "data", "sample_row": "{\"question_id\": \"\\\"348196-52\\\"\", \"nl\": \"\\\"Create list `instancelist` containing 29 objects ...\", \"cmd\": \"\\\"instancelist = [MyClass() for i in range(29)]\\\"\", \"oracle_man\": \"[\\\"python.library.functions#range\\\"]\", \"canonical_cmd\": \"\\\"VAR_STR = [MyClass() for i in range(29)]\\\"\", \"cmd_name\": \"\\\"conala\\\"\"}", "columns": ["question_id", "nl", "cmd", "oracle_man", "canonical_cmd", "cmd_name"], "columns_mapping": {"question_id": "question_id", "nl": "nl", "cmd": "cmd", "oracle_man": "oracle_man", "canonical_cmd": "canonical_cmd", "cmd_name": "cmd_name"}, "dataset_description": "This is the re-split of CoNaLa dataset. For each code snippet in the dev and test set, at least one function is held out from the training set. This split aims at testing a code generation model's capacity in generating unseen functions.\nWe further make sure that examples from the same StackOverflow post (same question_id before -) are in the same split.", "dataset_name": "neulab/docprompting-conala"}, "docs": {"config_name": "docs", "sample_row": "{\"doc_id\": \"\\\"tensorflow.aggregationmethod\\\"\", \"doc_content\": \"\\\"tf.AggregationMethod View source on GitHub ...\"}", "columns": ["doc_id", "doc_content"], "columns_mapping": {"doc_id": "doc_id", "doc_content": "doc_content"}, "dataset_description": "This is the re-split of CoNaLa dataset. For each code snippet in the dev and test set, at least one function is held out from the training set. 
This split aims at testing a code generation model's capacity in generating unseen functions.\nWe further make sure that examples from the same StackOverflow post (same question_id before -) are in the same split.", "dataset_name": "neulab/docprompting-conala"}}, "tags": ["task_categories:text2text-generation", "multilinguality:monolingual", "source_datasets:original", "language:code", "code-generation", "doc retrieval", "retrieval augmented generation"], "is_gated": false}, "aashsach/multiconer2": {"dataset_name": "aashsach/multiconer2", "description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition", "downloads": 37, "configs": {"bn": {"config_name": "bn", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u09b8\\\\u09cd\\\\u099f\\\\u09c7\\\\u09b6\\\\u09a8\\\\u099f\\\\u09bf...\", \"ner_tags\": \"[0, 41, 42, 42, 0, 0]\", \"ner_macro_tags\": \"[0, 9, 10, 10, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "de": {"config_name": "de", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"das\\\", \\\"geb\\\\u00e4ude\\\", \\\"hatte\\\", \\\"bis\\\", \\\"1984\\\", \\\"e...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0]\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "en": {"config_name": "en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"robert\\\", \\\"gottschalk\\\", \\\"1939\\\", 
\\\"academy\\\", \\\"award...\", \"ner_tags\": \"[39, 40, 0, 63, 64, 0, 0, 0, 0, 35]\", \"ner_macro_tags\": \"[7, 8, 0, 3, 4, 0, 0, 0, 0, 5]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "es": {"config_name": "es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u00e9douard\\\", \\\"herriot\\\", \\\"ou\\\", \\\"la\\\", \\\"r\\\\u00e9pu...\", \"ner_tags\": \"[43, 44, 0, 0, 0, 0, 0, 0]\", \"ner_macro_tags\": \"[7, 8, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "fa": {"config_name": "fa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u06f2\\\\u06f0\\\\u06f1\\\\u06f0\\\", \\\"\\\\u060c\\\", \\\"\\\\u0633\\\\u06...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 25]\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 1]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "fr": {"config_name": "fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"elle\\\", \\\"porte\\\", \\\"le\\\", \\\"nom\\\", \\\"de\\\", \\\"la\\\", \\\"romanc...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0]\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", 
"ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u092f\\\\u0939\\\", \\\"\\\\u091d\\\\u093f\\\\u092f\\\\u093e\\\\u0928\\\",...\", \"ner_tags\": \"[0, 25, 25, 0, 0, 0, 0, 0, 0]\", \"ner_macro_tags\": \"[0, 1, 1, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "it": {"config_name": "it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"paesaggio\\\", \\\"con\\\", \\\"figura\\\", \\\"(\\\", \\\"1865\\\", \\\"1885\\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 22, 22, 22, 0]...\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "pt": {"config_name": "pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"os\\\", \\\"moradores\\\", \\\"decidiram\\\", \\\"ent\\\\u00e3o\\\", \\\"do...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, ...\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", 
"ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "sv": {"config_name": "sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"nils\\\", \\\"perne\\\", \\\"svensk\\\", \\\"komposit\\\\u00f6r\\\", \\\"te...\", \"ner_tags\": \"[7, 8, 0, 0, 0, 0, 0]\", \"ner_macro_tags\": \"[7, 8, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "uk": {"config_name": "uk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u043b\\\\u044c\\\\u0432\\\\u0443\\\\u0432\\\\u0435\\\\u043a\\\", \\\"(\\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 25, 0]\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 1, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "zh": {"config_name": "zh", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u5167\\\", \\\"\\\\u7a46\\\", \\\"\\\\u723e\\\", \\\"\\\\u00b7\\\", \\\"\\\\u54c8\\\",...\", \"ner_tags\": \"[43, 44, 44, 44, 44, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0...\", \"ner_macro_tags\": \"[7, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", 
"dataset_name": "aashsach/multiconer2"}}, "tags": [], "is_gated": false}, "eloukas/edgar-corpus": {"dataset_name": "eloukas/edgar-corpus", "description": "The dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).", "downloads": 397, "configs": {"full": {"config_name": "full", "sample_row": "{\"filename\": \"\\\"92116_1993.txt\\\"\", \"cik\": \"\\\"92116\\\"\", \"year\": \"\\\"1993\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral\\\\nSouthern California Wa...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2 - Properties\\\\nFranchises, Competition, Acq...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nOn October 20, 1993, t...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant's Common Equity and...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nInformation resp...\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nInformation resp...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. 
Exhibits, Financial Statement Schedules ...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1993": {"config_name": "year_1993", "sample_row": "{\"filename\": \"\\\"92116_1993.txt\\\"\", \"cik\": \"\\\"92116\\\"\", \"year\": \"\\\"1993\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral\\\\nSouthern California Wa...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2 - Properties\\\\nFranchises, Competition, Acq...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nOn October 20, 1993, t...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. 
Market for Registrant's Common Equity and...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nInformation resp...\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nInformation resp...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Exhibits, Financial Statement Schedules ...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. 
The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1994": {"config_name": "year_1994", "sample_row": "{\"filename\": \"\\\"814677_1994.txt\\\"\", \"cik\": \"\\\"814677\\\"\", \"year\": \"\\\"1994\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\nA. Introduction\\\\n(i) Background...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nAt December 31, 1994, the Com...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nThe Company is involve...\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR THE COMPANY'S COMMON EQUITY AN...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nSUMMARY OF SELEC...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTAL DAT...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF THE ...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nITEM 12.\\\"\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIPS OF CERTAIN BENEFICIA...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. 
EXHIBITS, FINANCIAL STATEMENT SCHEDULES,...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1995": {"config_name": "year_1995", "sample_row": "{\"filename\": \"\\\"823195_1995.txt\\\"\", \"cik\": \"\\\"823195\\\"\", \"year\": \"\\\"1995\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral Development of Business...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2. Properties\\\\nThe Partnership does not own ...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nIn or about April 1993...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. 
Market for Registrant's Limited Partnersh...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\n(dollars in thou...\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nThe Partnership ...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Exhibits, Financial Statement Schedules,...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. 
The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1996": {"config_name": "year_1996", "sample_row": "{\"filename\": \"\\\"319315_1996.txt\\\"\", \"cik\": \"\\\"319315\\\"\", \"year\": \"\\\"1996\\\"\", \"section_1\": \"\\\"ITEM 1 - Business\\\\nGeneral\\\\nThe response to this ...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2 - Properties\\\\nGeneral\\\\nThe Partnership's i...\", \"section_3\": \"\\\"ITEM 3 - Legal Proceedings\\\\nThere are no material...\", \"section_4\": \"\\\"ITEM 4 - Submission of Matters to a Vote of Secur...\", \"section_5\": \"\\\"ITEM 5 - Market for the Registrant's Common Equit...\", \"section_6\": \"\\\"ITEM 6 - Selected Financial Data\\\\nSelected financ...\", \"section_7\": \"\\\"ITEM 7 - Management's Discussion and Analysis of ...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"ITEM 8 - Financial Statements and Supplementary D...\", \"section_9\": \"\\\"ITEM 9 - Changes in and Disagreements with Accoun...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10 - Directors and Executive Officers of the...\", \"section_11\": \"\\\"ITEM 11 - Executive Compensation\\\\nThe Partnership...\", \"section_12\": \"\\\"ITEM 12 - Security Ownership of Certain Beneficia...\", \"section_13\": \"\\\"ITEM 13 - Certain Relationships and Related Trans...\", \"section_14\": \"\\\"ITEM 14 - Financial Statements, Schedules, Exhibi...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", 
"section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1997": {"config_name": "year_1997", "sample_row": "{\"filename\": \"\\\"820736_1997.txt\\\"\", \"cik\": \"\\\"820736\\\"\", \"year\": \"\\\"1997\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\nBACKGROUND\\\\nOrbital Sciences Co...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nOrbital owns or leases over 1...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nOn October 10, 1996, T...\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY AND...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nThe information ...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. 
CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF THE ...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nThe information ...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDULES ...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. 
The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1998": {"config_name": "year_1998", "sample_row": "{\"filename\": \"\\\"887919_1998.txt\\\"\", \"cik\": \"\\\"887919\\\"\", \"year\": \"\\\"1998\\\"\", \"section_1\": \"\\\"ITEM 1. DESCRIPTION OF BUSINESS\\\\nTHE COMPANY\\\\nPre...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nThe Company owns 115 North Ha...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nThe Banks are respecti...\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY AND...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nThe following ta...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"\\\"\", \"section_11\": \"\\\"\\\"\", \"section_12\": \"\\\"\\\"\", \"section_13\": \"\\\"\\\"\", \"section_14\": \"\\\"ITEM 14. 
EXHIBITS, FINANCIAL STATEMENT SCHEDULES ...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1999": {"config_name": "year_1999", "sample_row": "{\"filename\": \"\\\"854864_1999.txt\\\"\", \"cik\": \"\\\"854864\\\"\", \"year\": \"\\\"1999\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral Description of Partners...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2. Nonoperating Interests in Properties\\\\nAs ...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nThe Partnership is not...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. 
Market Price of and Distributions on the ...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nThe following se...\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Disagreements on Accounting and Financial...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nAs noted in Item...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"\\\"\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. 
The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2000": {"config_name": "year_2000", "sample_row": "{\"filename\": \"\\\"1064728_2000.txt\\\"\", \"cik\": \"\\\"1064728\\\"\", \"year\": \"\\\"2000\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS.\\\\nOVERVIEW\\\\nWe are the world's l...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. PROPERTIES.\\\\nCOAL RESERVES\\\\nWe had an est...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS.\\\\nFrom time to time, we...\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY AND...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA.\\\\nP&L Coal Holdin...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF THE ...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION.\\\\nThe following t...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. 
EXHIBITS, FINANCIAL STATEMENT SCHEDULES,...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2001": {"config_name": "year_2001", "sample_row": "{\"filename\": \"\\\"18072_2001.htm\\\"\", \"cik\": \"\\\"18072\\\"\", \"year\": \"\\\"2001\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral\\\\nCascade Natural Gas Co...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2. Properties\\\\nAt September 30, 2001, Cascad...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nIncorporated herein by...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. 
Market for Registrant's Common Equity and...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nItem 7.\\\"\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements With Account...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nReference is mad...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Exhibits, Financial Statement Schedules,...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. 
The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2002": {"config_name": "year_2002", "sample_row": "{\"filename\": \"\\\"1121980_2002.htm\\\"\", \"cik\": \"\\\"1121980\\\"\", \"year\": \"\\\"2002\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\nOVERVIEW\\\\nHPL Technologies, Inc...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nThe following table sets for ...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nNone.\\\\nITEM 4.\\\"\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR COMMON STOCK AND RELATED STOCK...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nThe selected con...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF THE ...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nThere is incorpo...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. 
EXHIBITS, FINANCIAL STATEMENT SCHEDULES,...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2003": {"config_name": "year_2003", "sample_row": "{\"filename\": \"\\\"1224874_2003.txt\\\"\", \"cik\": \"\\\"1224874\\\"\", \"year\": \"\\\"2003\\\"\", \"section_1\": \"\\\"ITEM 1. Business.\\\\nNot Applicable\\\\nITEM 2.\\\"\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. Properties.\\\\nNot Applicable\\\\nITEM 3.\\\"\", \"section_3\": \"\\\"ITEM 3. Legal Proceedings.\\\\nNone.\\\\nITEM 4.\\\"\", \"section_4\": \"\\\"ITEM 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"ITEM 5. 
Market for Registrant's Common Equity and...\", \"section_6\": \"\\\"ITEM 6. Selected Financial Data.\\\\nNot Applicable\\\\...\", \"section_7\": \"\\\"ITEM 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"ITEM 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"ITEM 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"ITEM 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"ITEM 9A. Controls and Procedures\\\\nNot Applicable\\\\...\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. Directors and Executive Officers of Regi...\", \"section_11\": \"\\\"ITEM 11. Executive Compensation.\\\\nNot Applicable....\", \"section_12\": \"\\\"ITEM 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"ITEM 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"ITEM 14. Principal Accountant Fees and Services N...\", \"section_15\": \"\\\"ITEM 15. Exhibits, Financial Statement Schedules,...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset 
contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2004": {"config_name": "year_2004", "sample_row": "{\"filename\": \"\\\"1287258_2004.htm\\\"\", \"cik\": \"\\\"1287258\\\"\", \"year\": \"\\\"2004\\\"\", \"section_1\": \"\\\"Item 1.\\\\nBusiness\\\\nCompany Overview\\\\nWe are a lea...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2.\\\\nProperties\\\\nWe are headquartered in San ...\", \"section_3\": \"\\\"Item 3.\\\\nLegal Proceedings\\\\nIn November 2000, a f...\", \"section_4\": \"\\\"Item 4.\\\\nSubmission of Matters to a Vote of Secur...\", \"section_5\": \"\\\"Item 5.\\\\nMarket for Registrant\\\\u2019s Common Equi...\", \"section_6\": \"\\\"Item 6.\\\\nSelected Financial Data\\\\nThe selected fi...\", \"section_7\": \"\\\"Item 7.\\\\nManagement\\\\u2019s Discussion and Analysi...\", \"section_7A\": \"\\\"Item 7A.\\\\nQuantitative and Qualitative Disclosure...\", \"section_8\": \"\\\"Item 8.\\\\nFinancial Statements and Supplementary D...\", \"section_9\": \"\\\"Item 9.\\\\nChanges in and Disagreements with Accoun...\", \"section_9A\": \"\\\"Item 9A.\\\\nControl and Procedures\\\\nWe maintain dis...\", \"section_9B\": \"\\\"Item 9B.\\\\nOther Information\\\\nNone.\\\\nPART III\\\\nIte...\", \"section_10\": \"\\\"Item 10.\\\\nDirectors and Executive Officers of the...\", \"section_11\": \"\\\"Item 11.\\\\nExecutive Compensation\\\\nIncorporated by...\", \"section_12\": \"\\\"Item 12.\\\\nSecurity Ownership of Certain Beneficia...\", \"section_13\": \"\\\"Item 13.\\\\nCertain Relationships and Related Trans...\", \"section_14\": \"\\\"Item 14.\\\\nPrincipal Accounting Fees and Services\\\\...\", \"section_15\": 
\"\\\"Item 15.\\\\nExhibits and Financial Statement Schedu...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2005": {"config_name": "year_2005", "sample_row": "{\"filename\": \"\\\"1319633_2005.txt\\\"\", \"cik\": \"\\\"1319633\\\"\", \"year\": \"\\\"2005\\\"\", \"section_1\": \"\\\"Item 1. Business.\\\\nNot applicable. See the Relief...\", \"section_1A\": \"\\\"Item 1A. Risk Factors.\\\\nNot applicable.\\\\nItem 1B....\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments.\\\\nNone.\\\\nItem ...\", \"section_2\": \"\\\"Item 2. Properties.\\\\nNot applicable. See the Reli...\", \"section_3\": \"\\\"Item 3. Legal Proceedings.\\\\nThere were no materia...\", \"section_4\": \"\\\"Item 4. 
Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant's Common Equity, Re...\", \"section_6\": \"\\\"Item 6. Selected Financial Data.\\\\nNot applicable....\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"Item 7A Quantitative and Qualitative Disclosures ...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements With Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures.\\\\nNot applicable...\", \"section_9B\": \"\\\"Item 9B. Other Information.\\\\nNone.\\\\nPART III\\\\nIte...\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation.\\\\nNot applicable....\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accounting Fees and Services.\\\\...\", \"section_15\": \"\\\"Item 15. 
Exhibits and Financial Statement Schedul...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2006": {"config_name": "year_2006", "sample_row": "{\"filename\": \"\\\"1351893_2006.txt\\\"\", \"cik\": \"\\\"1351893\\\"\", \"year\": \"\\\"2006\\\"\", \"section_1\": \"\\\"\\\"\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"\\\"\", \"section_3\": \"\\\"\\\"\", \"section_4\": \"\\\"\\\"\", \"section_5\": \"\\\"\\\"\", \"section_6\": \"\\\"\\\"\", \"section_7\": \"\\\"\\\"\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"\\\"\", \"section_9\": \"\\\"\\\"\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"ITEM 9B. 
OTHER INFORMATION.\\\\nNone.\\\\nPART IV\\\\nITEM...\", \"section_10\": \"\\\"\\\"\", \"section_11\": \"\\\"\\\"\", \"section_12\": \"\\\"\\\"\", \"section_13\": \"\\\"\\\"\", \"section_14\": \"\\\"\\\"\", \"section_15\": \"\\\"ITEM 15. EXHIBITS AND FINANCIAL STATEMENT SCHEDUL...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2007": {"config_name": "year_2007", "sample_row": "{\"filename\": \"\\\"1178336_2007.htm\\\"\", \"cik\": \"\\\"1178336\\\"\", \"year\": \"\\\"2007\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nForward-Looking Statements\\\\nThi...\", \"section_1A\": \"\\\"ITEM 1A. RISK FACTORS\\\\nRisks Related to Our Busin...\", \"section_1B\": \"\\\"Item 1B. 
Unresolved Staff Comments\\\\nNone.\\\\nItem 2...\", \"section_2\": \"\\\"Item 2. Properties\\\\nWe own approximately 2.2 acre...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nFrom time to time, we ...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nThe following se...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Qualitative and Quantitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures\\\\nEvaluation of D...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNone.\\\\nPART III\\\\nItem...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nReference is mad...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accountant Fees and Services\\\\n...\", \"section_15\": \"\\\"Item 15. 
Exhibits and Financial Statements Schedu...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2008": {"config_name": "year_2008", "sample_row": "{\"filename\": \"\\\"3906_2008.htm\\\"\", \"cik\": \"\\\"3906\\\"\", \"year\": \"\\\"2008\\\"\", \"section_1\": \"\\\"Item 1. 
Business.\\\\nGeneral\\\\nWe are a business dev...\", \"section_1A\": \"\\\"Item 1A.\\\\nRisk Factors.\\\\nInvesting in Allied Capi...\", \"section_1B\": \"\\\"Item 1B.\\\\nUnresolved Staff Comments\\\\nNot applicab...\", \"section_2\": \"\\\"Item 2.\\\\nProperties.\\\\nOur principal offices are l...\", \"section_3\": \"\\\"Item 3.\\\\nLegal Proceedings.\\\\nOn June 23, 2004, we...\", \"section_4\": \"\\\"Item 4.\\\\nSubmission of Matters to a Vote of Secur...\", \"section_5\": \"\\\"Item 5.\\\\nMarket For Registrant\\\\u2019s Common Equi...\", \"section_6\": \"\\\"Item 6. Selected Financial Data.\\\\nSELECTED CONDEN...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosure ...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9.\\\\nChanges in and Disagreements with Accoun...\", \"section_9A\": \"\\\"Item 9A.\\\\nControls and Procedures.\\\\n(a) Evaluatio...\", \"section_9B\": \"\\\"Item 9B.\\\\nOther Information.\\\\nOn February 26, 200...\", \"section_10\": \"\\\"Item 10.\\\\nDirectors, Executive Officers and Corpo...\", \"section_11\": \"\\\"Item 11.\\\\nExecutive Compensation.\\\\nInformation in...\", \"section_12\": \"\\\"Item 12.\\\\nSecurity Ownership of Certain Beneficia...\", \"section_13\": \"\\\"Item 13.\\\\nCertain Relationships and Related Trans...\", \"section_14\": \"\\\"Item 14.\\\\nPrincipal Accountant Fees and Services....\", \"section_15\": \"\\\"Item 15.\\\\nExhibits and Financial Statement Schedu...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", 
"section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2009": {"config_name": "year_2009", "sample_row": "{\"filename\": \"\\\"907654_2009.htm\\\"\", \"cik\": \"\\\"907654\\\"\", \"year\": \"\\\"2009\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nSome of the statements under \\\\u...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nRisks Related to Our Busin...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNot applicabl...\", \"section_2\": \"\\\"Item 2. Properties\\\\nThe Company\\\\u2019s headquarte...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nOn February 9, 2007, N...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nNot applicable.\\\\...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. 
Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"Item 9A(T). Controls and Procedures\\\\nEvaluation o...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNone.\\\\nPART III\\\\nItem...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nThe response to ...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accountant Fees and Services\\\\n...\", \"section_15\": \"\\\"Item 15. Exhibits and Financial Statement Schedul...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. 
The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2010": {"config_name": "year_2010", "sample_row": "{\"filename\": \"\\\"1164888_2010.htm\\\"\", \"cik\": \"\\\"1164888\\\"\", \"year\": \"\\\"2010\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\n(A) BUSINESS DEVELOPMENT\\\\nKyto ...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. DESCRIPTION OF PROPERTY\\\\nThe Company occu...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nThere is no litigation...\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR COMMON EQUITY AND RELATED STOC...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nEarnings per sha...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"ITEM 9A. CONTROLS AND PROCEDURES\\\\nDisclosure Cont...\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS, PROMOT...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\n(A) SUMMARY COMP...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. PRINCIPAL ACCOUNTANT FEES AND SERVICES\\\\n...\", \"section_15\": \"\\\"ITEM 15. 
EXHIBITS AND REPORTS ON FORM 8-K\\\\n(A) LI...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2011": {"config_name": "year_2011", "sample_row": "{\"filename\": \"\\\"1297341_2011.htm\\\"\", \"cik\": \"\\\"1297341\\\"\", \"year\": \"\\\"2011\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nForward-looking Statements\\\\nThi...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nWe may not be able to achi...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNone.\\\\nItem 2...\", \"section_2\": \"\\\"Item 2. Properties\\\\nWe conduct our business throu...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nFrom time to time, we ...\", \"section_4\": \"\\\"Item 4. 
[Removed and reserved]\\\\nNot applicable.\\\\n...\", \"section_5\": \"\\\"Item 5. Market for the Registrant\\\\u2019s Common E...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nThis item is not...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements With Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures\\\\nManagement is r...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNot Applicable.\\\\nPART...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers, and Corpo...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nThe information ...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accountant Fees and Services\\\\n...\", \"section_15\": \"\\\"Item 15. 
Exhibits and Financial Statement Schedul...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. 
The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2012": {"config_name": "year_2012", "sample_row": "{\"filename\": \"\\\"1121901_2012.htm\\\"\", \"cik\": \"\\\"1121901\\\"\", \"year\": \"\\\"2012\\\"\", \"section_1\": \"\\\"Item 1.\\\\nBusiness.\\\\nCompany Overview\\\\nInternation...\", \"section_1A\": \"\\\"Item 1A.\\\\nRisk Factors.\\\\nNot applicable.\\\\nItem 1B...\", \"section_1B\": \"\\\"Item 1B.\\\\nUnresolved Staff Comments.\\\\nNone.\\\\nItem...\", \"section_2\": \"\\\"Item 2.\\\\nProperties.\\\\nOur corporate office is loc...\", \"section_3\": \"\\\"Item 3.\\\\nLegal Proceedings.\\\\nWe are aware of the ...\", \"section_4\": \"\\\"Item 4.\\\\n(Removed and Reserved).\\\\nNot applicable....\", \"section_5\": \"\\\"Item 5.\\\\nMarket for Registrant\\\\u2019s Common Equi...\", \"section_6\": \"\\\"Item 6.\\\\nSelected Financial Data.\\\\nNot applicable...\", \"section_7\": \"\\\"Item 7.\\\\nManagement\\\\u2019s Discussion and Analysi...\", \"section_7A\": \"\\\"Item 7A.\\\\nQuantitative and Qualitative Disclosure...\", \"section_8\": \"\\\"Item 8.\\\\nFinancial Statements and Supplementary D...\", \"section_9\": \"\\\"Item 9.\\\\nChanges in and Disagreements with Accoun...\", \"section_9A\": \"\\\"Item 9A.\\\\nControls and Procedures.\\\\nSee Item 9A(T...\", \"section_9B\": \"\\\"Item 9B.\\\\nOther Information.\\\\nNone.\\\\nPART III\\\\nIt...\", \"section_10\": \"\\\"Item 10.\\\\nDirectors, Executive Officers and Corpo...\", \"section_11\": \"\\\"Item 11.\\\\nExecutive Compensation.\\\\nSummary of Cas...\", \"section_12\": \"\\\"Item 12.\\\\nSecurity Ownership of Certain Beneficia...\", \"section_13\": \"\\\"Item 13.\\\\nCertain Relationships and Related Trans...\", \"section_14\": \"\\\"Item 14.\\\\nPrincipal Accounting Fees 
and Services....\", \"section_15\": \"\\\"Item 15.\\\\nExhibits, Financial Statement Schedules...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2013": {"config_name": "year_2013", "sample_row": "{\"filename\": \"\\\"875657_2013.htm\\\"\", \"cik\": \"\\\"875657\\\"\", \"year\": \"\\\"2013\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\nGeneral\\\\nWe offer products and ...\", \"section_1A\": \"\\\"ITEM 1A. RISK FACTORS\\\\nOur business faces many ri...\", \"section_1B\": \"\\\"ITEM 1B. UNRESOLVED STAFF COMMENTS\\\\nNone.\\\\nITEM 2...\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nAs of December 31, 2013, we o...\", \"section_3\": \"\\\"ITEM 3. 
LEGAL PROCEEDINGS\\\\nWe are subject to lega...\", \"section_4\": \"\\\"ITEM 4. MINE SAFETY DISCLOSURES\\\\nNot applicable.\\\\...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT\\\\u2019S COMMON EQUIT...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nAs a smaller rep...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT\\\\u2019S DISCUSSION AND ANALYSIS...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"ITEM 9A. CONTROLS AND PROCEDURES\\\\nEvaluation Of D...\", \"section_9B\": \"\\\"ITEM 9B. OTHER INFORMATION\\\\nNone.\\\\nPART III\\\\nThe ...\", \"section_10\": \"\\\"ITEM 10. DIRECTORS, EXECUTIVE OFFICERS AND CORPOR...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nThe sections ent...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. PRINCIPAL ACCOUNTANT FEES AND SERVICES\\\\n...\", \"section_15\": \"\\\"ITEM 15. 
EXHIBITS, FINANCIAL STATEMENT SCHEDULES\\\\...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2014": {"config_name": "year_2014", "sample_row": "{\"filename\": \"\\\"884887_2014.htm\\\"\", \"cik\": \"\\\"884887\\\"\", \"year\": \"\\\"2014\\\"\", \"section_1\": \"\\\"Item 1. Business.\\\\nGeneral\\\\nWe are the world\\\\u201...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nThe risk factors set forth...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNone.\\\\nItem 2...\", \"section_2\": \"\\\"Item 2. Properties\\\\nInformation about our cruise ...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nA class action complai...\", \"section_4\": \"\\\"Item 4. 
Mine Safety Disclosures\\\\nNone.\\\\nPART II\\\\n...\", \"section_5\": \"\\\"Item 5. Market for Registrant's Common Equity, Re...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nThe selected con...\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes In and Disagreements With Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures\\\\nEvaluation of D...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNone.\\\\nPART III\\\\nItem...\", \"section_10\": \"\\\"\\\"\", \"section_11\": \"\\\"\\\"\", \"section_12\": \"\\\"\\\"\", \"section_13\": \"\\\"\\\"\", \"section_14\": \"\\\"\\\"\", \"section_15\": \"\\\"Item 15. Exhibits and Financial Statement Schedul...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. 
The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2015": {"config_name": "year_2015", "sample_row": "{\"filename\": \"\\\"874841_2015.htm\\\"\", \"cik\": \"\\\"874841\\\"\", \"year\": \"\\\"2015\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\nGeneral\\\\nPacific Sunwear of Cal...\", \"section_1A\": \"\\\"ITEM 1A. RISK FACTORS\\\\nCautionary Note Regarding ...\", \"section_1B\": \"\\\"ITEM 1B. UNRESOLVED STAFF COMMENTS\\\\nNone.\\\\nITEM 2...\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nWe operate stores in each of ...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nCharles Pfeiffer, indi...\", \"section_4\": \"\\\"ITEM 4. MINE SAFETY DISCLOSURES\\\\nNot applicable.\\\\...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT\\\\u2019S COMMON EQUIT...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nThe following ta...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT\\\\u2019S DISCUSSION AND ANALYSIS...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"ITEM 9A. CONTROLS AND PROCEDURES\\\\nConclusion Rega...\", \"section_9B\": \"\\\"ITEM 9B. OTHER INFORMATION\\\\nNone.\\\\nPART III\\\\nITEM...\", \"section_10\": \"\\\"ITEM 10. DIRECTORS, EXECUTIVE OFFICERS AND CORPOR...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nInformation with...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. PRINCIPAL ACCOUNTING FEES AND SERVICES\\\\n...\", \"section_15\": \"\\\"ITEM 15. 
EXHIBITS AND FINANCIAL STATEMENT SCHEDUL...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2016": {"config_name": "year_2016", "sample_row": "{\"filename\": \"\\\"1306035_2016.htm\\\"\", \"cik\": \"\\\"1306035\\\"\", \"year\": \"\\\"2016\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nBackground Information\\\\nThe Com...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nBefore you invest in our c...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNone.\\\\nItem 2...\", \"section_2\": \"\\\"Item 2. Properties\\\\nThe Company currently maintai...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nThe Company is not a p...\", \"section_4\": \"\\\"Item 4. 
Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY, RE...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nNot Applicable.\\\\...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITIATIVE DISCLOSURE...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"ITEM 9A. CONTROLS AND PROCEDURES\\\\nEvaluation of D...\", \"section_9B\": \"\\\"ITEM 9B. OTHER INFORMATION\\\\nNone.\\\\nREPORT OF INDE...\", \"section_10\": \"\\\"ITEM 10.01 DEPARTURE OF DIRECTORS OR PRINCIPAL OF...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nNo annual and lo...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED PARTY ...\", \"section_14\": \"\\\"ITEM 14. PRINCIPAL ACCOUNTING FEES AND SERVICES\\\\n...\", \"section_15\": \"\\\"ITEM 15. 
EXHIBITS, FINANCIAL STATEMENT SCHEDULES\\\\...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2017": {"config_name": "year_2017", "sample_row": "{\"filename\": \"\\\"1595248_2017.htm\\\"\", \"cik\": \"\\\"1595248\\\"\", \"year\": \"\\\"2017\\\"\", \"section_1\": \"\\\"Item 1. Business.\\\\nOverview\\\\nGenprex\\\\u2122 is a c...\", \"section_1A\": \"\\\"Item 1A. Risk Factors.\\\\nInvesting in our common s...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments.\\\\nNone.\\\\nItem ...\", \"section_2\": \"\\\"Item 2. Properties.\\\\nOur corporate and executive ...\", \"section_3\": \"\\\"Item 3. Legal Proceedings.\\\\nWe are not subject to...\", \"section_4\": \"\\\"Item 4. 
Mine Safety Disclosures.\\\\nNone.\\\\nPART II\\\\...\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data.\\\\nThe following s...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements With Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures.\\\\nEvaluation of ...\", \"section_9B\": \"\\\"Item 9B. Other Information.\\\\nNone.\\\\nPART III\\\\nIte...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation.\\\\nOur named execu...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accounting Fees and Services.\\\\...\", \"section_15\": \"\\\"Item 15. 
Exhibits, Financial Statement Schedules....\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2018": {"config_name": "year_2018", "sample_row": "{\"filename\": \"\\\"1566373_2018.htm\\\"\", \"cik\": \"\\\"1566373\\\"\", \"year\": \"\\\"2018\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nOverview\\\\nWe are a clinical-sta...\", \"section_1A\": \"\\\"Item 1A. Risk Factors.\\\\nInvesting in our common s...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments.\\\\nNot applicab...\", \"section_2\": \"\\\"Item 2. Properties.\\\\nOur current operations are b...\", \"section_3\": \"\\\"Item 3. Legal Proceedings.\\\\nFrom time to time, we...\", \"section_4\": \"\\\"Item 4. 
Mine Safety Disclosures.\\\\nNot applicable....\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data.\\\\nThe following s...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements With Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures.\\\\nEvaluation of ...\", \"section_9B\": \"\\\"Item 9B. Other Information.\\\\nNone.\\\\nPART III\\\\nIte...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation.\\\\nThe response to...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accounting Fees and Services.\\\\...\", \"section_15\": \"\\\"Item 15. 
Exhibits, Financial Statement Schedules....\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2019": {"config_name": "year_2019", "sample_row": "{\"filename\": \"\\\"88121_2019.htm\\\"\", \"cik\": \"\\\"88121\\\"\", \"year\": \"\\\"2019\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral Development of Business...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nSeaboard has identified im...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNone.\\\\nItem 2...\", \"section_2\": \"\\\"Item 2. Properties\\\\nManagement believes that Seab...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nThe information requir...\", \"section_4\": \"\\\"Item 4. 
Mine Safety Disclosures\\\\nNot Applicable.\\\\...\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\n(a)Total assets ...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures\\\\nAs of December ...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNone.\\\\nPART III\\\\nItem...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nThe information ...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accounting Fees and Services\\\\n...\", \"section_15\": \"\\\"Item 15. 
Exhibits, Financial Statement Schedules\\\\...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2020": {"config_name": "year_2020", "sample_row": "{\"filename\": \"\\\"718413_2020.htm\\\"\", \"cik\": \"\\\"718413\\\"\", \"year\": \"\\\"2020\\\"\", \"section_1\": \"\\\"Item 1. The Business\\\\nOrganization and Operation\\\\...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nBefore deciding to invest ...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNot Applicabl...\", \"section_2\": \"\\\"Item 2. Properties\\\\nAlthough the Company does not...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nThere are no pending l...\", \"section_4\": \"\\\"Item 4. 
Mine Safety Disclosures\\\\nNot Applicable\\\\n...\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nOmitted, in acco...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures\\\\nDisclosure Cont...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNone\\\\nPART III.\\\\nItem...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nThe following is...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accounting Fees and Services\\\\n...\", \"section_15\": \"\\\"Item 15. 
Exhibits and Financial Statement Schedul...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}}, "tags": ["task_categories:other", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:extended|other", "language:en", "research papers", "edgar", "sec", "finance", "financial", "filings", "10K", "10-K", "nlp", "research", "econlp", "economics", "business"], "is_gated": false}, "tasksource/babi_nli": {"dataset_name": "tasksource/babi_nli", "description": "bAbi tasks recasted as natural language inference.", "downloads": 139, "configs": {"single-supporting-fact": {"config_name": "single-supporting-fact", "sample_row": "{\"premise\": \"\\\"John travelled to the bathroom. 
Sandra moved to t...\", \"hypothesis\": \"\\\"Sandra is in the hallway.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "two-supporting-facts": {"config_name": "two-supporting-facts", "sample_row": "{\"premise\": \"\\\"Mary picked up the apple there. John took the mil...\", \"hypothesis\": \"\\\"The apple is in the garden.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "three-supporting-facts": {"config_name": "three-supporting-facts", "sample_row": "{\"premise\": \"\\\"Mary grabbed the apple. Mary discarded the apple....\", \"hypothesis\": \"\\\"The football before the bedroom was in the hallwa...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "two-arg-relations": {"config_name": "two-arg-relations", "sample_row": "{\"premise\": \"\\\"The kitchen is east of the office. 
The kitchen is...\", \"hypothesis\": \"\\\"The office west of is kitchen.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "three-arg-relations": {"config_name": "three-arg-relations", "sample_row": "{\"premise\": \"\\\"Bill picked up the milk there. Bill dropped the m...\", \"hypothesis\": \"\\\"Bill received the football.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "yes-no-questions": {"config_name": "yes-no-questions", "sample_row": "{\"premise\": \"\\\"Mary moved to the bathroom. Sandra journeyed to t...\", \"hypothesis\": \"\\\"Sandra is in the hallway.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "counting": {"config_name": "counting", "sample_row": "{\"premise\": \"\\\"Daniel moved to the bathroom. 
John moved to the k...\", \"hypothesis\": \"\\\"There is one objects is Sandra carrying.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "lists-sets": {"config_name": "lists-sets", "sample_row": "{\"premise\": \"\\\"Daniel grabbed the apple there. Daniel travelled ...\", \"hypothesis\": \"\\\"Daniel is carrying apple.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "simple-negation": {"config_name": "simple-negation", "sample_row": "{\"premise\": \"\\\"Mary is no longer in the bedroom. Daniel moved to...\", \"hypothesis\": \"\\\"Mary is in the bedroom.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "indefinite-knowledge": {"config_name": "indefinite-knowledge", "sample_row": "{\"premise\": \"\\\"Fred is either in the school or the park. 
Mary we...\", \"hypothesis\": \"\\\"Mary is in the office.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "basic-coreference": {"config_name": "basic-coreference", "sample_row": "{\"premise\": \"\\\"Mary travelled to the bedroom. Afterwards she jou...\", \"hypothesis\": \"\\\"Mary is in the bathroom.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "conjunction": {"config_name": "conjunction", "sample_row": "{\"premise\": \"\\\"Mary and John moved to the bedroom. Daniel and Jo...\", \"hypothesis\": \"\\\"John is in the garden.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "compound-coreference": {"config_name": "compound-coreference", "sample_row": "{\"premise\": \"\\\"John and Daniel went to the office. 
After that th...\", \"hypothesis\": \"\\\"John is in the kitchen.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "time-reasoning": {"config_name": "time-reasoning", "sample_row": "{\"premise\": \"\\\"Mary journeyed to the kitchen this morning. Mary ...\", \"hypothesis\": \"\\\"Mary before the kitchen was in the school.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "basic-deduction": {"config_name": "basic-deduction", "sample_row": "{\"premise\": \"\\\"Sheep are afraid of cats. Mice are afraid of cats...\", \"hypothesis\": \"\\\"Emily is afraid of cat.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "basic-induction": {"config_name": "basic-induction", "sample_row": "{\"premise\": \"\\\"Lily is a rhino. Lily is white. 
Bernhard is a swa...\", \"hypothesis\": \"\\\"Greg is yellow.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "positional-reasoning": {"config_name": "positional-reasoning", "sample_row": "{\"premise\": \"\\\"The triangle is above the pink rectangle. The blu...\", \"hypothesis\": \"\\\"The pink rectangle is to the right of the blue sq...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "size-reasoning": {"config_name": "size-reasoning", "sample_row": "{\"premise\": \"\\\"The box of chocolates fits inside the chest. The ...\", \"hypothesis\": \"\\\"The box fit in the box of chocolates.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "path-finding": {"config_name": "path-finding", "sample_row": "{\"premise\": \"\\\"The garden is west of the office. 
The bedroom is ...\", \"hypothesis\": \"\\\"You go from the kitchen to the garden by heading ...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "agents-motivations": {"config_name": "agents-motivations", "sample_row": "{\"premise\": \"\\\"Sumit is bored. Jason is bored. Yann is thirsty. ...\", \"hypothesis\": \"\\\"Sumit went to the garden because she was bored.\\\"...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "logical reasoning", "nli", "natural-language-inference", "reasoning", "logic"], "is_gated": false}, "JanosAudran/financial-reports-sec": {"dataset_name": "JanosAudran/financial-reports-sec", "description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.", "downloads": 227, "configs": {"large_lite": {"config_name": "large_lite", "sample_row": "{\"cik\": \"\\\"0000001750\\\"\", \"sentence\": \"\\\"ITEM 1.BUSINESS General AAR CORP. 
and its subsidi...\", \"section\": \"0\", \"labels.1d\": \"0\", \"labels.5d\": \"1\", \"labels.30d\": \"0\", \"filingDate\": \"\\\"2020-07-21\\\"\", \"docID\": \"\\\"0000001750_10-K_2020\\\"\", \"sentenceID\": \"\\\"0000001750_10-K_2020_section_1_0\\\"\", \"sentenceCount\": \"1\"}", "columns": ["cik", "sentence", "section", "labels_1d", "labels_5d", "labels_30d", "filingDate", "docID", "sentenceID", "sentenceCount"], "columns_mapping": {"cik": "cik", "sentence": "sentence", "section": "section", "labels.1d": "labels_1d", "labels.5d": "labels_5d", "labels.30d": "labels_30d", "filingDate": "filingDate", "docID": "docID", "sentenceID": "sentenceID", "sentenceCount": "sentenceCount"}, "dataset_description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.\n", "dataset_name": "JanosAudran/financial-reports-sec"}, "large_full": {"config_name": "large_full", "sample_row": "{\"cik\": \"\\\"0000001750\\\"\", \"sentence\": \"\\\"ITEM 1.BUSINESS General AAR CORP. 
and its subsidi...\", \"section\": \"0\", \"labels.1d\": \"0\", \"labels.5d\": \"1\", \"labels.30d\": \"0\", \"filingDate\": \"\\\"2020-07-21\\\"\", \"name\": \"\\\"AAR CORP\\\"\", \"docID\": \"\\\"0000001750_10-K_2020\\\"\", \"sentenceID\": \"\\\"0000001750_10-K_2020_section_1_0\\\"\", \"sentenceCount\": \"1\", \"tickers\": \"[\\\"AIR\\\"]\", \"exchanges\": \"[\\\"NYSE\\\"]\", \"entityType\": \"\\\"operating\\\"\", \"sic\": \"\\\"3720\\\"\", \"stateOfIncorporation\": \"\\\"DE\\\"\", \"tickerCount\": \"1\", \"acceptanceDateTime\": \"\\\"2020-07-21T17:19:15.000Z\\\"\", \"form\": \"\\\"10-K\\\"\", \"reportDate\": \"\\\"2020-05-31\\\"\", \"returns.1d.closePriceEndDate\": \"19.0100002289\", \"returns.1d.closePriceStartDate\": \"18.1900005341\", \"returns.1d.endDate\": \"\\\"2020-07-22T00:00:00-04:00\\\"\", \"returns.1d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.1d.ret\": \"0.045079696\", \"returns.5d.closePriceEndDate\": \"17.7199993134\", \"returns.5d.closePriceStartDate\": \"18.1900005341\", \"returns.5d.endDate\": \"\\\"2020-07-27T00:00:00-04:00\\\"\", \"returns.5d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.5d.ret\": \"-0.0258384391\", \"returns.30d.closePriceEndDate\": \"19.25\", \"returns.30d.closePriceStartDate\": \"18.1900005341\", \"returns.30d.endDate\": \"\\\"2020-08-20T00:00:00-04:00\\\"\", \"returns.30d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.30d.ret\": \"0.0582737457\"}", "columns": ["cik", "sentence", "section", "labels_1d", "labels_5d", "labels_30d", "filingDate", "dataset_name", "docID", "sentenceID", "sentenceCount", "tickers", "exchanges", "entityType", "sic", "stateOfIncorporation", "tickerCount", "acceptanceDateTime", "form", "reportDate", "returns_1d_closePriceEndDate", "returns_1d_closePriceStartDate", "returns_1d_endDate", "returns_1d_startDate", "returns_1d_ret", "returns_5d_closePriceEndDate", "returns_5d_closePriceStartDate", "returns_5d_endDate", "returns_5d_startDate", 
"returns_5d_ret", "returns_30d_closePriceEndDate", "returns_30d_closePriceStartDate", "returns_30d_endDate", "returns_30d_startDate", "returns_30d_ret"], "columns_mapping": {"cik": "cik", "sentence": "sentence", "section": "section", "labels.1d": "labels_1d", "labels.5d": "labels_5d", "labels.30d": "labels_30d", "filingDate": "filingDate", "dataset_name": "dataset_name", "docID": "docID", "sentenceID": "sentenceID", "sentenceCount": "sentenceCount", "tickers": "tickers", "exchanges": "exchanges", "entityType": "entityType", "sic": "sic", "stateOfIncorporation": "stateOfIncorporation", "tickerCount": "tickerCount", "acceptanceDateTime": "acceptanceDateTime", "form": "form", "reportDate": "reportDate", "returns.1d.closePriceEndDate": "returns_1d_closePriceEndDate", "returns.1d.closePriceStartDate": "returns_1d_closePriceStartDate", "returns.1d.endDate": "returns_1d_endDate", "returns.1d.startDate": "returns_1d_startDate", "returns.1d.ret": "returns_1d_ret", "returns.5d.closePriceEndDate": "returns_5d_closePriceEndDate", "returns.5d.closePriceStartDate": "returns_5d_closePriceStartDate", "returns.5d.endDate": "returns_5d_endDate", "returns.5d.startDate": "returns_5d_startDate", "returns.5d.ret": "returns_5d_ret", "returns.30d.closePriceEndDate": "returns_30d_closePriceEndDate", "returns.30d.closePriceStartDate": "returns_30d_closePriceStartDate", "returns.30d.endDate": "returns_30d_endDate", "returns.30d.startDate": "returns_30d_startDate", "returns.30d.ret": "returns_30d_ret"}, "dataset_description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. 
Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.\n", "dataset_name": "JanosAudran/financial-reports-sec"}, "small_lite": {"config_name": "small_lite", "sample_row": "{\"cik\": \"\\\"0000001750\\\"\", \"sentence\": \"\\\"ITEM 1.BUSINESS General AAR CORP. and its subsidi...\", \"section\": \"0\", \"labels.1d\": \"0\", \"labels.5d\": \"1\", \"labels.30d\": \"0\", \"filingDate\": \"\\\"2020-07-21\\\"\", \"docID\": \"\\\"0000001750_10-K_2020\\\"\", \"sentenceID\": \"\\\"0000001750_10-K_2020_section_1_0\\\"\", \"sentenceCount\": \"1\"}", "columns": ["cik", "sentence", "section", "labels_1d", "labels_5d", "labels_30d", "filingDate", "docID", "sentenceID", "sentenceCount"], "columns_mapping": {"cik": "cik", "sentence": "sentence", "section": "section", "labels.1d": "labels_1d", "labels.5d": "labels_5d", "labels.30d": "labels_30d", "filingDate": "filingDate", "docID": "docID", "sentenceID": "sentenceID", "sentenceCount": "sentenceCount"}, "dataset_description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.\n", "dataset_name": "JanosAudran/financial-reports-sec"}, "small_full": {"config_name": "small_full", "sample_row": "{\"cik\": \"\\\"0000001750\\\"\", \"sentence\": \"\\\"ITEM 1.BUSINESS General AAR CORP. 
and its subsidi...\", \"section\": \"0\", \"labels.1d\": \"0\", \"labels.5d\": \"1\", \"labels.30d\": \"0\", \"filingDate\": \"\\\"2020-07-21\\\"\", \"name\": \"\\\"AAR CORP\\\"\", \"docID\": \"\\\"0000001750_10-K_2020\\\"\", \"sentenceID\": \"\\\"0000001750_10-K_2020_section_1_0\\\"\", \"sentenceCount\": \"1\", \"tickers\": \"[\\\"AIR\\\"]\", \"exchanges\": \"[\\\"NYSE\\\"]\", \"entityType\": \"\\\"operating\\\"\", \"sic\": \"\\\"3720\\\"\", \"stateOfIncorporation\": \"\\\"DE\\\"\", \"tickerCount\": \"1\", \"acceptanceDateTime\": \"\\\"2020-07-21T17:19:15.000Z\\\"\", \"form\": \"\\\"10-K\\\"\", \"reportDate\": \"\\\"2020-05-31\\\"\", \"returns.1d.closePriceEndDate\": \"19.0100002289\", \"returns.1d.closePriceStartDate\": \"18.1900005341\", \"returns.1d.endDate\": \"\\\"2020-07-22T00:00:00-04:00\\\"\", \"returns.1d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.1d.ret\": \"0.045079696\", \"returns.5d.closePriceEndDate\": \"17.7199993134\", \"returns.5d.closePriceStartDate\": \"18.1900005341\", \"returns.5d.endDate\": \"\\\"2020-07-27T00:00:00-04:00\\\"\", \"returns.5d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.5d.ret\": \"-0.0258384391\", \"returns.30d.closePriceEndDate\": \"19.25\", \"returns.30d.closePriceStartDate\": \"18.1900005341\", \"returns.30d.endDate\": \"\\\"2020-08-20T00:00:00-04:00\\\"\", \"returns.30d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.30d.ret\": \"0.0582737457\"}", "columns": ["cik", "sentence", "section", "labels_1d", "labels_5d", "labels_30d", "filingDate", "dataset_name", "docID", "sentenceID", "sentenceCount", "tickers", "exchanges", "entityType", "sic", "stateOfIncorporation", "tickerCount", "acceptanceDateTime", "form", "reportDate", "returns_1d_closePriceEndDate", "returns_1d_closePriceStartDate", "returns_1d_endDate", "returns_1d_startDate", "returns_1d_ret", "returns_5d_closePriceEndDate", "returns_5d_closePriceStartDate", "returns_5d_endDate", "returns_5d_startDate", 
"returns_5d_ret", "returns_30d_closePriceEndDate", "returns_30d_closePriceStartDate", "returns_30d_endDate", "returns_30d_startDate", "returns_30d_ret"], "columns_mapping": {"cik": "cik", "sentence": "sentence", "section": "section", "labels.1d": "labels_1d", "labels.5d": "labels_5d", "labels.30d": "labels_30d", "filingDate": "filingDate", "dataset_name": "dataset_name", "docID": "docID", "sentenceID": "sentenceID", "sentenceCount": "sentenceCount", "tickers": "tickers", "exchanges": "exchanges", "entityType": "entityType", "sic": "sic", "stateOfIncorporation": "stateOfIncorporation", "tickerCount": "tickerCount", "acceptanceDateTime": "acceptanceDateTime", "form": "form", "reportDate": "reportDate", "returns.1d.closePriceEndDate": "returns_1d_closePriceEndDate", "returns.1d.closePriceStartDate": "returns_1d_closePriceStartDate", "returns.1d.endDate": "returns_1d_endDate", "returns.1d.startDate": "returns_1d_startDate", "returns.1d.ret": "returns_1d_ret", "returns.5d.closePriceEndDate": "returns_5d_closePriceEndDate", "returns.5d.closePriceStartDate": "returns_5d_closePriceStartDate", "returns.5d.endDate": "returns_5d_endDate", "returns.5d.startDate": "returns_5d_startDate", "returns.5d.ret": "returns_5d_ret", "returns.30d.closePriceEndDate": "returns_30d_closePriceEndDate", "returns.30d.closePriceStartDate": "returns_30d_closePriceStartDate", "returns.30d.endDate": "returns_30d_endDate", "returns.30d.startDate": "returns_30d_startDate", "returns.30d.ret": "returns_30d_ret"}, "dataset_description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. 
Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.\n", "dataset_name": "JanosAudran/financial-reports-sec"}}, "tags": ["task_categories:fill-mask", "task_categories:text-classification", "task_ids:masked-language-modeling", "task_ids:multi-class-classification", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|other", "language:en", "'finance", "financial", "10-K", "10K", "10k", "10-k", "annual", "reports", "sec", "edgar", "sentiment", "firm", "public", "us'"], "is_gated": false}, "bigbio/drugprot": {"dataset_name": "bigbio/drugprot", "description": "The DrugProt corpus consists of a) expert-labelled chemical and gene mentions, and (b) all binary relationships between them corresponding to a specific set of biologically relevant relation types.", "downloads": 77, "configs": {"drugprot_source": {"config_name": "drugprot_source", "sample_row": "{\"document_id\": \"\\\"17512723\\\"\", \"title\": \"\\\"RDH12, a retinol dehydrogenase causing Leber's co...\", \"abstract\": \"\\\"Three retinol dehydrogenases (RDHs) were tested f...\", \"text\": \"\\\"RDH12, a retinol dehydrogenase causing Leber's co...\", \"entities\": \"[{\\\"id\\\": \\\"17512723_T1\\\", \\\"type\\\": \\\"CHEMICAL\\\", \\\"text\\\":...\", \"relations\": \"[{\\\"id\\\": \\\"17512723_0\\\", \\\"type\\\": \\\"PRODUCT-OF\\\", \\\"arg1_...\"}", "columns": ["document_id", "title", "abstract", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "title": "title", "abstract": "abstract", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "The DrugProt corpus consists of a) expert-labelled chemical and gene mentions, and (b) all binary relationships between them corresponding to a specific set of 
biologically relevant relation types.\n", "dataset_name": "bigbio/drugprot"}, "drugprot_bigbio_kb": {"config_name": "drugprot_bigbio_kb", "sample_row": "{\"id\": \"\\\"17512723\\\"\", \"document_id\": \"\\\"17512723\\\"\", \"passages\": \"[{\\\"id\\\": \\\"17512723_title\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\":...\", \"entities\": \"[{\\\"id\\\": \\\"17512723_T1\\\", \\\"type\\\": \\\"CHEMICAL\\\", \\\"text\\\":...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"17512723_0\\\", \\\"type\\\": \\\"PRODUCT-OF\\\", \\\"arg1_...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The DrugProt corpus consists of a) expert-labelled chemical and gene mentions, and (b) all binary relationships between them corresponding to a specific set of biologically relevant relation types.\n", "dataset_name": "bigbio/drugprot"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/cpi": {"dataset_name": "bigbio/cpi", "description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts containing annotations of proteins, small molecules, and their relationships", "downloads": 14, "configs": {"cpi_source": {"config_name": "cpi_source", "sample_row": "{\"document_id\": \"\\\"DS.d0\\\"\", \"document_orig_id\": \"\\\"17003041\\\"\", \"sentences\": \"[{\\\"sentence_id\\\": \\\"DS.d0.s0\\\", \\\"sentence_orig_id\\\": \\\"...\"}", "columns": ["document_id", "document_orig_id", "sentences"], "columns_mapping": {"document_id": "document_id", "document_orig_id": "document_orig_id", "sentences": "sentences"}, "dataset_description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts 
containing annotations of proteins, small molecules, and their relationships\n", "dataset_name": "bigbio/cpi"}, "cpi_iv_source": {"config_name": "cpi_iv_source", "sample_row": "{\"document_id\": \"\\\"DS.d0\\\"\", \"document_orig_id\": \"\\\"17003041\\\"\", \"sentences\": \"[{\\\"sentence_id\\\": \\\"DS.d0.s0\\\", \\\"sentence_orig_id\\\": \\\"...\"}", "columns": ["document_id", "document_orig_id", "sentences"], "columns_mapping": {"document_id": "document_id", "document_orig_id": "document_orig_id", "sentences": "sentences"}, "dataset_description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts containing annotations of proteins, small molecules, and their relationships\n", "dataset_name": "bigbio/cpi"}, "cpi_niv_source": {"config_name": "cpi_niv_source", "sample_row": "{\"document_id\": \"\\\"DS.d0\\\"\", \"document_orig_id\": \"\\\"17003041\\\"\", \"sentences\": \"[{\\\"sentence_id\\\": \\\"DS.d0.s0\\\", \\\"sentence_orig_id\\\": \\\"...\"}", "columns": ["document_id", "document_orig_id", "sentences"], "columns_mapping": {"document_id": "document_id", "document_orig_id": "document_orig_id", "sentences": "sentences"}, "dataset_description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts containing annotations of proteins, small molecules, and their relationships\n", "dataset_name": "bigbio/cpi"}, "cpi_bigbio_kb": {"config_name": "cpi_bigbio_kb", "sample_row": "{\"id\": \"\\\"DS.d0\\\"\", \"document_id\": \"\\\"17003041\\\"\", \"passages\": \"[{\\\"id\\\": \\\"DS.d0.s0\\\", \\\"text\\\": [\\\"Bestrophin-1 enables...\", \"entities\": \"[{\\\"id\\\": \\\"DS.d0.s0.e0\\\", \\\"type\\\": \\\"protein\\\", \\\"text\\\": ...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"DS.d0.s0.i0\\\", \\\"type\\\": \\\"compound-protein-i...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": 
{"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts containing annotations of proteins, small molecules, and their relationships\n", "dataset_name": "bigbio/cpi"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "DFKI-SLT/kbp37": {"dataset_name": "DFKI-SLT/kbp37", "description": "KBP37 is a revision of MIML-RE annotation dataset, provided by Gabor Angeli et al. (2014). They use both the 2010 and \n2013 KBP official document collections, as well as a July 2013 dump of Wikipedia as the text corpus for annotation. \nThere are 33811 sentences been annotated. Zhang and Wang made several refinements:\n1. They add direction to the relation names, e.g. '`per:employee_of`' is split into '`per:employee of(e1,e2)`'\nand '`per:employee of(e2,e1)`'. They also replace '`org:parents`' with '`org:subsidiaries`' and replace\n'`org:member of\u2019 with '`org:member`' (by their reverse directions).\n2. They discard low frequency relations such that both directions of each relation occur more than 100 times in the \ndataset.\n\nKBP37 contains 18 directional relations and an additional '`no_relation`' relation, resulting in 37 relation classes.", "downloads": 74, "configs": {"kbp37": {"config_name": "kbp37", "sample_row": "{\"id\": \"\\\"0\\\"\", \"sentence\": \"\\\" Thom Yorke of Radiohead has...\", \"relation\": \"27\"}", "columns": ["id", "sentence", "relation"], "columns_mapping": {"id": "id", "sentence": "sentence", "relation": "relation"}, "dataset_description": "KBP37 is a revision of MIML-RE annotation dataset, provided by Gabor Angeli et al. (2014). They use both the 2010 and \n2013 KBP official document collections, as well as a July 2013 dump of Wikipedia as the text corpus for annotation. 
\nThere are 33811 sentences been annotated. Zhang and Wang made several refinements:\n1. They add direction to the relation names, e.g. '`per:employee_of`' is split into '`per:employee of(e1,e2)`'\nand '`per:employee of(e2,e1)`'. They also replace '`org:parents`' with '`org:subsidiaries`' and replace\n'`org:member of\u2019 with '`org:member`' (by their reverse directions).\n2. They discard low frequency relations such that both directions of each relation occur more than 100 times in the \ndataset.\n\nKBP37 contains 18 directional relations and an additional '`no_relation`' relation, resulting in 37 relation classes.\n", "dataset_name": "DFKI-SLT/kbp37"}, "kbp37_formatted": {"config_name": "kbp37_formatted", "sample_row": "{\"id\": \"\\\"0\\\"\", \"token\": \"[\\\"Thom\\\", \\\"Yorke\\\", \\\"of\\\", \\\"Radiohead\\\", \\\"has\\\", \\\"inclu...\", \"e1_start\": \"0\", \"e1_end\": \"2\", \"e2_start\": \"3\", \"e2_end\": \"4\", \"relation\": \"27\"}", "columns": ["id", "token", "e1_start", "e1_end", "e2_start", "e2_end", "relation"], "columns_mapping": {"id": "id", "token": "token", "e1_start": "e1_start", "e1_end": "e1_end", "e2_start": "e2_start", "e2_end": "e2_end", "relation": "relation"}, "dataset_description": "KBP37 is a revision of MIML-RE annotation dataset, provided by Gabor Angeli et al. (2014). They use both the 2010 and \n2013 KBP official document collections, as well as a July 2013 dump of Wikipedia as the text corpus for annotation. \nThere are 33811 sentences been annotated. Zhang and Wang made several refinements:\n1. They add direction to the relation names, e.g. '`per:employee_of`' is split into '`per:employee of(e1,e2)`'\nand '`per:employee of(e2,e1)`'. They also replace '`org:parents`' with '`org:subsidiaries`' and replace\n'`org:member of\u2019 with '`org:member`' (by their reverse directions).\n2. 
They discard low frequency relations such that both directions of each relation occur more than 100 times in the \ndataset.\n\nKBP37 contains 18 directional relations and an additional '`no_relation`' relation, resulting in 37 relation classes.\n", "dataset_name": "DFKI-SLT/kbp37"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:other", "multilinguality:monolingual", "source_datasets:extended|other", "language:en", "relation extraction"], "is_gated": false}, "metaeval/utilitarianism": {"dataset_name": "metaeval/utilitarianism", "description": "\"\"\"\n_HOMEPAGE = \"\"\n_LICENSE = \"Creative Commons Attribution-NonCommercial 4.0 International Public License\"\n\n# The HuggingFace dataset library don't host the datasets but only point to the original files\n# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)\n_URLs = {\"default\": \"https://www.dropbox.com/s/041prrjylv0tf0h/ethics.zip?dl=1\"}\n\n\nclass Imppres(datasets.GeneratorBasedBuilder):\n\n VERSION = datasets.Version(\"1.1.0\")\n\n def _info(self):\n features = datasets.Features(\n {\n \"better_choice\": datasets.Value(\"string\"),\n \"worst_choice\": datasets.Value(\"string\"),\n \"comparison\": datasets.Value(\"string\"),\n \"label\": datasets.Value(\"int32\"),\n })\n return datasets.DatasetInfo(\n # This is the description that will appear on the datasets page.\n description=_DESCRIPTION,\n # This defines the different columns of the dataset and their types\n features=features, # Here we define them above because they are different between the two configurations\n # If there's a common (input, target) tuple from the features,\n # specify them here. 
They'll be used if as_supervised=True in\n # builder.as_dataset.\n supervised_keys=None,\n # Homepage of the dataset for documentation\n homepage=_HOMEPAGE,\n # License for the dataset if available\n license=_LICENSE,\n # Citation for the dataset\n citation=_CITATION,\n )\n\n def _split_generators(self, dl_manager):", "downloads": 11, "configs": {"default": {"config_name": "default", "sample_row": "{\"better_choice\": \"\\\"I built a sandcastle with my nephew. We made one ...\", \"worst_choice\": \"\\\"I built a sandcastle with my nephew\\\"\", \"comparison\": \"\\\"\\\\\\\"I built a sandcastle with my nephew. We made on...\", \"label\": \"1\"}", "columns": ["better_choice", "worst_choice", "comparison", "label"], "columns_mapping": {"better_choice": "better_choice", "worst_choice": "worst_choice", "comparison": "comparison", "label": "label"}, "dataset_description": "", "dataset_name": "metaeval/utilitarianism"}}, "tags": [], "is_gated": false}, "shunk031/wrime": {"dataset_name": "shunk031/wrime", "description": "WRIME dataset is a new dataset for emotional intensity estimation with subjective and objective annotations.", "downloads": 1084, "configs": {"ver1": {"config_name": "ver1", "sample_row": "{\"sentence\": \"\\\"\\\\u307c\\\\u3051\\\\u3063\\\\u3068\\\\u3057\\\\u3066\\\\u305f\\\\u3089\\\\...\", \"user_id\": \"\\\"1\\\"\", \"datetime\": \"\\\"2012/07/31 23:48\\\"\", \"writer.joy\": \"0\", \"writer.sadness\": \"1\", \"writer.anticipation\": \"2\", \"writer.surprise\": \"1\", \"writer.anger\": \"1\", \"writer.fear\": \"0\", \"writer.disgust\": \"0\", \"writer.trust\": \"1\", \"reader1.joy\": \"0\", \"reader1.sadness\": \"2\", \"reader1.anticipation\": \"0\", \"reader1.surprise\": \"0\", \"reader1.anger\": \"0\", \"reader1.fear\": \"0\", \"reader1.disgust\": \"0\", \"reader1.trust\": \"0\", \"reader2.joy\": \"0\", \"reader2.sadness\": \"2\", \"reader2.anticipation\": \"0\", \"reader2.surprise\": \"1\", \"reader2.anger\": \"0\", \"reader2.fear\": \"0\", 
\"reader2.disgust\": \"0\", \"reader2.trust\": \"0\", \"reader3.joy\": \"0\", \"reader3.sadness\": \"2\", \"reader3.anticipation\": \"0\", \"reader3.surprise\": \"0\", \"reader3.anger\": \"0\", \"reader3.fear\": \"1\", \"reader3.disgust\": \"1\", \"reader3.trust\": \"0\", \"avg_readers.joy\": \"0\", \"avg_readers.sadness\": \"2\", \"avg_readers.anticipation\": \"0\", \"avg_readers.surprise\": \"0\", \"avg_readers.anger\": \"0\", \"avg_readers.fear\": \"0\", \"avg_readers.disgust\": \"0\", \"avg_readers.trust\": \"0\"}", "columns": ["sentence", "user_id", "datetime", "writer_joy", "writer_sadness", "writer_anticipation", "writer_surprise", "writer_anger", "writer_fear", "writer_disgust", "writer_trust", "reader1_joy", "reader1_sadness", "reader1_anticipation", "reader1_surprise", "reader1_anger", "reader1_fear", "reader1_disgust", "reader1_trust", "reader2_joy", "reader2_sadness", "reader2_anticipation", "reader2_surprise", "reader2_anger", "reader2_fear", "reader2_disgust", "reader2_trust", "reader3_joy", "reader3_sadness", "reader3_anticipation", "reader3_surprise", "reader3_anger", "reader3_fear", "reader3_disgust", "reader3_trust", "avg_readers_joy", "avg_readers_sadness", "avg_readers_anticipation", "avg_readers_surprise", "avg_readers_anger", "avg_readers_fear", "avg_readers_disgust", "avg_readers_trust"], "columns_mapping": {"sentence": "sentence", "user_id": "user_id", "datetime": "datetime", "writer.joy": "writer_joy", "writer.sadness": "writer_sadness", "writer.anticipation": "writer_anticipation", "writer.surprise": "writer_surprise", "writer.anger": "writer_anger", "writer.fear": "writer_fear", "writer.disgust": "writer_disgust", "writer.trust": "writer_trust", "reader1.joy": "reader1_joy", "reader1.sadness": "reader1_sadness", "reader1.anticipation": "reader1_anticipation", "reader1.surprise": "reader1_surprise", "reader1.anger": "reader1_anger", "reader1.fear": "reader1_fear", "reader1.disgust": "reader1_disgust", "reader1.trust": "reader1_trust", 
"reader2.joy": "reader2_joy", "reader2.sadness": "reader2_sadness", "reader2.anticipation": "reader2_anticipation", "reader2.surprise": "reader2_surprise", "reader2.anger": "reader2_anger", "reader2.fear": "reader2_fear", "reader2.disgust": "reader2_disgust", "reader2.trust": "reader2_trust", "reader3.joy": "reader3_joy", "reader3.sadness": "reader3_sadness", "reader3.anticipation": "reader3_anticipation", "reader3.surprise": "reader3_surprise", "reader3.anger": "reader3_anger", "reader3.fear": "reader3_fear", "reader3.disgust": "reader3_disgust", "reader3.trust": "reader3_trust", "avg_readers.joy": "avg_readers_joy", "avg_readers.sadness": "avg_readers_sadness", "avg_readers.anticipation": "avg_readers_anticipation", "avg_readers.surprise": "avg_readers_surprise", "avg_readers.anger": "avg_readers_anger", "avg_readers.fear": "avg_readers_fear", "avg_readers.disgust": "avg_readers_disgust", "avg_readers.trust": "avg_readers_trust"}, "dataset_description": "WRIME dataset is a new dataset for emotional intensity estimation with subjective and objective annotations.\n", "dataset_name": "shunk031/wrime"}, "ver2": {"config_name": "ver2", "sample_row": "{\"sentence\": \"\\\"\\\\u307c\\\\u3051\\\\u3063\\\\u3068\\\\u3057\\\\u3066\\\\u305f\\\\u3089\\\\...\", \"user_id\": \"\\\"1\\\"\", \"datetime\": \"\\\"2012/7/31 23:48\\\"\", \"writer.joy\": \"0\", \"writer.sadness\": \"1\", \"writer.anticipation\": \"2\", \"writer.surprise\": \"1\", \"writer.anger\": \"1\", \"writer.fear\": \"0\", \"writer.disgust\": \"0\", \"writer.trust\": \"1\", \"writer.sentiment\": \"0\", \"reader1.joy\": \"0\", \"reader1.sadness\": \"2\", \"reader1.anticipation\": \"0\", \"reader1.surprise\": \"0\", \"reader1.anger\": \"0\", \"reader1.fear\": \"0\", \"reader1.disgust\": \"0\", \"reader1.trust\": \"0\", \"reader1.sentiment\": \"-2\", \"reader2.joy\": \"0\", \"reader2.sadness\": \"2\", \"reader2.anticipation\": \"0\", \"reader2.surprise\": \"0\", \"reader2.anger\": \"0\", \"reader2.fear\": \"1\", 
\"reader2.disgust\": \"1\", \"reader2.trust\": \"0\", \"reader2.sentiment\": \"-1\", \"reader3.joy\": \"0\", \"reader3.sadness\": \"2\", \"reader3.anticipation\": \"0\", \"reader3.surprise\": \"1\", \"reader3.anger\": \"0\", \"reader3.fear\": \"0\", \"reader3.disgust\": \"0\", \"reader3.trust\": \"0\", \"reader3.sentiment\": \"-1\", \"avg_readers.joy\": \"0\", \"avg_readers.sadness\": \"2\", \"avg_readers.anticipation\": \"0\", \"avg_readers.surprise\": \"0\", \"avg_readers.anger\": \"0\", \"avg_readers.fear\": \"0\", \"avg_readers.disgust\": \"0\", \"avg_readers.trust\": \"0\", \"avg_readers.sentiment\": \"-1\"}", "columns": ["sentence", "user_id", "datetime", "writer_joy", "writer_sadness", "writer_anticipation", "writer_surprise", "writer_anger", "writer_fear", "writer_disgust", "writer_trust", "writer_sentiment", "reader1_joy", "reader1_sadness", "reader1_anticipation", "reader1_surprise", "reader1_anger", "reader1_fear", "reader1_disgust", "reader1_trust", "reader1_sentiment", "reader2_joy", "reader2_sadness", "reader2_anticipation", "reader2_surprise", "reader2_anger", "reader2_fear", "reader2_disgust", "reader2_trust", "reader2_sentiment", "reader3_joy", "reader3_sadness", "reader3_anticipation", "reader3_surprise", "reader3_anger", "reader3_fear", "reader3_disgust", "reader3_trust", "reader3_sentiment", "avg_readers_joy", "avg_readers_sadness", "avg_readers_anticipation", "avg_readers_surprise", "avg_readers_anger", "avg_readers_fear", "avg_readers_disgust", "avg_readers_trust", "avg_readers_sentiment"], "columns_mapping": {"sentence": "sentence", "user_id": "user_id", "datetime": "datetime", "writer.joy": "writer_joy", "writer.sadness": "writer_sadness", "writer.anticipation": "writer_anticipation", "writer.surprise": "writer_surprise", "writer.anger": "writer_anger", "writer.fear": "writer_fear", "writer.disgust": "writer_disgust", "writer.trust": "writer_trust", "writer.sentiment": "writer_sentiment", "reader1.joy": "reader1_joy", "reader1.sadness": 
"reader1_sadness", "reader1.anticipation": "reader1_anticipation", "reader1.surprise": "reader1_surprise", "reader1.anger": "reader1_anger", "reader1.fear": "reader1_fear", "reader1.disgust": "reader1_disgust", "reader1.trust": "reader1_trust", "reader1.sentiment": "reader1_sentiment", "reader2.joy": "reader2_joy", "reader2.sadness": "reader2_sadness", "reader2.anticipation": "reader2_anticipation", "reader2.surprise": "reader2_surprise", "reader2.anger": "reader2_anger", "reader2.fear": "reader2_fear", "reader2.disgust": "reader2_disgust", "reader2.trust": "reader2_trust", "reader2.sentiment": "reader2_sentiment", "reader3.joy": "reader3_joy", "reader3.sadness": "reader3_sadness", "reader3.anticipation": "reader3_anticipation", "reader3.surprise": "reader3_surprise", "reader3.anger": "reader3_anger", "reader3.fear": "reader3_fear", "reader3.disgust": "reader3_disgust", "reader3.trust": "reader3_trust", "reader3.sentiment": "reader3_sentiment", "avg_readers.joy": "avg_readers_joy", "avg_readers.sadness": "avg_readers_sadness", "avg_readers.anticipation": "avg_readers_anticipation", "avg_readers.surprise": "avg_readers_surprise", "avg_readers.anger": "avg_readers_anger", "avg_readers.fear": "avg_readers_fear", "avg_readers.disgust": "avg_readers_disgust", "avg_readers.trust": "avg_readers_trust", "avg_readers.sentiment": "avg_readers_sentiment"}, "dataset_description": "WRIME dataset is a new dataset for emotional intensity estimation with subjective and objective annotations.\n", "dataset_name": "shunk031/wrime"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:ja", "sentiment-analysis", "wrime"], "is_gated": false}, "lucasmccabe/logiqa": {"dataset_name": "lucasmccabe/logiqa", "description": "LogiQA is constructed from the logical comprehension problems from publically available questions of the National Civil Servants Examination of China, which are 
designed to test the civil servant candidates\u2019 critical thinking and problem solving. This dataset includes the English versions only; the Chinese versions are available via the homepage/original source.", "downloads": 560, "configs": {"default": {"config_name": "default", "sample_row": "{\"context\": \"\\\"Some Cantonese don't like chili, so some southern...\", \"query\": \"\\\"Which of the following can guarantee the above ar...\", \"options\": \"[\\\"Some Cantonese love chili.\\\", \\\"Some people who li...\", \"correct_option\": \"2\"}", "columns": ["context", "query", "options", "correct_option"], "columns_mapping": {"context": "context", "query": "query", "options": "options", "correct_option": "correct_option"}, "dataset_description": "LogiQA is constructed from the logical comprehension problems from publically available questions of the National Civil Servants Examination of China, which are designed to test the civil servant candidates\u2019 critical thinking and problem solving. This dataset includes the English versions only; the Chinese versions are available via the homepage/original source.", "dataset_name": "lucasmccabe/logiqa"}}, "tags": ["task_categories:question-answering", "language:en"], "is_gated": false}, "nlp-thedeep/humset": {"dataset_name": "nlp-thedeep/humset", "description": "HumSet is a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. HumSet is curated by humanitarian analysts and covers various disasters around the globe that occurred from 2018 to 2021 in 46 humanitarian response projects. The dataset consists of approximately 17K annotated documents in three languages of English, French, and Spanish, originally taken from publicly-available resources. For each document, analysts have identified informative snippets (entries) in respect to common humanitarian frameworks, and assigned one or many classes to each entry. 
See the our paper for details.", "downloads": 246, "configs": {"1.0.0": {"config_name": "1.0.0", "sample_row": "{\"entry_id\": \"\\\"244334\\\"\", \"lead_id\": \"\\\"47982\\\"\", \"project_id\": \"\\\"2225\\\"\", \"lang\": \"\\\"fr\\\"\", \"n_tokens\": \"84\", \"project_title\": \"\\\"IMMAP/DFS RDC\\\"\", \"created_at\": \"\\\"2020-10-05 03:44:04.532391+00\\\"\", \"document\": \"\\\"https://www.radiookapi.net/2021/01/27/actualite/s...\", \"excerpt\": \"\\\"Le groupe Ma\\\\u00ef-Ma\\\\u00ef L\\\\u00e9opard actif da...\", \"sectors\": \"[]\", \"pillars_1d\": \"[\\\"Context\\\"]\", \"pillars_2d\": \"[]\", \"subpillars_1d\": \"[\\\"Context->Politics\\\", \\\"Context->Security & Stabili...\", \"subpillars_2d\": \"[]\"}", "columns": ["entry_id", "lead_id", "project_id", "lang", "n_tokens", "project_title", "created_at", "document", "excerpt", "sectors", "pillars_1d", "pillars_2d", "subpillars_1d", "subpillars_2d"], "columns_mapping": {"entry_id": "entry_id", "lead_id": "lead_id", "project_id": "project_id", "lang": "lang", "n_tokens": "n_tokens", "project_title": "project_title", "created_at": "created_at", "document": "document", "excerpt": "excerpt", "sectors": "sectors", "pillars_1d": "pillars_1d", "pillars_2d": "pillars_2d", "subpillars_1d": "subpillars_1d", "subpillars_2d": "subpillars_2d"}, "dataset_description": "HumSet is a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. HumSet is curated by humanitarian analysts and covers various disasters around the globe that occurred from 2018 to 2021 in 46 humanitarian response projects. The dataset consists of approximately 17K annotated documents in three languages of English, French, and Spanish, originally taken from publicly-available resources. For each document, analysts have identified informative snippets (entries) in respect to common humanitarian frameworks, and assigned one or many classes to each entry. 
See the our paper for details.\n", "dataset_name": "nlp-thedeep/humset"}, "2.0.0": {"config_name": "2.0.0", "sample_row": "{\"entry_id\": \"\\\"150780\\\"\", \"lead_id\": \"\\\"37921\\\"\", \"lang\": \"\\\"fr\\\"\", \"n_tokens\": \"662\", \"project_title\": \"\\\"IMMAP/DFS Burkina Faso\\\"\", \"created_at\": \"\\\"2020-08-31 14:28:01.040379+00:00\\\"\", \"document\": \"\\\"Unknown\\\"\", \"source_title\": \"\\\"Displacement Tracking Matrix, IOM\\\"\", \"author_title\": \"\\\"Unknown\\\"\", \"excerpt\": \"\\\"En plus du fret, les deux a\\\\u00e9roports principa...\", \"geo_location\": \"[\\\"Boucle du Mouhoun\\\", \\\"Cascades\\\", \\\"Centre\\\", \\\"Centr...\", \"sectors\": \"[\\\"Logistics\\\"]\", \"pillars_1d\": \"[\\\"Covid-19\\\"]\", \"pillars_2d\": \"[\\\"Impact\\\"]\", \"subpillars_1d\": \"[\\\"Covid-19->Restriction Measures\\\"]\", \"subpillars_2d\": \"[\\\"Impact->Impact On Systems, Services And Networks...\", \"displaced\": \"[]\", \"non_displaced\": \"[]\", \"affected\": \"[]\", \"severity\": \"[]\", \"age\": \"[]\", \"gender\": \"[]\", \"specific_needs_groups\": \"[]\"}", "columns": ["entry_id", "lead_id", "lang", "n_tokens", "project_title", "created_at", "document", "source_title", "author_title", "excerpt", "geo_location", "sectors", "pillars_1d", "pillars_2d", "subpillars_1d", "subpillars_2d", "displaced", "non_displaced", "affected", "severity", "age", "gender", "specific_needs_groups"], "columns_mapping": {"entry_id": "entry_id", "lead_id": "lead_id", "lang": "lang", "n_tokens": "n_tokens", "project_title": "project_title", "created_at": "created_at", "document": "document", "source_title": "source_title", "author_title": "author_title", "excerpt": "excerpt", "geo_location": "geo_location", "sectors": "sectors", "pillars_1d": "pillars_1d", "pillars_2d": "pillars_2d", "subpillars_1d": "subpillars_1d", "subpillars_2d": "subpillars_2d", "displaced": "displaced", "non_displaced": "non_displaced", "affected": "affected", "severity": "severity", "age": 
"age", "gender": "gender", "specific_needs_groups": "specific_needs_groups"}, "dataset_description": "HumSet is a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. HumSet is curated by humanitarian analysts and covers various disasters around the globe that occurred from 2018 to 2021 in 46 humanitarian response projects. The dataset consists of approximately 17K annotated documents in three languages of English, French, and Spanish, originally taken from publicly-available resources. For each document, analysts have identified informative snippets (entries) in respect to common humanitarian frameworks, and assigned one or many classes to each entry. See the our paper for details.\n", "dataset_name": "nlp-thedeep/humset"}}, "tags": ["task_categories:text-classification", "task_categories:text-retrieval", "task_categories:token-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:fr", "language:es", "humanitarian", "research", "analytical-framework", "multilabel", "humset", "humbert"], "is_gated": false}, "ruanchaves/hatebr": {"dataset_name": "ruanchaves/hatebr", "description": "HateBR is the first large-scale expert annotated corpus of Brazilian Instagram comments for hate speech and offensive language detection on the web and social media. The HateBR corpus was collected from Brazilian Instagram comments of politicians and manually annotated by specialists. It is composed of 7,000 documents annotated according to three different layers: a binary classification (offensive versus non-offensive comments), offensiveness-level (highly, moderately, and slightly offensive messages), and nine hate speech groups (xenophobia, racism, homophobia, sexism, religious intolerance, partyism, apology for the dictatorship, antisemitism, and fatphobia). 
Each comment was annotated by three different annotators and achieved high inter-annotator agreement. Furthermore, baseline experiments were implemented reaching 85% of F1-score outperforming the current literature models for the Portuguese language. Accordingly, we hope that the proposed expertly annotated corpus may foster research on hate speech and offensive language detection in the Natural Language Processing area.", "downloads": 24, "configs": {"default": {"config_name": "default", "sample_row": "{\"instagram_comments\": \"\\\"este lixo ...\\\"\", \"offensive_language\": \"true\", \"offensiveness_levels\": \"1\", \"antisemitism\": \"false\", \"apology_for_the_dictatorship\": \"false\", \"fatphobia\": \"false\", \"homophobia\": \"false\", \"partyism\": \"false\", \"racism\": \"false\", \"religious_intolerance\": \"false\", \"sexism\": \"false\", \"xenophobia\": \"false\", \"offensive_&_non-hate_speech\": \"true\", \"non-offensive\": \"false\", \"specialist_1_hate_speech\": \"false\", \"specialist_2_hate_speech\": \"false\", \"specialist_3_hate_speech\": \"false\"}", "columns": ["instagram_comments", "offensive_language", "offensiveness_levels", "antisemitism", "apology_for_the_dictatorship", "fatphobia", "homophobia", "partyism", "racism", "religious_intolerance", "sexism", "xenophobia", "offensive_&_non-hate_speech", "non-offensive", "specialist_1_hate_speech", "specialist_2_hate_speech", "specialist_3_hate_speech"], "columns_mapping": {"instagram_comments": "instagram_comments", "offensive_language": "offensive_language", "offensiveness_levels": "offensiveness_levels", "antisemitism": "antisemitism", "apology_for_the_dictatorship": "apology_for_the_dictatorship", "fatphobia": "fatphobia", "homophobia": "homophobia", "partyism": "partyism", "racism": "racism", "religious_intolerance": "religious_intolerance", "sexism": "sexism", "xenophobia": "xenophobia", "offensive_&_non-hate_speech": "offensive_&_non-hate_speech", "non-offensive": "non-offensive", 
"specialist_1_hate_speech": "specialist_1_hate_speech", "specialist_2_hate_speech": "specialist_2_hate_speech", "specialist_3_hate_speech": "specialist_3_hate_speech"}, "dataset_description": "\nHateBR is the first large-scale expert annotated corpus of Brazilian Instagram comments for hate speech and offensive language detection on the web and social media. The HateBR corpus was collected from Brazilian Instagram comments of politicians and manually annotated by specialists. It is composed of 7,000 documents annotated according to three different layers: a binary classification (offensive versus non-offensive comments), offensiveness-level (highly, moderately, and slightly offensive messages), and nine hate speech groups (xenophobia, racism, homophobia, sexism, religious intolerance, partyism, apology for the dictatorship, antisemitism, and fatphobia). Each comment was annotated by three different annotators and achieved high inter-annotator agreement. Furthermore, baseline experiments were implemented reaching 85% of F1-score outperforming the current literature models for the Portuguese language. 
Accordingly, we hope that the proposed expertly annotated corpus may foster research on hate speech and offensive language detection in the Natural Language Processing area.\n", "dataset_name": "ruanchaves/hatebr"}}, "tags": ["task_categories:text-classification", "task_ids:hate-speech-detection", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt", "instagram", "doi:10.57967/hf/0274"], "is_gated": false}, "bigcode/commitpack": {"dataset_name": "bigcode/commitpack", "description": "CommitPack is is a 4TB dataset of commits scraped from GitHub repositories that are permissively licensed.", "downloads": 141, "configs": {"json": {"config_name": "json", "sample_row": "{\"commit\": \"\\\"13aeb023f68d105d167f308c16957c507967b490\\\"\", \"old_file\": \"\\\"contributors.json\\\"\", \"new_file\": \"\\\"contributors.json\\\"\", \"old_contents\": \"\\\"[\\\\n {\\\\n \\\\\\\"prNum\\\\\\\": 847,\\\\n \\\\\\\"time\\\\\\\": \\\\\\\"2014-08-1...\", \"new_contents\": \"\\\"[\\\\n {\\\\n \\\\\\\"prNum\\\\\\\": 848,\\\\n \\\\\\\"time\\\\\\\": \\\\\\\"2014-08-1...\", \"subject\": \"\\\"Added @5290charlie\\\"\", \"message\": \"\\\"Added @5290charlie\\\"\", \"lang\": \"\\\"JSON\\\"\", \"license\": \"\\\"bsd-2-clause\\\"\", \"repos\": \"\\\"shafayeatsumit/patchwork,jmb521/patchwork,contact...\"}", "columns": ["commit", "old_file", "new_file", "old_contents", "new_contents", "subject", "message", "lang", "license", "repos"], "columns_mapping": {"commit": "commit", "old_file": "old_file", "new_file": "new_file", "old_contents": "old_contents", "new_contents": "new_contents", "subject": "subject", "message": "message", "lang": "lang", "license": "license", "repos": "repos"}, "dataset_description": "CommitPack is is a 4TB dataset of commits scraped from GitHub repositories that are permissively licensed.\n", "dataset_name": "bigcode/commitpack"}, "xml": {"config_name": "xml", "sample_row": "{\"commit\": 
\"\\\"285637f64b964f8aab3866ac6f44549620cdbd20\\\"\", \"old_file\": \"\\\"pom.xml\\\"\", \"new_file\": \"\\\"pom.xml\\\"\", \"old_contents\": \"\\\"\\\\n\\\\r\\\\n

\\\\r\\\\n
\\\\n\\\\n\\\\n\\\\n to examine.\\\"\", \"message\": \"\\\"Fix typo: to examining -> to examine.\\\\n\\\"\", \"lang\": \"\\\"Groff\\\"\", \"license\": \"\\\"bsd-3-clause\\\"\", \"repos\": \"\\\"jrobhoward/SCADAbase,jrobhoward/SCADAbase,jrobhow...\"}", "columns": ["commit", "old_file", "new_file", "old_contents", "new_contents", "subject", "message", "lang", "license", "repos"], "columns_mapping": {"commit": "commit", "old_file": "old_file", "new_file": "new_file", "old_contents": "old_contents", "new_contents": "new_contents", "subject": "subject", "message": "message", "lang": "lang", "license": "license", "repos": "repos"}, "dataset_description": "CommitPackFT is is a 2GB filtered version of CommitPack to contain only high-quality commit messages that resemble natural language instructions.\n", "dataset_name": "bigcode/commitpackft"}, "groovy": {"config_name": "groovy", "sample_row": "{\"commit\": \"\\\"d4967cd2b865160ad756ae143af14772a286d255\\\"\", \"old_file\": \"\\\"subprojects/integ-test/src/integTest/groovy/org/g...\", \"new_file\": \"\\\"subprojects/integ-test/src/integTest/groovy/org/g...\", \"old_contents\": \"\\\"/*\\\\n * Copyright 2013 the original author or auth...\", \"new_contents\": \"\\\"/*\\\\n * Copyright 2013 the original author or auth...\", \"subject\": \"\\\"Fix usage of now removed 'cpp-lib' and 'cpp-exe' ...\", \"message\": \"\\\"Fix usage of now removed 'cpp-lib' and 'cpp-exe' ...\", \"lang\": \"\\\"Groovy\\\"\", \"license\": \"\\\"apache-2.0\\\"\", \"repos\": \"\\\"gradle/gradle,blindpirate/gradle,blindpirate/grad...\"}", "columns": ["commit", "old_file", "new_file", "old_contents", "new_contents", "subject", "message", "lang", "license", "repos"], "columns_mapping": {"commit": "commit", "old_file": "old_file", "new_file": "new_file", "old_contents": "old_contents", "new_contents": "new_contents", "subject": "subject", "message": "message", "lang": "lang", "license": "license", "repos": "repos"}, "dataset_description": "CommitPackFT is 
is a 2GB filtered version of CommitPack to contain only high-quality commit messages that resemble natural language instructions.\n", "dataset_name": "bigcode/commitpackft"}, "groovy-server-pages": {"config_name": "groovy-server-pages", "sample_row": "{\"commit\": \"\\\"d8917646df8c673944db19176a58c0e38a8c076e\\\"\", \"old_file\": \"\\\"grails-app/views/home/templates/_projects.gsp\\\"\", \"new_file\": \"\\\"grails-app/views/home/templates/_projects.gsp\\\"\", \"old_contents\": \"\\\"