Add script to generate search index of datasets from HuggingFace (#238)

* Move mock retrieval index utilities into separate file
* Add script to generate search index of datasets from HuggingFace
* Add dataset index file produced by the retrieve_dataset_info.py script
* Remove API key from code
* Update prompt2model/dataset_retriever/retrieve_dataset_info.py
  Co-authored-by: Graham Neubig <[email protected]>
* Update prompt2model/dataset_retriever/retrieve_dataset_info.py
  Co-authored-by: Graham Neubig <[email protected]>
* Update prompt2model/dataset_retriever/retrieve_dataset_info.py
  Co-authored-by: Graham Neubig <[email protected]>
* Move SUPPORTED_TASKS global variable to top of file for visibility
* Remove dependence on model-evaluator submodule
* Remove evaluation metadata from retrieve_dataset_info script
* Untrack pre-constructed dataset search index from repo
* Remove obsolete import
---------
Co-authored-by: Graham Neubig <[email protected]>
neulab · Aug 22, 2023 · 95f3c61 · 95f3c61
1 parent f6c3949
commit 95f3c61
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 6 deletions.
diff --git a/.gitmodules b/.gitmodules
diff --git a/prompt2model/dataset_retriever/retrieve_dataset_info.py b/prompt2model/dataset_retriever/retrieve_dataset_info.py
@@ -0,0 +1,70 @@
+"""Tools for retrieving dataset metadata from HuggingFace.
+
+Before calling this script, set the HF_USER_ACCESS_TOKEN environment variable.
+"""
+
+from __future__ import annotations  # noqa FI58
+
+import argparse
+import json
+
+import requests
+from huggingface_hub import list_datasets
+
+# Command-line interface: the only option is where the JSON index is written.
+parser = argparse.ArgumentParser(
+    description="Build a JSON search index of HuggingFace dataset descriptions."
+)
+parser.add_argument(
+    "--dataset-index-file",
+    type=str,
+    default="huggingface_data/huggingface_datasets/dataset_index.json",
+    help="Path of the JSON file that the dataset index is written to.",
+)
+
+
+def get_fully_supported_dataset_names():
+    """Get the list of loadable datasets from HuggingFace.
+
+    Queries the datasets-server ``/valid`` endpoint and concatenates the
+    entries found under the "viewer" and "preview" keys of its JSON reply.
+
+    Returns:
+        The "viewer" entries followed by the "preview" entries.
+
+    Raises:
+        requests.HTTPError: If the server replies with an error status code.
+        requests.Timeout: If the server does not reply within the timeout.
+    """
+    API_URL = "https://datasets-server.huggingface.co/valid"
+    # Bound the wait so a stalled connection cannot hang the script forever.
+    response = requests.get(API_URL, timeout=60)
+    # Fail loudly on 4xx/5xx rather than trying to parse an error payload.
+    response.raise_for_status()
+    datasets_list = response.json()
+    fully_supported_datasets = datasets_list["viewer"] + datasets_list["preview"]
+    return fully_supported_datasets
+
+
+def construct_search_documents(
+    all_dataset_names,
+    all_dataset_descriptions,
+    fully_supported_dataset_names,
+    minimum_description_length=4,
+):
+    """Select the datasets and corresponding descriptions to store in our index.
+
+    A dataset is kept only if its name appears in
+    `fully_supported_dataset_names` and its description is not None and has
+    strictly more than `minimum_description_length` whitespace-separated words.
+
+    Args:
+        all_dataset_names: Candidate dataset names.
+        all_dataset_descriptions: Descriptions, paired positionally with
+            `all_dataset_names` (entries may be None).
+        fully_supported_dataset_names: Iterable of hashable names that are
+            allowed into the index.
+        minimum_description_length: Word count a description must exceed.
+
+    Returns:
+        A tuple `(names, descriptions)` of two parallel lists, in input order.
+    """
+    # Hash the allowed names once: membership tests in the loop become O(1)
+    # instead of scanning a list for every candidate dataset.
+    supported_names = set(fully_supported_dataset_names)
+    filtered_dataset_names = []
+    nonempty_descriptions = []
+    for dataset_name, description in zip(all_dataset_names, all_dataset_descriptions):
+        if dataset_name not in supported_names:
+            continue
+        if description is None:
+            continue
+        # Strict comparison: exactly `minimum_description_length` words is
+        # still considered too short.
+        if len(description.split()) > minimum_description_length:
+            filtered_dataset_names.append(dataset_name)
+            nonempty_descriptions.append(description)
+    return filtered_dataset_names, nonempty_descriptions
+
+
+if __name__ == "__main__":
+    # Script entry point: fetch dataset metadata, filter it, and write the
+    # resulting name -> {"name", "description"} index as JSON.
+    args = parser.parse_args()
+
+    fully_supported_dataset_names = get_fully_supported_dataset_names()
+    # Materialize the listing once because it is iterated twice below.
+    all_datasets = list(list_datasets())
+    dataset_names = [dataset.id for dataset in all_datasets]
+    dataset_descriptions = [dataset.description for dataset in all_datasets]
+
+    filtered_dataset_names, filtered_descriptions = construct_search_documents(
+        dataset_names, dataset_descriptions, fully_supported_dataset_names
+    )
+    dataset_index = {
+        name: {
+            "name": name,
+            "description": description,
+        }
+        for name, description in zip(filtered_dataset_names, filtered_descriptions)
+    }
+
+    # Use a context manager so the file is flushed and closed deterministically
+    # instead of leaking the handle returned by a bare open().
+    with open(args.dataset_index_file, "w") as index_file:
+        json.dump(dataset_index, index_file)
diff --git a/test_helpers/mock_retrieval.py b/test_helpers/mock_retrieval.py
@@ -18,9 +18,3 @@ def create_test_search_index(index_file_name: str) -> None:
     mock_lookup_indices = [0, 1, 2]
     with open(index_file_name, "wb") as f:
         pickle.dump((mock_model_encodings, mock_lookup_indices), f)
-
-
-def create_test_search_index_class_method(self, index_file_name: str) -> None:
-    """Utility function to create a test search index as a simulated class method."""
-    _ = self
-    create_test_search_index(index_file_name)