Skip to content

Commit

Permalink
Add script to generate search index of datasets from HuggingFace (#238)
Browse files Browse the repository at this point in the history
* Move mock retrieval index utilities into separate file

* Add script to generate search index of datasets from HuggingFace

* Add dataset index file produced by the retrieve_dataset_info.py script

* Remove API key from code

* Update prompt2model/dataset_retriever/retrieve_dataset_info.py

Co-authored-by: Graham Neubig <[email protected]>

* Update prompt2model/dataset_retriever/retrieve_dataset_info.py

Co-authored-by: Graham Neubig <[email protected]>

* Update prompt2model/dataset_retriever/retrieve_dataset_info.py

Co-authored-by: Graham Neubig <[email protected]>

* Move SUPPORTED_TASKS global variable to top of file for visibility

* Remove dependence on model-evaluator submodule

* Remove evaluation metadata from retrieve_dataset_info script

* Untrack pre-constructed dataset search index from repo

* Remove obsolete import

---------

Co-authored-by: Graham Neubig <[email protected]>
  • Loading branch information
viswavi and neubig authored Aug 22, 2023
1 parent f6c3949 commit 95f3c61
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 6 deletions.
Empty file added .gitmodules
Empty file.
70 changes: 70 additions & 0 deletions prompt2model/dataset_retriever/retrieve_dataset_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Tools for retrieving dataset metadata from HuggingFace.
Before calling this script, set the HF_USER_ACCESS_TOKEN environment variable.
"""

from __future__ import annotations # noqa FI58

import argparse
import json

import requests
from huggingface_hub import list_datasets

# Command-line interface: the only option is where the JSON index is written.
parser = argparse.ArgumentParser(
    description="Build a JSON index of HuggingFace dataset names and descriptions."
)
parser.add_argument(
    "--dataset-index-file",
    type=str,
    default="huggingface_data/huggingface_datasets/dataset_index.json",
    help="Path of the JSON file to write the dataset index to "
    "(default: %(default)s).",
)


def get_fully_supported_dataset_names(timeout=30.0):
    """Get the list of loadable datasets from HuggingFace.

    Queries the datasets-server ``/valid`` endpoint and concatenates the
    dataset names listed under the ``"viewer"`` and ``"preview"`` keys of its
    JSON response.

    Args:
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The ``"viewer"`` entries followed by the ``"preview"`` entries.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not respond within ``timeout``.
        KeyError: If the response lacks the ``"viewer"`` or ``"preview"`` key.
    """
    API_URL = "https://datasets-server.huggingface.co/valid"
    response = requests.get(API_URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    datasets_list = response.json()
    return datasets_list["viewer"] + datasets_list["preview"]


def construct_search_documents(
    all_dataset_names,
    all_dataset_descriptions,
    fully_supported_dataset_names,
    minimum_description_length=4,
):
    """Select the datasets and corresponding descriptions to store in our index.

    A dataset is kept only if its name appears in
    ``fully_supported_dataset_names`` and its description is not ``None`` and
    contains strictly more than ``minimum_description_length``
    whitespace-separated words.

    Args:
        all_dataset_names: Names of all candidate datasets.
        all_dataset_descriptions: Descriptions paired positionally with
            ``all_dataset_names``; an entry may be ``None``.
        fully_supported_dataset_names: Names of the datasets that are allowed
            into the index.
        minimum_description_length: Word count a description must exceed.

    Returns:
        A ``(names, descriptions)`` tuple of two equally long lists, in the
        same relative order as the inputs.
    """
    # Build the lookup set once: testing membership against a list for every
    # dataset would make the loop quadratic in the number of datasets.
    supported_names = set(fully_supported_dataset_names)

    filtered_dataset_names = []
    nonempty_descriptions = []
    for dataset_name, description in zip(all_dataset_names, all_dataset_descriptions):
        if dataset_name not in supported_names:
            continue
        if (
            description is not None
            and len(description.split()) > minimum_description_length
        ):
            filtered_dataset_names.append(dataset_name)
            nonempty_descriptions.append(description)
    return filtered_dataset_names, nonempty_descriptions


if __name__ == "__main__":
    args = parser.parse_args()

    # Gather the names the datasets server reports as loadable, and the id and
    # description of every dataset listed by the HuggingFace hub.
    fully_supported_dataset_names = get_fully_supported_dataset_names()
    all_datasets = list(list_datasets())
    dataset_names = [dataset.id for dataset in all_datasets]
    dataset_descriptions = [dataset.description for dataset in all_datasets]

    # Keep only supported datasets that have a sufficiently long description.
    filtered_dataset_names, filtered_descriptions = construct_search_documents(
        dataset_names, dataset_descriptions, fully_supported_dataset_names
    )
    dataset_index = {
        name: {"name": name, "description": description}
        for name, description in zip(filtered_dataset_names, filtered_descriptions)
    }

    # Use a context manager so the file is flushed and closed deterministically
    # rather than whenever the anonymous file object gets garbage-collected.
    with open(args.dataset_index_file, "w") as index_file:
        json.dump(dataset_index, index_file)
6 changes: 0 additions & 6 deletions test_helpers/mock_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,3 @@ def create_test_search_index(index_file_name: str) -> None:
mock_lookup_indices = [0, 1, 2]
with open(index_file_name, "wb") as f:
pickle.dump((mock_model_encodings, mock_lookup_indices), f)


def create_test_search_index_class_method(self, index_file_name: str) -> None:
    """Create a test search index through a method-shaped wrapper.

    The ``self`` argument exists only so this function can stand in for an
    instance method; it is otherwise ignored.
    """
    del self  # Accepted purely to mimic a method signature.
    create_test_search_index(index_file_name)

0 comments on commit 95f3c61

Please sign in to comment.