Fix typo #17

Merged: 27 commits, Jan 2, 2024

Changes from 1 commit
update some datasets to hf hub
edadaltocg committed Oct 31, 2023
commit 14aa3b2ac0468ffddeb25789f6264dc2b5c75e5c
95 changes: 0 additions & 95 deletions scripts/compute_accuracy.py

This file was deleted.

80 changes: 80 additions & 0 deletions scripts/parse_arxiv.py
@@ -0,0 +1,80 @@
"""Requirements
- feedparser installed: pip install feedparser
"""
import os
import argparse
import urllib.request
import feedparser
import json
import logging


logging.basicConfig(
format="%(asctime)s:%(levelname)s:%(filename)s:%(lineno)s-%(funcName)s: %(message)s",
datefmt="%H:%M:%S",
level=logging.INFO,
)
_logger = logging.getLogger(__name__)


def get_bibtex(feed: dict):
lines = ["@article{" + feed["id"].split("/")[-1]]
for k, v in [
("author", " and ".join([a["name"] for a in feed["authors"]])),
("title", feed["title"]),
# ("PrimaryClass", feed["category"]),
# ("Abstract", feed["summary"]),
("year", str(feed["published_parsed"][0])),
("month", str(feed["published_parsed"][1])),
("note", feed["arxiv_comment"]),
("archiveprefix", "arXiv"),
("url", feed["link"]),
]:
if len(v):
lines.append("%-13s = {%s}" % (k, v))

return ("," + os.linesep).join(lines) + os.linesep + "}"


def main(id, verbose=False):
id = id.split("/")[-1]
url = f"http://export.arxiv.org/api/query?search_query=all:{id}&start=0&max_results=1"

response = urllib.request.urlopen(url)
data = response.read().decode("utf-8")
feed = feedparser.parse(data)["entries"][0]

json_feed = json.dumps(feed, indent=2)
_logger.debug(json_feed)

title = feed["title"].replace("\n", "")
authors = ", ".join(author["name"] for author in feed["authors"])
link = feed["link"]
abstract = feed["summary"]
bibtext = get_bibtex(feed)

_logger.debug(f"{title}\nby {authors} ({link})")
_logger.debug(bibtext)

obj = {
"title": title,
"authors": authors,
"link": link,
"abstract": abstract,
"bibtext": bibtext,
}
if verbose:
print(json.dumps(obj, indent=2))
return obj


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-id", "--id", type=str, default="2203.07798")
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("--debug", action="store_true")
args = parser.parse_args()

_logger.setLevel(logging.DEBUG if args.debug else logging.INFO)
main(args.id, args.verbose)
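
For reference, a minimal usage sketch of the script above. The module path and the returned keys follow the code as written; running it needs network access to the arXiv export API:

# Sketch: fetch metadata for an arXiv id and print its BibTeX entry.
# Assumes the file is importable as parse_arxiv (e.g. when run from the scripts/ directory).
import parse_arxiv

entry = parse_arxiv.main("2203.07798", verbose=False)
print(entry["title"])
print(entry["bibtext"])  # note: the key is spelled "bibtext" in this script
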
164 changes: 164 additions & 0 deletions scripts/push_dataset_to_hf_hub.py
@@ -0,0 +1,164 @@
"""Requirements:

- HF_TOKEN environment variable set to your HuggingFace token
- HF Datasets installed
- Jinja2 installed
- Git LFS installed
"""

import argparse
import os

import datasets
import detectors
import parse_arxiv
from datasets import Dataset
from huggingface_hub.hf_api import create_repo
from huggingface_hub.repocard import DatasetCard, DatasetCardData


def main(
dataset: Dataset,
dataset_name,
pretty_dataset_name,
license,
hf_id="detectors",
private=False,
**kwargs,
):
token = os.environ["HF_TOKEN"]
n = len(dataset)
size_categories = "n<1K" if n < 1000 else "1K<n<10K" if n < 10000 else "10K<n<100K" if n < 100000 else "n>100K"
original_dataset_url = kwargs.get("url", None)
original_paper_url = kwargs.get("original_paper_url", None)
paper = kwargs.get("paper", None)
authors = kwargs.get("authors", None)
original_authors = kwargs.get("original_authors", None)
original_citation_bibtex = kwargs.get("original_citation_bibtex", "")
citation_bibtex = kwargs.get("citation_bibtex", "")
paperswithcode_id = kwargs.get("paperswithcode_id", None)
repo = None
demo = None
curators = "Eduardo Dadalto"
dataset_card_authors = curators
dataset_card_contact = "https://huggingface.co/edadaltocg"
direct_use = (
"This dataset is intended to be used as an ouf-of-distribution dataset for image classification benchmarks."
)
curation_rationale_section = """The goal of curating and sharing this dataset on the HuggingFace Hub is to accelerate research and promote reproducibility in generalized Out-of-Distribution (OOD) detection.

Check the python library [detectors](https://github.com/edadaltocg/detectors) if you are interested in OOD detection."""
out_of_scope_use = "This dataset is not annotated."
personal_and_sensitive_information = "Please check original paper for details on the dataset."
bias_risks_limitations = "Please check original paper for details on the dataset."

citation_bibtex = (
"""@software{detectors2023,
author = {Eduardo Dadalto},
title = {Detectors: a Python Library for Generalized Out-Of-Distribution Detection},
url = {https://github.com/edadaltocg/detectors},
doi = {https://doi.org/10.5281/zenodo.7883596},
month = {5},
year = {2023}
}"""
+ "\n\n"
+ citation_bibtex
or "" + "\n\n" + original_citation_bibtex
or ""
).strip()

card_data = DatasetCardData(
task_categories=["image-classification"],
pretty_name=pretty_dataset_name,
license=license,
size_categories=size_categories,
paperswithcode_id=paperswithcode_id if paperswithcode_id and len(paperswithcode_id) > 1 else None,
)
dataset_card = DatasetCard.from_template(
card_data=card_data,
template_path=os.path.join("templates", "DATASET_CARD_TEMPLATE.md"),
paper=paper,
original_paper_url=original_paper_url,
demo=demo,
repo=repo,
dataset_card_authors=dataset_card_authors,
dataset_card_contact=dataset_card_contact,
curators=curators,
direct_use=direct_use,
curation_rationale_section=curation_rationale_section,
out_of_scope_use=out_of_scope_use,
personal_and_sensitive_information=personal_and_sensitive_information,
bias_risks_limitations=bias_risks_limitations,
citation_bibtex=citation_bibtex,
original_dataset_url=original_dataset_url,
authors=authors,
original_authors=original_authors,
)

repo_id = f"{dataset_name}-ood"
tag = f"{hf_id}/{repo_id}"
create_repo(repo_id=tag, exist_ok=True, token=token, private=private, repo_type="dataset")
dataset_card.push_to_hub(tag, token=token)
dataset.push_to_hub(tag, token=token)

# test
dataset = datasets.load_dataset(tag, split="train")
assert len(dataset) == n
for x in dataset:
print(x)
assert "image" in x
break


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_name", type=str, default="ssb_easy")
parser.add_argument("--pretty_dataset_name", type=str, default=None)
parser.add_argument("--paperswithcode_id", type=str, default=None)

parser.add_argument("--hf_id", type=str, default=None)
parser.add_argument("--private", action="store_true")
args = parser.parse_args()

dataset = detectors.create_dataset(args.dataset_name)

try:
arxiv = dataset.paper_url
parsed = parse_arxiv.main(arxiv)
except AttributeError:
parsed = {"link": None, "authors": None, "bibtext": None}

try:
license = dataset.license
except AttributeError:
license = "unknown"

try:
url = dataset.url
except AttributeError:
url = None

try:
original_paper_url = dataset.original_paper_url
original_parsed = parse_arxiv.main(original_paper_url)
except Exception:  # attribute missing or arXiv lookup failed
original_parsed = {"link": None, "authors": None, "bibtext": None}

kwargs = {
"original_paper_url": original_parsed["link"],
"paper": parsed["link"],
"original_authors": original_parsed["authors"],
"authors": parsed["authors"],
"citation_bibtex": parsed["bibtext"],
"original_citation_bibtex": original_parsed["bibtext"],
"url": url,
"paperswithcode_id": args.paperswithcode_id,
}

def gen():
for x, y in dataset:
yield {"image": x}

dataset = Dataset.from_generator(gen)

main(dataset, args.dataset_name, args.pretty_dataset_name, license=license, hf_id=args.hf_id or "detectors", private=args.private, **kwargs)
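
As a sanity check, a pushed dataset should be loadable back from the Hub, mirroring the test at the end of main(). A minimal sketch, assuming the default hf_id of "detectors" and a dataset pushed as ssb_easy (the repo id follows the f"{hf_id}/{dataset_name}-ood" pattern used above):

# Sketch: reload a dataset pushed by this script and inspect one record.
# "detectors/ssb_easy-ood" is an example repo id; it depends on hf_id and dataset_name.
import datasets

ds = datasets.load_dataset("detectors/ssb_easy-ood", split="train")
print(len(ds))
print(ds[0].keys())  # each record should expose an "image" field
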
3 changes: 3 additions & 0 deletions src/detectors/data/__init__.py
@@ -43,6 +43,7 @@
from .wilds_ds import make_wilds_dataset

_logger = logging.getLogger(__name__)

datasets_registry = {
"cifar10": CIFAR10Wrapped,
"cifar100": CIFAR100Wrapped,
@@ -70,6 +71,8 @@
"mos_inaturalist": MOSiNaturalist,
"mos_places365": MOSPlaces365,
"mos_sun": MOSSUN,
"inaturalist": MOSiNaturalist,
"sun": MOSSUN,
"ninco_full": NINCOFull,
"ninco": NINCO,
"ssb_hard": SSBHard,
5 changes: 4 additions & 1 deletion src/detectors/data/isun.py
@@ -7,7 +7,7 @@


class iSUN(ImageFolder):
"""`iSUN <ODIN_PAPER_URL>`_ Dataset subset.
"""iSUN Dataset.

Args:
root (string): Root directory of dataset where directory
@@ -25,6 +25,9 @@ class iSUN(ImageFolder):
filename = "iSUN.tar.gz"
file_md5 = "be77b0f2c26fda898afac5f99645ee70"
url = "https://www.dropbox.com/s/ssz7qxfqae0cca5/iSUN.tar.gz"
original_paper_url = "https://arxiv.org/abs/1507.01422"
paper_url = "https://arxiv.org/abs/1706.02690"
license = "unknown"
# size 8925

def __init__(
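
These new class attributes are what scripts/push_dataset_to_hf_hub.py reads (via the try/except blocks around dataset.paper_url, dataset.url and dataset.license) when assembling the dataset card. A small sketch; the import path is an assumption based on the file location:

# Sketch: the metadata added here is plain class-level data.
from detectors.data.isun import iSUN  # path assumed from src/detectors/data/isun.py

print(iSUN.paper_url)           # "https://arxiv.org/abs/1706.02690"
print(iSUN.original_paper_url)  # "https://arxiv.org/abs/1507.01422"
print(iSUN.license)             # "unknown"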