Fix typo #17

Merged: 27 commits, Jan 2, 2024

Changes from 1 commit
update some datasets to hf hub
edadaltocg committed Oct 31, 2023
commit 14aa3b2ac0468ffddeb25789f6264dc2b5c75e5c
95 changes: 0 additions & 95 deletions scripts/compute_accuracy.py

This file was deleted.

80 changes: 80 additions & 0 deletions scripts/parse_arxiv.py
@@ -0,0 +1,80 @@
"""Requirements
- feedparser installed: pip install feedparser
"""
import os
import argparse
import urllib.request
import feedparser
import json
import logging


logging.basicConfig(
format="%(asctime)s:%(levelname)s:%(filename)s:%(lineno)s-%(funcName)s: %(message)s",
datefmt="%H:%M:%S",
level=logging.INFO,
)
_logger = logging.getLogger(__name__)


def get_bibtex(feed: dict):
lines = ["@article{" + feed["id"].split("/")[-1]]
for k, v in [
("author", " and ".join([a["name"] for a in feed["authors"]])),
("title", feed["title"]),
# ("PrimaryClass", feed["category"]),
# ("Abstract", feed["summary"]),
("year", str(feed["published_parsed"][0])),
("month", str(feed["published_parsed"][1])),
("note", feed["arxiv_comment"]),
("archiveprefix", "arXiv"),
("url", feed["link"]),
]:
if len(v):
lines.append("%-13s = {%s}" % (k, v))

return ("," + os.linesep).join(lines) + os.linesep + "}"


def main(id, verbose=False):
id = id.split("/")[-1]
url = f"http://export.arxiv.org/api/query?search_query=all:{id}&start=0&max_results=1"

response = urllib.request.urlopen(url)
data = response.read().decode("utf-8")
feed = feedparser.parse(data)["entries"][0]

json_feed = json.dumps(feed, indent=2)
_logger.debug(json_feed)

title = feed["title"].replace("\n", "")
authors = ", ".join(author["name"] for author in feed["authors"])
link = feed["link"]
abstract = feed["summary"]
bibtext = get_bibtex(feed)

_logger.debug(f"{title}\nby {authors} ({link})")
_logger.debug(bibtext)

obj = {
"title": title,
"authors": authors,
"link": link,
"abstract": abstract,
"bibtext": bibtext,
}
if verbose:
print(json.dumps(obj, indent=2))
return obj


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-id", "--id", type=str, default="2203.07798")
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("--debug", action="store_true")
args = parser.parse_args()

_logger.setLevel(logging.DEBUG if args.debug else logging.INFO)
main(args.id, args.verbose)
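
For reference, a minimal usage sketch of the script above. The module path and the returned keys follow the code as written; running it needs network access to the arXiv export API:

# Sketch: fetch metadata for an arXiv id and print its BibTeX entry.
# Assumes the file is importable as parse_arxiv (e.g. when run from the scripts/ directory).
import parse_arxiv

entry = parse_arxiv.main("2203.07798", verbose=False)
print(entry["title"])
print(entry["bibtext"])  # note: the key is spelled "bibtext" in this script
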
164 changes: 164 additions & 0 deletions scripts/push_dataset_to_hf_hub.py
@@ -0,0 +1,164 @@
"""Requirements:

- HF_TOKEN environment variable set to your HuggingFace token
- HF Datasets installed
- Jinja2 installed
- Git LFS installed
"""

import argparse
import os

import datasets
import detectors
import parse_arxiv
from datasets import Dataset
from huggingface_hub.hf_api import create_repo
from huggingface_hub.repocard import DatasetCard, DatasetCardData


def main(
dataset: Dataset,
dataset_name,
pretty_dataset_name,
license,
hf_id="detectors",
private=False,
**kwargs,
):
token = os.environ["HF_TOKEN"]
n = len(dataset)
size_categories = "n<1K" if n < 1000 else "1K<n<10K" if n < 10000 else "10K<n<100K" if n < 100000 else "n>100K"
original_dataset_url = kwargs.get("url", None)
original_paper_url = kwargs.get("original_paper_url", None)
paper = kwargs.get("paper", None)
authors = kwargs.get("authors", None)
original_authors = kwargs.get("original_authors", None)
original_citation_bibtex = kwargs.get("original_citation_bibtex", "")
citation_bibtex = kwargs.get("citation_bibtex", "")
paperswithcode_id = kwargs.get("paperswithcode_id", None)
repo = None
demo = None
curators = "Eduardo Dadalto"
dataset_card_authors = curators
dataset_card_contact = "https://huggingface.co/edadaltocg"
direct_use = (
"This dataset is intended to be used as an ouf-of-distribution dataset for image classification benchmarks."
)
curation_rationale_section = """The goal of curating and sharing this dataset on the HuggingFace Hub is to accelerate research and promote reproducibility in generalized Out-of-Distribution (OOD) detection.

Check the python library [detectors](https://github.com/edadaltocg/detectors) if you are interested in OOD detection."""
out_of_scope_use = "This dataset is not annotated."
personal_and_sensitive_information = "Please check original paper for details on the dataset."
bias_risks_limitations = "Please check original paper for details on the dataset."

citation_bibtex = (
"""@software{detectors2023,
author = {Eduardo Dadalto},
title = {Detectors: a Python Library for Generalized Out-Of-Distribution Detection},
url = {https://github.com/edadaltocg/detectors},
doi = {https://doi.org/10.5281/zenodo.7883596},
month = {5},
year = {2023}
}"""
+ "\n\n"
+ citation_bibtex
or "" + "\n\n" + original_citation_bibtex
or ""
).strip()

card_data = DatasetCardData(
task_categories=["image-classification"],
pretty_name=pretty_dataset_name,
license=license,
size_categories=size_categories,
paperswithcode_id=paperswithcode_id if paperswithcode_id and len(paperswithcode_id) > 1 else None,
)
dataset_card = DatasetCard.from_template(
card_data=card_data,
template_path=os.path.join("templates", "DATASET_CARD_TEMPLATE.md"),
paper=paper,
original_paper_url=original_paper_url,
demo=demo,
repo=repo,
dataset_card_authors=dataset_card_authors,
dataset_card_contact=dataset_card_contact,
curators=curators,
direct_use=direct_use,
curation_rationale_section=curation_rationale_section,
out_of_scope_use=out_of_scope_use,
personal_and_sensitive_information=personal_and_sensitive_information,
bias_risks_limitations=bias_risks_limitations,
citation_bibtex=citation_bibtex,
original_dataset_url=original_dataset_url,
authors=authors,
original_authors=original_authors,
)

repo_id = f"{dataset_name}-ood"
tag = f"{hf_id}/{repo_id}"
create_repo(repo_id=tag, exist_ok=True, token=token, private=private, repo_type="dataset")
dataset_card.push_to_hub(tag, token=token)
dataset.push_to_hub(tag, token=token)

# test
dataset = datasets.load_dataset(tag, split="train")
assert len(dataset) == n
for x in dataset:
print(x)
assert "image" in x
break


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_name", type=str, default="ssb_easy")
parser.add_argument("--pretty_dataset_name", type=str, default=None)
parser.add_argument("--paperswithcode_id", type=str, default=None)

parser.add_argument("--hf_id", type=str, default=None)
parser.add_argument("--private", action="store_true")
args = parser.parse_args()

dataset = detectors.create_dataset(args.dataset_name)

try:
arxiv = dataset.paper_url
parsed = parse_arxiv.main(arxiv)
except AttributeError:
parsed = {"link": None, "authors": None, "bibtext": None}

try:
license = dataset.license
except AttributeError:
license = "unknown"

try:
url = dataset.url
except AttributeError:
url = None

try:
original_paper_url = dataset.original_paper_url
original_parsed = parse_arxiv.main(original_paper_url)
except Exception:  # attribute missing or arXiv lookup failed
original_parsed = {"link": None, "authors": None, "bibtext": None}

kwargs = {
"original_paper_url": original_parsed["link"],
"paper": parsed["link"],
"original_authors": original_parsed["authors"],
"authors": parsed["authors"],
"citation_bibtex": parsed["bibtext"],
"original_citation_bibtex": original_parsed["bibtext"],
"url": url,
"paperswithcode_id": args.paperswithcode_id,
}

def gen():
for x, y in dataset:
yield {"image": x}

dataset = Dataset.from_generator(gen)

main(dataset, args.dataset_name, args.pretty_dataset_name, license=license, hf_id=args.hf_id or "detectors", private=args.private, **kwargs)
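
As a sanity check, a pushed dataset should be loadable back from the Hub, mirroring the test at the end of main(). A minimal sketch, assuming the default hf_id of "detectors" and a dataset pushed as ssb_easy (the repo id follows the f"{hf_id}/{dataset_name}-ood" pattern used above):

# Sketch: reload a dataset pushed by this script and inspect one record.
# "detectors/ssb_easy-ood" is an example repo id; it depends on hf_id and dataset_name.
import datasets

ds = datasets.load_dataset("detectors/ssb_easy-ood", split="train")
print(len(ds))
print(ds[0].keys())  # each record should expose an "image" field
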
3 changes: 3 additions & 0 deletions src/detectors/data/__init__.py
@@ -43,6 +43,7 @@
from .wilds_ds import make_wilds_dataset

_logger = logging.getLogger(__name__)

datasets_registry = {
"cifar10": CIFAR10Wrapped,
"cifar100": CIFAR100Wrapped,
@@ -70,6 +71,8 @@
"mos_inaturalist": MOSiNaturalist,
"mos_places365": MOSPlaces365,
"mos_sun": MOSSUN,
"inaturalist": MOSiNaturalist,
"sun": MOSSUN,
"ninco_full": NINCOFull,
"ninco": NINCO,
"ssb_hard": SSBHard,
5 changes: 4 additions & 1 deletion src/detectors/data/isun.py
@@ -7,7 +7,7 @@


class iSUN(ImageFolder):
"""`iSUN <ODIN_PAPER_URL>`_ Dataset subset.
"""iSUN Dataset.

Args:
root (string): Root directory of dataset where directory
@@ -25,6 +25,9 @@ class iSUN(ImageFolder):
filename = "iSUN.tar.gz"
file_md5 = "be77b0f2c26fda898afac5f99645ee70"
url = "https://www.dropbox.com/s/ssz7qxfqae0cca5/iSUN.tar.gz"
original_paper_url = "https://arxiv.org/abs/1507.01422"
paper_url = "https://arxiv.org/abs/1706.02690"
license = "unknown"
# size 8925

def __init__(
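
These new class attributes are what scripts/push_dataset_to_hf_hub.py reads (via the try/except blocks around dataset.paper_url, dataset.url and dataset.license) when assembling the dataset card. A small sketch; the import path is an assumption based on the file location:

# Sketch: the metadata added here is plain class-level data.
from detectors.data.isun import iSUN  # path assumed from src/detectors/data/isun.py

print(iSUN.paper_url)           # "https://arxiv.org/abs/1706.02690"
print(iSUN.original_paper_url)  # "https://arxiv.org/abs/1507.01422"
print(iSUN.license)             # "unknown"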