
Dev #20

Merged · 36 commits · Jun 19, 2024

Changes from 1 commit

Commits (36), all authored by edadaltocg:
f60ffa9  solve minor issues (Jun 1, 2023)
99a0332  implement Beyond AUROC & Co. (Jun 7, 2023)
9e8c52d  small changes (Jul 3, 2023)
628de5c  improve aggregations (Jul 3, 2023)
9626764  fix typo (Jul 3, 2023)
3acf527  ninco ssb clean benchmark (Aug 21, 2023)
14c7541  update imagenet benchmarks (Aug 21, 2023)
8e83cdb  small changes (Aug 23, 2023)
3888942  p-value combining method (Sep 13, 2023)
40bda81  small changes (Sep 13, 2023)
89928e7  update doi (Sep 25, 2023)
14aa3b2  update some datasets to hf hub (Oct 31, 2023)
8e8a773  crop_pct bug (Nov 2, 2023)
f147ea8  Ensembling + SC (Dec 14, 2023)
40ec8d6  fix pipelines (Dec 14, 2023)
4887663  docs (Jan 2, 2024)
260596e  fix docstrings (Jan 2, 2024)
802ec63  remove requirements.txt (Jan 2, 2024)
251ed22  remove ba (Jan 2, 2024)
db10b31  remove ba (Jan 2, 2024)
fe9ea60  bump version (Jan 2, 2024)
0bb1123  remove reqs (Jan 2, 2024)
c1c04a3  bump actions (Jan 2, 2024)
79ee723  minor fix (Jan 2, 2024)
00c0dc1  minor fix (Jan 2, 2024)
17beb4a  update pypi workflow (Jan 2, 2024)
30b9861  fix typo (Jan 2, 2024)
e7a67c3  download imagenet models (Jan 15, 2024)
64c0e7b  Improvements to pipelines (Jun 19, 2024)
e815c58  Code + docs cleaning (Jun 19, 2024)
20ffb04  Combine and conquer score aggregation (Jun 19, 2024)
eb9a811  helpful scripts (Jun 19, 2024)
dac1ec3  Merge branch 'master' of https://github.com/edadaltocg/detectors into… (Jun 19, 2024)
362f697  remove requirements.txt (Jun 19, 2024)
46bba8a  Format (Jun 19, 2024)
f6229f1  Simplify tests (Jun 19, 2024)
Commit 14c754151082dc0dd5bf05f6b123120e790fa9e0 ("update imagenet benchmarks")
edadaltocg committed Aug 21, 2023
27 changes: 26 additions & 1 deletion src/detectors/data/__init__.py
@@ -2,13 +2,16 @@
 Datasets module.
 """
 import logging
+import os
 from enum import Enum
 from functools import partial
 from typing import Callable, List, Optional, Type

 from torch.utils.data import Dataset
 from torchvision.datasets import STL10, SVHN, ImageNet, OxfordIIITPet, StanfordCars

+from .ninco_ssb_clean import NINCO, NINCOFull, SSBEasy, SSBHard, TexturesClean
+
 from ..config import DATA_DIR, IMAGENET_ROOT
 from .cifar_wrapper import CIFAR10Wrapped, CIFAR100Wrapped
 from .cifarc import CIFAR10_C, CIFAR100_C
@@ -57,6 +60,11 @@
     "mos_inaturalist": MOSiNaturalist,
     "mos_places365": MOSPlaces365,
     "mos_sun": MOSSUN,
+    "ninco_full": NINCOFull,
+    "ninco": NINCO,
+    "ssb_hard": SSBHard,
+    "ssb_easy": SSBEasy,
+    "textures_clean": TexturesClean,
     "cifar10_lt": None,
     "cifar100_lt": None,
     "imagenet1k_lt": None,
@@ -130,7 +138,7 @@ def create_dataset(
             `imagenet_a`, `imagenet_r`, `imagenet_o`, `openimage_o`, `oxford_pets`,
             `oxford_flowers`, `cub200`, `imagenet1k_c`, `blobs`, `rademacher`,
             `wilds_iwildcam`, `wilds_fmow`, `wilds_camelyon17`, `wilds_rxrx1`,
-            `wilds_poverty`, `wilds_globalwheat`.
+            `wilds_poverty`, `wilds_globalwheat`, `ninco`.
         root (string): Root directory of dataset.
         split (string, optional): Depends on the selected dataset.
         transform (callable, optional): A function/transform that takes in an PIL image
@@ -155,6 +163,23 @@ def create_dataset(
         raise ValueError("Dataset name is not specified")


+def delete_dataset(dataset_name: str, root: str = DATA_DIR):
+    dataset_cls = datasets_registry[dataset_name]
+    try:
+        os.remove(os.path.join(root, dataset_cls.filename))
+    except FileNotFoundError:
+        print(f"File {dataset_cls.filename} not found")
+    except Exception as e:
+        print(e)
+
+    try:
+        os.remove(os.path.join(root, dataset_cls.base_folder))
+    except FileNotFoundError:
+        print(f"Folder {dataset_cls.base_folder} not found")
+    except Exception as e:
+        print(e)
+
+
 def get_dataset_cls(dataset_name: str) -> Type[Dataset]:
     """Return dataset class by name.

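The new `delete_dataset` helper is the inverse of `create_dataset`: it resolves the registered class and removes its cached archive and extracted folder from `root`. A minimal usage sketch (the `"cifar10"` key and the `split` value are assumptions for illustration, not taken from this diff):

```python
# Hypothetical usage; create_dataset/delete_dataset come from the diff above.
from detectors.data import create_dataset, delete_dataset

ds = create_dataset("cifar10", split="test")  # class resolved via datasets_registry
print(len(ds))

# Removes <root>/<filename>; the <base_folder> attempt will usually just print
# an OSError, since os.remove does not delete directories.
delete_dataset("cifar10")
```

One caveat: the second `try` block calls `os.remove` on a directory, which on most platforms raises `IsADirectoryError` rather than `FileNotFoundError`, so extracted folders end up reported but not deleted; `shutil.rmtree` would be needed for that.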
2 changes: 0 additions & 2 deletions src/detectors/data/imagenet.py
@@ -24,7 +24,6 @@ class ImageNetA(ImageFolder):
     tgz_md5 = "c3e55429088dc681f30d81f4726b6595"

     def __init__(self, root: str, split=None, transform: Optional[Callable] = None, download: bool = False, **kwargs):
-
         self.root = root

         if download:
@@ -175,7 +174,6 @@ def _get_corruption_group(corruption: str):


 def _imagenet_c_to_npz(root: str, split: str, intensity: int, dest_folder: str = "ImageNetCnpz") -> None:
-
     dataset = ImageNetC(root, split, intensity, download=True)
     assert len(dataset) == 50_000, "ImageNetC should have 50,000 images. Please check the dataset."
     image_example = dataset[0][0]
4 changes: 2 additions & 2 deletions src/detectors/data/openimage_o.py
@@ -42,7 +42,7 @@ def __init__(
         super().__init__(self.dataset_folder, transform=transform, **kwargs)

     def _check_integrity(self) -> bool:
-        # assert number of iumages in folder is equal to 17632
+        # assert number of images in folder is equal to 17632
         if not self._check_exists():
             return False

@@ -52,7 +52,7 @@ def _check_integrity(self) -> bool:
             # check if current path is a file
             if os.path.isfile(os.path.join(self.dataset_folder, self.base_folder, path)):
                 count += 1
-        return count >= 16_000
+        return count >= 10_000

     def _check_exists(self) -> bool:
         return os.path.exists(self.dataset_folder)
65 changes: 65 additions & 0 deletions src/detectors/data/utils.py
@@ -0,0 +1,65 @@
+import logging
+import os
+from typing import Callable, Optional
+
+import numpy as np
+from PIL import Image
+from torch.utils.data.dataset import Dataset
+from tqdm import tqdm
+
+_logger = logging.getLogger(__name__)
+
+
+def image_dataset_to_npz(DatasetCls, root: str, split: str, **kwargs) -> None:
+    dataset = DatasetCls(root, split, download=True, **kwargs)
+    dest_folder = DatasetCls.base_folder + "_npz"
+    image_example = dataset[0][0]
+    width, height = image_example.size
+    _logger.info("Image size: %d x %d", width, height)
+    x = np.ndarray(shape=(len(dataset), height, width, 3), dtype=np.uint8)
+    y = np.ndarray(shape=(len(dataset)), dtype=np.int32)
+    for i in tqdm(range(len(dataset))):
+        image, label = dataset[i]
+        x[i] = image
+        y[i] = label
+
+    os.makedirs(os.path.join(root, dest_folder), exist_ok=True)
+    np.savez(os.path.join(root, dest_folder, f"{split}.npz"), x=x, y=y)
+
+
+class DatasetNpz(Dataset):
+    def __init__(
+        self,
+        root: str,
+        base_folder_name: str,
+        split: str,
+        transform: Optional[Callable] = None,
+        download: bool = False,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        self.root = os.path.expanduser(root)
+        self.base_folder_name = base_folder_name
+        self.split = split
+        self.path = os.path.join(self.root, self.base_folder_name, f"{split}.npz")
+        self.transform = transform
+
+        data = np.load(self.path, mmap_mode="r")
+        self.images = data["x"]
+        self.labels = data["y"]
+
+    def __getitem__(self, index):
+        x = self.images[index]
+        x = Image.fromarray(x)
+
+        if self.transform:
+            x = self.transform(x)
+
+        y = self.labels[index]
+        return x, y
+
+    def __len__(self):
+        return len(self.images)
+
+    def _check_exists(self) -> bool:
+        return os.path.exists(self.path)
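Together, `image_dataset_to_npz` and `DatasetNpz` form a convert-once, load-fast pattern: the conversion pass decodes every image into a single uint8 array per split, and the wrapper serves PIL images straight back out of the `.npz`. A hypothetical round trip, assuming `CIFAR10Wrapped` follows the `(root, split, ..., download=...)` constructor convention the converter expects and inherits torchvision's `base_folder` attribute:

```python
from detectors.data.cifar_wrapper import CIFAR10Wrapped
from detectors.data.utils import DatasetNpz, image_dataset_to_npz

root = "/tmp/data"

# One-off conversion: writes /tmp/data/<base_folder>_npz/test.npz holding
# x with shape (N, H, W, 3) as uint8 and y with shape (N,) as int32. All
# images must share one size, since x is preallocated from the first example.
image_dataset_to_npz(CIFAR10Wrapped, root, split="test")

# Later runs skip image decoding entirely.
ds = DatasetNpz(root, CIFAR10Wrapped.base_folder + "_npz", split="test")
image, label = ds[0]  # PIL image via Image.fromarray, plus the int32 label
```

One caveat worth noting: `np.load` ignores `mmap_mode` for `.npz` archives, so the arrays are still read fully into memory on first access.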
1 change: 0 additions & 1 deletion src/detectors/data/wilds_ds.py
@@ -59,7 +59,6 @@
 def make_wilds_dataset(
     dataset_name, root, split="train", transform: Optional[Callable] = None, download=False, **kwargs
 ):
-
     dataset = wilds.get_dataset(dataset_name, root_dir=root, download=download)
     assert dataset is not None
     dataset = dataset.get_subset(split, transform=transform)
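`make_wilds_dataset` is a thin adapter around `wilds.get_dataset` plus `get_subset`; the change here only drops a stray blank line. A hypothetical call, assuming the `wilds` package is installed (dataset names are WILDS's own, e.g. `iwildcam`, which the detectors registry exposes as `wilds_iwildcam`):

```python
from detectors.data.wilds_ds import make_wilds_dataset

# download=True fetches the benchmark on first use; WILDS subsets yield
# (x, y, metadata) tuples rather than plain (x, y) pairs.
ds = make_wilds_dataset("iwildcam", root="data/", split="test", download=True)
x, y, metadata = ds[0]
```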