[PIMO] compare to benchmark #2378

Open · wants to merge 5 commits into main
1,650 changes: 595 additions & 1,055 deletions notebooks/700_metrics/701e_aupimo_advanced_iv.ipynb

Large diffs are not rendered by default.

1,413 changes: 1,413 additions & 0 deletions notebooks/700_metrics/701f_aupimo_advanced_v.ipynb

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions src/anomalib/metrics/pimo/__init__.py
@@ -9,6 +9,14 @@

from .binary_classification_curve import ThresholdMethod
from .pimo import AUPIMO, PIMO, AUPIMOResult, PIMOResult
from .utils_benchmark import (
AUPIMO_BENCHMARK_DATASETS,
AUPIMO_BENCHMARK_MODELS,
aupimo_result_from_json_dict,
aupimo_result_to_json_dict,
download_aupimo_benchmark_scores,
get_aupimo_benchmark,
)

__all__ = [
# constants
@@ -20,4 +28,11 @@
"PIMO",
"AUPIMO",
"StatsOutliersPolicy",
# utils_benchmark
"AUPIMO_BENCHMARK_DATASETS",
"AUPIMO_BENCHMARK_MODELS",
"aupimo_result_from_json_dict",
"aupimo_result_to_json_dict",
"download_aupimo_benchmark_scores",
"get_aupimo_benchmark",
]
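For context, a minimal usage sketch of the new public exports, assuming anomalib is installed and the benchmark JSONs in the official AUPIMO repository are reachable over the network; the (model, dataset) pair below is an arbitrary example:

# Minimal sketch of the new public API added by this PR.
from anomalib.metrics.pimo import (
    AUPIMO_BENCHMARK_DATASETS,
    AUPIMO_BENCHMARK_MODELS,
    download_aupimo_benchmark_scores,
)

# The constants enumerate the models and "collection/category" datasets covered by the benchmark.
print(sorted(AUPIMO_BENCHMARK_MODELS)[:3])
print(sorted(AUPIMO_BENCHMARK_DATASETS)[:3])

# Download the benchmark AUPIMO scores of one model on one dataset.
results = download_aupimo_benchmark_scores(model="patchcore_wr50", dataset="mvtec/bottle")
json_dict, aupimo_result = results["patchcore_wr50", "mvtec/bottle"]
print(aupimo_result.aupimos.shape)  # one AUPIMO score per test image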
347 changes: 347 additions & 0 deletions src/anomalib/metrics/pimo/utils_benchmark.py
@@ -0,0 +1,347 @@
"""Utility functions to compare AUPIMO scores with the benchmark results from AUPIMO's official repository.

Official repository: https://github.com/jpcbertoldo/aupimo
Benchmark data: https://github.com/jpcbertoldo/aupimo/tree/main/data/experiments/benchmark
"""

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import json
import logging
from concurrent.futures import ThreadPoolExecutor
from itertools import product
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import torch
from pandas import DataFrame, Series

from .dataclasses import AUPIMOResult

logger = logging.getLogger(__name__)

AUPIMO_BENCHMARK_MODELS = {
"efficientad_wr101_m_ext",
"efficientad_wr101_s_ext",
"fastflow_cait_m48_448",
"fastflow_wr50",
"padim_r18",
"padim_wr50",
"patchcore_wr101",
"patchcore_wr50",
"pyramidflow_fnf_ext",
"pyramidflow_r18_ext",
"rd++_wr50_ext",
"simplenet_wr50_ext",
"uflow_ext",
}

AUPIMO_BENCHMARK_DATASETS = {
"mvtec/bottle",
"mvtec/cable",
"mvtec/capsule",
"mvtec/carpet",
"mvtec/grid",
"mvtec/hazelnut",
"mvtec/leather",
"mvtec/metal_nut",
"mvtec/pill",
"mvtec/screw",
"mvtec/tile",
"mvtec/toothbrush",
"mvtec/transistor",
"mvtec/wood",
"mvtec/zipper",
"visa/candle",
"visa/capsules",
"visa/cashew",
"visa/chewinggum",
"visa/fryum",
"visa/macaroni1",
"visa/macaroni2",
"visa/pcb1",
"visa/pcb2",
"visa/pcb3",
"visa/pcb4",
"visa/pipe_fryum",
}


def _validate_benchmark_model(model: str) -> None:
if model not in AUPIMO_BENCHMARK_MODELS:
msg = f"Model '{model}' not available. Choose one of {sorted(AUPIMO_BENCHMARK_MODELS)}."
raise ValueError(msg)


def _validate_benchmark_dataset(dataset: str) -> None:
if dataset not in AUPIMO_BENCHMARK_DATASETS:
msg = f"Dataset '{dataset}' not available. Choose one of {sorted(AUPIMO_BENCHMARK_DATASETS)}."
raise ValueError(msg)


def _get_benchmark_json_url(model: str, dataset: str) -> str:
"""Generate the URL for the JSON file of a specific model and dataset.

Args:
model: see `anomalib.metrics.pimo.AUPIMO_BENCHMARK_MODELS`
dataset: "collection/category", see `anomalib.metrics.pimo.AUPIMO_BENCHMARK_DATASETS`

Returns:
The URL for the JSON file of the model and dataset in the benchmark from AUPIMO's official repository.
"""
root_url = "https://raw.githubusercontent.com/jpcbertoldo/aupimo/refs/heads/main/data/experiments/benchmark"
_validate_benchmark_model(model)
_validate_benchmark_dataset(dataset)
return f"{root_url}/{model}/{dataset}/aupimo/aupimos.json"


def _download_benchmark_json(url: str) -> dict:
    """Download the JSON content from a URL."""
    response = requests.get(url, timeout=10)
    # fail early with an HTTP error instead of trying to parse an error page as JSON
    response.raise_for_status()
    return response.json()


def aupimo_result_from_json_dict(payload: dict) -> AUPIMOResult:
"""Convert the dictionary from a JSON payload to an AUPIMOResult dataclass.

Args:
payload: The JSON from the benchmark results:
{
"fpr_lower_bound": float,
"fpr_upper_bound": float,
"num_thresholds": int | None, # or "num_threshs"
"thresh_lower_bound": float,
"thresh_upper_bound": float,
"aupimos": list[float],
}

Returns:
An `anomalib.metrics.pimo.AUPIMOResult` dataclass.
"""
if not isinstance(payload, dict):
msg = f"Invalid payload. Must be a dictionary. Got {type(payload)}."
raise TypeError(msg)
try:
# `num_threshs` vs `num_thresholds` is an inconsistency with an older version of the JSON file
num_thresholds: int | None = payload["num_threshs"] if "num_threshs" in payload else payload["num_thresholds"]
return AUPIMOResult(
fpr_lower_bound=float(payload["fpr_lower_bound"]),
fpr_upper_bound=float(payload["fpr_upper_bound"]),
num_thresholds=num_thresholds if num_thresholds is None else int(num_thresholds),
thresh_lower_bound=float(payload["thresh_lower_bound"]),
thresh_upper_bound=float(payload["thresh_upper_bound"]),
aupimos=torch.tensor(payload["aupimos"], dtype=torch.float64),
)

except KeyError as ex:
msg = f"Invalid payload. Missing key {ex}."
raise ValueError(msg) from ex

except (TypeError, ValueError) as ex:
msg = f"Invalid payload. Cause: {ex}."
raise ValueError(msg) from ex


def aupimo_result_to_json_dict(
aupimo_result: AUPIMOResult,
paths: list[str | Path] | None = None,
) -> dict:
"""Convert the AUPIMOResult dataclass to a dictionary for JSON serialization.

Args:
aupimo_result: The AUPIMO scores from the benchmark results.
paths: The paths of the images used to compute the AUPIMO scores. Optional.

Returns:
A dictionary with the AUPIMO scores and the paths.
"""
payload = {
"fpr_lower_bound": aupimo_result.fpr_lower_bound,
"fpr_upper_bound": aupimo_result.fpr_upper_bound,
"num_thresholds": aupimo_result.num_thresholds,
"thresh_lower_bound": aupimo_result.thresh_lower_bound,
"thresh_upper_bound": aupimo_result.thresh_upper_bound,
"aupimos": aupimo_result.aupimos.tolist(),
}
if paths is not None:
if len(paths) != aupimo_result.aupimos.shape[0]:
msg = (
"Invalid paths. It must have the same length as the AUPIMO scores. "
f"Got {len(paths)} paths and {aupimo_result.aupimos.shape[0]} scores."
)
raise ValueError(msg)
# make sure the paths are strings, not pathlib.Path objects
payload["paths"] = [str(p) for p in paths]
return payload


def _download_aupimo_benchmark_scores(model: str, dataset: str) -> tuple[dict, AUPIMOResult]:
"""Get the benchmark AUPIMO scores for a specific model and dataset from AUPIMO's official repository.

Args:
model: see `anomalib.metrics.pimo.AUPIMO_BENCHMARK_MODELS`
dataset: "collection/category", see `anomalib.metrics.pimo.AUPIMO_BENCHMARK_DATASETS`

Returns:
(dict, AUPIMOResult): A tuple with the JSON payload and the AUPIMO scores.
            dict: The deserialized JSON payload from the benchmark results. See `aupimo_result_from_json_dict`.
AUPIMOResult: The AUPIMO scores in dataclass format. See `anomalib.metrics.pimo.AUPIMOResult`.
"""
logger.debug(f"Loading benchmark results for {model=} {dataset=}")
url = _get_benchmark_json_url(model, dataset)
logger.debug(f"Dowloading JSON from {url=}")
payload = _download_benchmark_json(url)
logger.debug("Converting json payload to dataclass")
aupimo_result = aupimo_result_from_json_dict(payload)
logger.debug(f"Done loading benchmark results for {model=} {dataset=}")
return payload, aupimo_result


def _download_aupimo_benchmark_scores_multithreaded(
model: str,
dataset: str,
) -> tuple[tuple[str, str], tuple[dict, AUPIMOResult]]:
"""Do the same as `_download_aupimo_benchmark_scores` but return the job's arguments as well."""
return (model, dataset), _download_aupimo_benchmark_scores(model, dataset)


def download_aupimo_benchmark_scores(
model: str | None,
dataset: str | None,
avoid_multithread_download: bool = False,
) -> dict[tuple[str, str], tuple[dict, AUPIMOResult]]:
"""Dowload AUPIMO scores AUPIMO's paper benchmark (stored in the official repository).

If `model` is None, all models are considered.
If `dataset` is None, all datasets are considered.
If both `model` and `dataset` are None, all combinations of models and datasets are considered.

Official repository: https://github.com/jpcbertoldo/aupimo
Benchmark data: https://github.com/jpcbertoldo/aupimo/tree/main/data/experiments/benchmark

Args:
model: The model name. Available models: `anomalib.metrics.pimo.AUPIMO_BENCHMARK_MODELS`. If None, all models.
dataset: The "collection/category", where 'collection' is either 'mvtec' or 'visa', and 'category' is
the name of the dataset within the collection. Lowercase, words split by '_' (e.g. 'metal_nut').
Available datasets: `anomalib.metrics.pimo.AUPIMO_BENCHMARK_DATASETS`. If None, all datasets.
avoid_multithread_download: Multi-threaded download is used by default when downloading multiple files.
Set this to `True` to force single-threaded download.

Returns:
dict[(model, dataset), (dict, AUPIMOResult)]: dictionary of results.
key: (model, dataset) pair, e.g. ('efficientad_wr101_m_ext', 'mvtec/bottle')
value: tuple with the JSON payload dictionary and the AUPIMO scores (dataclass).
            dict: The deserialized JSON payload from the benchmark results. See `aupimo_result_from_json_dict`.
AUPIMOResult: The AUPIMO scores in dataclass format. See `anomalib.metrics.pimo.AUPIMOResult`.
"""
if model is None:
models = sorted(AUPIMO_BENCHMARK_MODELS)
else:
_validate_benchmark_model(model)
models = [model]

if dataset is None:
datasets = sorted(AUPIMO_BENCHMARK_DATASETS)
else:
_validate_benchmark_dataset(dataset)
datasets = [dataset]

args = list(product(models, datasets))
logger.debug(f"Downloading benchmark results for {len(args)} (model, dataset) pairs")

if len(args) == 1:
logger.debug("Using single-threaded download.")
return {args[0]: _download_aupimo_benchmark_scores(models[0], datasets[0])}

if avoid_multithread_download:
logger.debug(f"Using single-threaded download due to {avoid_multithread_download=}")
results = {}
for model_, dataset_ in args:
results[model_, dataset_] = _download_aupimo_benchmark_scores(model_, dataset_)
return results

logger.debug("Using multi-threaded download.")
models, datasets = list(zip(*args, strict=True)) # type: ignore # noqa: PGH003
with ThreadPoolExecutor(thread_name_prefix="download_from_aupimo_benchmark_") as executor:
results = executor.map(_download_aupimo_benchmark_scores_multithreaded, models, datasets) # type: ignore # noqa: PGH003
return dict(results)


def get_aupimo_benchmark(
model: str | None,
dataset: str | None,
avoid_multithread_download: bool = False,
) -> tuple[DataFrame, DataFrame]:
"""Dowload results from AUPIMO's paper benchmark (stored in the official repository) and format in DataFrames.

If `model` is None, all models are considered.
If `dataset` is None, all datasets are considered.
If both `model` and `dataset` are None, all combinations of models and datasets are considered.

Official repository: https://github.com/jpcbertoldo/aupimo
Benchmark data: https://github.com/jpcbertoldo/aupimo/tree/main/data/experiments/benchmark

Args:
model: The model name. Available models: `anomalib.metrics.pimo.AUPIMO_BENCHMARK_MODELS`. If None, all models.
dataset: The "collection/category", where 'collection' is either 'mvtec' or 'visa', and 'category' is
the name of the dataset within the collection. Lowercase, words split by '_' (e.g. 'metal_nut').
Available datasets: `anomalib.metrics.pimo.AUPIMO_BENCHMARK_DATASETS`. If None, all datasets.
avoid_multithread_download: Multi-threaded download is used by default when downloading multiple files.
Set this to `True` to force single-threaded download.

Returns:
(data_per_set, data_per_image): A tuple with two DataFrames.
            data_per_set: per-(model, dataset) metadata, e.g. the anomaly score thresholds at which the
                FPR integration bounds are met, plus the average AUPIMO score (`aupimo_mean`).
            data_per_image: the AUPIMO score of each image and the path to the input image.
"""
# don't validate model and dataset here, it's done in `download_aupimo_benchmark_scores`
results = download_aupimo_benchmark_scores(
model=model,
dataset=dataset,
avoid_multithread_download=avoid_multithread_download,
)

data = pd.DataFrame.from_records([
{
"model": model,
"dataset": dataset,
# per-set data
"fpr_lower_bound": aupimo_result.fpr_lower_bound,
"fpr_upper_bound": aupimo_result.fpr_upper_bound,
"num_thresholds": aupimo_result.num_thresholds,
"thresh_lower_bound": aupimo_result.thresh_lower_bound,
"thresh_upper_bound": aupimo_result.thresh_upper_bound,
# per-image data
"sample_index": list(range(num_samples := len(aupimo_result.aupimos))),
"aupimo": aupimo_result.aupimos.tolist(),
"path": paths if (paths := json_dict["paths"]) is not None else [None] * num_samples,
}
for (model, dataset), (json_dict, aupimo_result) in results.items()
])
data["model"] = data["model"].astype("category")
data["dataset"] = data["dataset"].astype("category")

data_per_set = (
data.drop(columns=["sample_index", "aupimo", "path"]).sort_values(["model", "dataset"]).reset_index(drop=True)
)

data_per_image = (
data[["model", "dataset", "sample_index", "aupimo", "path"]]
.explode(["sample_index", "aupimo", "path"])
.sort_values(["model", "dataset", "sample_index"])
.reset_index(drop=True)
.astype({"sample_index": int, "aupimo": float, "path": "string"})
)

def get_average_aupimo(row: Series) -> float:
model = row["model"] # noqa: F841
dataset = row["dataset"] # noqa: F841
aupimos = data_per_image.query("model == @model and dataset == @dataset")["aupimo"]
return np.nanmean(aupimos)

data_per_set["aupimo_mean"] = data_per_set.apply(get_average_aupimo, axis=1)

return data_per_set, data_per_image
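Similarly, a minimal sketch of `get_aupimo_benchmark`, assuming network access and an arbitrary (model, dataset) pair; it shows how the two returned DataFrames relate and how a locally computed AUPIMO mean could be compared against `aupimo_mean`:

import numpy as np

from anomalib.metrics.pimo import get_aupimo_benchmark

# One model on one dataset; pass None for either argument to download all of them.
data_per_set, data_per_image = get_aupimo_benchmark(model="padim_r18", dataset="mvtec/leather")

benchmark_mean = data_per_set.loc[0, "aupimo_mean"]
# Cross-check: recompute the mean from the per-image scores (NaN scores are ignored, as in `get_average_aupimo`).
recomputed_mean = np.nanmean(data_per_image["aupimo"].to_numpy())
print(f"mean AUPIMO: {benchmark_mean:.4f} (per-set) vs {recomputed_mean:.4f} (recomputed per-image)")

# A locally computed `AUPIMOResult` (e.g. a hypothetical `my_result`) could then be compared against
# `benchmark_mean` via `torch.nanmean(my_result.aupimos)`.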