From 5a11d01497ca42f849a779c99a12d6b5caf6bf73 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 28 Apr 2025 13:27:17 +0100
Subject: [PATCH 01/72] Notebook to visualise detections only

---
 MANIFEST.in                      |   4 +
 notebook_visualise_detections.py | 183 +++++++++++++++++++++++++++++++
 2 files changed, 187 insertions(+)
 create mode 100644 notebook_visualise_detections.py

diff --git a/MANIFEST.in b/MANIFEST.in
index d5fb477d..4388fd3b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -13,3 +13,7 @@ recursive-include docs *.md *.rst *.py
 # Include json schemas
 recursive-include ethology/annotations/json_schemas/schemas *.json
 recursive-include ethology/annotations/json_schemas/schemas *.md
+
+
+# Temporarily include notebooks
+include *.py
diff --git a/notebook_visualise_detections.py b/notebook_visualise_detections.py
new file mode 100644
index 00000000..f5f6f4bd
--- /dev/null
+++ b/notebook_visualise_detections.py
@@ -0,0 +1,183 @@
+"""Run detection only.
+
+A script to run detection only and export them in a format that
+can be loaded in movement napari widget
+"""
+
+# %%
+from datetime import datetime
+from pathlib import Path
+
+import cv2
+import numpy as np
+import torch
+import torchvision.transforms.v2 as transforms
+import yaml
+from movement.io import load_poses, save_poses
+from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Set default device: CUDA if available, otherwise mps, otherwise CPU
+device = torch.device(
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Input data
+video_path = Path(
+    "/home/sminano/swc/project_ethology/tap_models_crabs/input/04.09.2023-04-Right_RE_test.mp4"
+)
+
+trained_model_path = Path(
+    "/home/sminano/swc/project_ethology/run_slurm_5313275_0/ml-runs_317777717624044570_40b1688a76d94bd08175cb380d0a6e0e_checkpoints_last.ckpt"
+)
+trained_model_config_path = Path(
+    "/home/sminano/swc/project_ethology/run_slurm_5313275_0/01_config_all_data_augmentation.yaml"
+)
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Helper functions
+
+
+def open_video(video_path: str | Path) -> cv2.VideoCapture:
+    """Open a video file with OpenCV; raise OSError if it cannot be opened."""
+    video_object = cv2.VideoCapture(str(video_path))  # cv2 needs a str path
+    if not video_object.isOpened():
+        raise OSError(f"Error opening video file: {video_path}")
+    return video_object
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Load model
+
+# Read config
+with open(trained_model_config_path) as f:
+    trained_model_config = yaml.safe_load(f)
+
+# Load structure
+model = fasterrcnn_resnet50_fpn_v2(
+    weights=None,
+    weights_backbone=None,
+    num_classes=trained_model_config["num_classes"],
+)
+
+# Read state dict onto `device` and drop the "model." prefix from its keys
+state_dict = torch.load(trained_model_path, map_location=device)
+state_dict_model = {
+    k.removeprefix("model."): v  # not lstrip(): it strips a character set
+    for k, v in state_dict["state_dict"].items()
+    if k.startswith("model.")
+}
+
+# Load weights into model and set to evaluation mode
+model.load_state_dict(state_dict_model)
+model.eval()
+model.to(device)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Define transforms to apply to input frames
+inference_transforms = transforms.Compose(
+    [
+        transforms.ToImage(),
+        transforms.ToDtype(torch.float32, scale=True),
+    ]
+)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Run detection only
+
+# Initialise dict to store the detections of each frame
+detections_all_frames = {}
+
+# Loop over frames
+frame_idx = 0
+input_video_object = open_video(video_path)
+total_n_frames = int(input_video_object.get(cv2.CAP_PROP_FRAME_COUNT))
+
+
+while input_video_object.isOpened():
+    # Read frame
+    ret, frame = input_video_object.read()
+
+    # If ret is False, it means we have reached the end of the video
+    if not ret:
+        break
+
+    # Apply transforms to frame and place tensor on device
+    # NOTE(review): cv2 yields BGR frames; confirm the model expects BGR
+    image_tensor = inference_transforms(frame).to(device)[None]
+    # Run detection
+    with torch.no_grad():
+        # use [0] to select the one image in the batch
+        # Returns: dictionary with data of the predicted bounding boxes.
+        # The keys are: "boxes", "scores", and "labels". The labels
+        # refer to the class of the object detected, and not its ID.
+        detections_dict = model(image_tensor)[0]
+
+    # Add to dict
+    bboxes_xyxy = detections_dict["boxes"].cpu().numpy()
+    bbox_confidences = detections_dict["scores"].cpu().numpy()
+    bbox_centroids = (bboxes_xyxy[:, 0:2] + bboxes_xyxy[:, 2:4]) / 2
+
+    detections_all_frames[frame_idx] = {
+        "bbox_centroids": bbox_centroids,  # detection_idx, x, y
+        "bbox_confidences": bbox_confidences,  # detection_idx, confidence
+    }
+
+    # Update frame index
+    frame_idx += 1
+
+input_video_object.release()  # free the video handle after the last frame
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Format detections as a movement dataset
+
+max_detections_per_frame = max(
+    [
+        dets["bbox_centroids"].shape[0]
+        for dets in detections_all_frames.values()
+    ]
+)
+n_keypoints = 1
+
+# Pad arrays with nans
+position_array = np.full(
+    (total_n_frames, 2, n_keypoints, max_detections_per_frame),
+    np.nan,
+)  # (n_frames, n_space, n_keypoints, n_individuals)
+confidence_array = np.full(
+    (total_n_frames, n_keypoints, max_detections_per_frame),
+    np.nan,
+)  # (n_frames, n_keypoints, n_individuals)
+for frame_idx, dets in detections_all_frames.items():
+    position_array[frame_idx, :, :, : dets["bbox_centroids"].shape[0]] = (
+        np.transpose(dets["bbox_centroids"][None], (-1, 0, 1))
+    )
+    confidence_array[frame_idx, :, : dets["bbox_centroids"].shape[0]] = dets[
+        "bbox_confidences"
+    ][None, None]
+
+
+# %%
+ds = load_poses.from_numpy(
+    position_array=position_array,
+    confidence_array=confidence_array,
+    individual_names=[
+        f"untracked_{i}" for i in range(max_detections_per_frame)
+    ],
+    keypoint_names=["centroid"],
+)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Export movement dataset
+
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+save_poses.to_sleap_analysis_file(
+    ds, f"detections_untracked_{video_path.stem}_{timestamp}.h5"
+)
+
+# %%

From 56c17525f24d36e931f48da287366d5c24d6e36d Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 2 May 2025 19:39:53 +0100
Subject: [PATCH 02/72] Add boxmot dependency

---
 pyproject.toml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3fb07d7b..3d7834a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,9 +18,7 @@ classifiers = [
   "Operating System :: OS Independent",
   "License :: OSI Approved :: BSD License",
 ]
-dependencies = [
-  "movement",
-]
+dependencies = ["movement", "boxmot"]
 
 [project.urls]
 "Homepage" = "https://github.com/neuroinformatics-unit/ethology"

From 6985f74e36b754766bc0545e4c70f66993fbee10 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 26 Jun 2025 11:56:38 +0100
Subject: [PATCH 03/72] Rename and add new notebook

---
 notebook_run_detection_on_dataset.py                        | 5 +++++
 ...lise_detections.py => notebook_run_detection_on_video.py | 6 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)
 create mode 100644 notebook_run_detection_on_dataset.py
 rename notebook_visualise_detections.py => notebook_run_detection_on_video.py (96%)

diff --git a/notebook_run_detection_on_dataset.py b/notebook_run_detection_on_dataset.py
new file mode 100644
index 00000000..3f1d92ac
--- /dev/null
+++ b/notebook_run_detection_on_dataset.py
@@ -0,0 +1,5 @@
+"""Run detection on a Pytorch dataset and export results as a movement dataset.
+
+A script to run detection only on an input video and export the results
+in a format that can be loaded in movement napari widget.
+"""
diff --git a/notebook_visualise_detections.py b/notebook_run_detection_on_video.py
similarity index 96%
rename from notebook_visualise_detections.py
rename to notebook_run_detection_on_video.py
index f5f6f4bd..43403f80 100644
--- a/notebook_visualise_detections.py
+++ b/notebook_run_detection_on_video.py
@@ -1,7 +1,7 @@
-"""Run detection only.
+"""Run detection on video and export results as a movement dataset.
 
-A script to run detection only and export them in a format that
-can be loaded in movement napari widget
+A script to run detection only on an input video and export the results
+in a format that can be loaded in movement napari widget.
 """
 
 # %%

From fb52d3c6bd9c947a1341734c46c198a8075dfa60 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 26 Jun 2025 18:15:40 +0100
Subject: [PATCH 04/72] Add dependencies for metrics computation

---
 pyproject.toml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3d7834a2..db4af857 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,13 @@ classifiers = [
   "Operating System :: OS Independent",
   "License :: OSI Approved :: BSD License",
 ]
-dependencies = ["movement", "boxmot"]
+dependencies = [
+  "movement",
+  "boxmot",
+  "mlflow-skinny",
+  "pycocotools",
+  "torchmetrics"
+]
 
 [project.urls]
 "Homepage" = "https://github.com/neuroinformatics-unit/ethology"

From 0e3194cca47f694d1ea953be8a7aeaac51c905ed Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 26 Jun 2025 18:16:19 +0100
Subject: [PATCH 05/72] Notebook to run and evaluate detector on a dataset
 (WIP)

---
 notebook_run_detection_on_dataset.py | 399 ++++++++++++++++++++++++++-
 1 file changed, 397 insertions(+), 2 deletions(-)

diff --git a/notebook_run_detection_on_dataset.py b/notebook_run_detection_on_dataset.py
index 3f1d92ac..eede1ce4 100644
--- a/notebook_run_detection_on_dataset.py
+++ b/notebook_run_detection_on_dataset.py
@@ -1,5 +1,400 @@
 """Run detection on a Pytorch dataset and export results as a movement dataset.
 
-A script to run detection only on an input video and export the results
-in a format that can be loaded in movement napari widget.
+A script to run detection only (no tracking) on a Pytorch dataset and
+export the results in a format that can be loaded in movement napari widget.
 """
+
+# %%
+import ast
+from datetime import datetime
+from pathlib import Path
+
+import cv2
+import numpy as np
+import torch
+import torchvision.transforms.v2 as transforms
+from mlflow.tracking import MlflowClient
+from movement.io import load_poses, save_poses
+from torch.utils.data import random_split
+from torchmetrics.detection import MeanAveragePrecision
+from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
+from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Input data
+dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
+
+trained_model_path = Path(
+    "/home/sminano/swc/project_crabs/ml-runs/617393114420881798/f348d9d196934073bece1b877cbc4d38/checkpoints/last.ckpt"
+)
+
+trained_model_mlflow_params_path = Path(
+    "/home/sminano/swc/project_crabs/ml-runs/617393114420881798/f348d9d196934073bece1b877cbc4d38/params"
+)  # for config
+
+
+# to save output frames and detections
+output_parent_dir = Path("/home/sminano/swc/project_ethology")
+
+flag_save_frames = False
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Set default device: CUDA if available, otherwise mps, otherwise CPU
+device = torch.device(
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Retrieve model config and CLI args from mlflow
+
+
+def read_mlflow_params(
+    trained_model_path: str | Path, tracking_uri: str | None = None
+) -> dict:
+    """Read parameters and run name of the MLflow run owning a checkpoint."""
+    # Default to the ml-runs directory that holds the checkpoint:
+    # <ml-runs>/<experiment_id>/<run_id>/checkpoints/<file>.ckpt
+    run_dir = Path(trained_model_path).parents[1]
+    uri = tracking_uri or str(Path(trained_model_path).parents[3])
+    client = MlflowClient(tracking_uri=uri)
+
+    # The run ID is the name of the run directory
+    run = client.get_run(run_dir.name)
+
+    # Copy the parameters so that the run object is not mutated
+    params = dict(run.data.params)
+    params["run_name"] = run.info.run_name
+    return params
+
+
+mlflow_params = read_mlflow_params(trained_model_path)
+config = {
+    k.removeprefix("config/"): ast.literal_eval(v)
+    for k, v in mlflow_params.items()
+    if k.startswith("config/")
+}
+
+
+def safe_eval_string(s):
+    """Parse ``s`` as a Python literal; return ``s`` unchanged on failure."""
+    try:
+        return ast.literal_eval(s)
+    except (ValueError, SyntaxError):
+        # not a valid Python literal: hand the input back unchanged
+        return s
+
+
+cli_args = {
+    k.removeprefix("cli_args/"): safe_eval_string(v)
+    for k, v in mlflow_params.items()
+    if k.startswith("cli_args/")
+}
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Load model
+
+# Load structure
+model = fasterrcnn_resnet50_fpn_v2(
+    weights=None,
+    weights_backbone=None,
+    num_classes=config["num_classes"],
+)
+
+# Read state dict onto `device` and drop the "model." prefix from its keys
+state_dict = torch.load(trained_model_path, map_location=device)
+state_dict_model = {
+    k.removeprefix("model."): v  # not lstrip(): it strips a character set
+    for k, v in state_dict["state_dict"].items()
+    if k.startswith("model.")
+}
+
+# Load weights into model and set to evaluation mode
+model.load_state_dict(state_dict_model)
+model.eval()
+model.to(device)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Define transforms to apply to input frames
+inference_transforms = transforms.Compose(
+    [
+        transforms.ToImage(),
+        transforms.ToDtype(torch.float32, scale=True),
+    ]
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Build Pytorch dataset
+seed_n = cli_args["seed_n"]
+annotations_filename = Path(cli_args["annotation_files"][0]).name
+
+# create "default" COCO dataset
+dataset_coco = CocoDetection(
+    Path(dataset_dir) / "frames",
+    Path(dataset_dir) / "annotations" / annotations_filename,
+    transforms=inference_transforms,
+)
+
+# wrap dataset for transforms v2
+dataset_transformed = wrap_dataset_for_transforms_v2(dataset_coco)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Split dataset
+# def _collate_fn(self, batch: tuple) -> tuple:
+#     """Collate function used for dataloaders.
+
+#     A custom function is needed for detection
+#     because the number of bounding boxes varies
+#     between images of the same batch.
+#     See https://pytorch.org/vision/main/auto_examples/transforms/plot_transforms_e2e.html#data-loading-and-training-loop
+
+#     Parameters
+#     ----------
+#     batch : tuple
+#         a tuple of 2 tuples, the first one holding all images in the batch,
+#         and the second one holding the corresponding annotations.
+
+#     Returns
+#     -------
+#     tuple
+#         a tuple of length = batch size, made up of (image, annotations)
+#         tuples.
+
+#     """
+#     return tuple(zip(*batch))
+
+
+# Split data into train and test-val sets
+rng_train_split = torch.Generator().manual_seed(seed_n)
+rng_val_split = torch.Generator().manual_seed(seed_n)
+
+train_dataset, test_val_dataset = random_split(
+    dataset_transformed,
+    [config["train_fraction"], 1 - config["train_fraction"]],
+    generator=rng_train_split,
+)
+
+# Split test/val sets from the remainder
+test_dataset, val_dataset = random_split(
+    test_val_dataset,
+    [
+        1 - config["val_over_test_fraction"],
+        config["val_over_test_fraction"],
+    ],
+    generator=rng_val_split,
+)
+
+print(f"Seed: {seed_n}")
+print(f"Number of training samples: {len(train_dataset)}")
+print(f"Number of validation samples: {len(val_dataset)}")
+print(f"Number of test samples: {len(test_dataset)}")
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Run detection on validation set
+
+# TODO: use dataloader for efficiency?
+detections_per_validation_sample = {}
+annotations_per_validation_sample = {}
+
+# # initialise metric
+# metric = MeanAveragePrecision(
+#     box_format="xyxy",
+#     iou_type="bbox",
+#     iou_thresholds=None,#[0.1],  # If set to None [0.5,...,0.95] with step 0.05
+#     rec_thresholds=None,  # If set to None [0,...,1] with step 0.01
+#     max_detection_thresholds=[1, 100, 1000],
+#     extended_summary=True,
+#     average="micro",  # macro=average per class first, should be same for me?
+# )
+
+metric_per_frame = MeanAveragePrecision(
+    box_format="xyxy",
+    iou_type="bbox",
+    iou_thresholds=[0.1], # 0.5  # If set to None [0.5,...,0.95] with step 0.05
+    rec_thresholds=None,  # If set to None [0,...,1] with step 0.01 -- these are the interpolation points
+    max_detection_thresholds=[10, 100, 1000],
+    extended_summary=True,
+    average="micro",  # macro=average per class first, should be same for me?
+)
+
+# create output directory if it doesn't exist
+if flag_save_frames:
+    output_dir = output_parent_dir / f"{dataset_dir.name}_val_seed_n_{seed_n}"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+
+recall_per_sample = []
+precision_per_sample = []
+map_per_sample = []
+
+for val_idx, (image, annotations) in enumerate(val_dataset):
+    # Apply transforms to frame and place tensor on device
+    image_tensor = inference_transforms(image).to(device)[None]
+
+    # Put annotations in same device as image
+    annotations["boxes"] = annotations["boxes"].to(device)
+    annotations["labels"] = annotations["labels"].to(device)
+
+    # Run detection
+    with torch.no_grad():
+        # use [0] to select the one image in the batch
+        # Returns: dictionary with data of the predicted bounding boxes.
+        # The keys are: "boxes", "scores", and "labels". The labels
+        # refer to the class of the object detected, and not its ID.
+        detections_dict = model(image_tensor)[0]
+
+        # add to metric
+        # metric.update([detections_dict], [annotations])
+
+        # add to metric per frame
+        metric_per_frame.reset()
+        metric_per_frame.update([detections_dict], [annotations])
+        metrics_one_frame = metric_per_frame.compute()
+
+        # map is area under P-C curve, averaged over all IOU thresholds?
+        recall_one_frame = metrics_one_frame["recall"][0, 0, 0, -1].item()
+
+        precision_values = metrics_one_frame["precision"][
+            0, :, 0, 0, -1
+        ].numpy()  # first IOU threshold, first class, first area (?), max detections = 1000
+        idcs_precision_non_zero = np.nonzero(precision_values)[0]
+        precision_one_frame = precision_values[idcs_precision_non_zero[-1]]
+
+        map_one_frame = metrics_one_frame["map"].item()
+
+        # add to list
+        recall_per_sample.append(recall_one_frame)
+        precision_per_sample.append(precision_one_frame)
+        map_per_sample.append(map_one_frame)
+        
+
+        print(
+            f"Validation sample {val_idx}, "
+            f"mAP: {map_one_frame}, "  # area under P-C curve, averaged over all IOU thresholds
+            f"Precision: {precision_one_frame}, "
+            f"Recall: {recall_one_frame}"
+        )
+        # recall for first IOU threshold, first class, first area (?),
+        # max detections = 1000
+
+    # Add to dict
+    bboxes_xyxy = detections_dict["boxes"].cpu().numpy()
+    bbox_confidences = detections_dict["scores"].cpu().numpy()
+    bbox_centroids = (bboxes_xyxy[:, 0:2] + bboxes_xyxy[:, 2:4]) / 2
+
+    detections_per_validation_sample[val_idx] = {
+        "bbox_centroids": bbox_centroids,  # detection_idx, x, y
+        "bbox_confidences": bbox_confidences,  # detection_idx, confidence
+    }
+
+    # add to dict
+    # annotations_per_validation_sample[val_idx] = annotations
+
+    # Save image
+    if flag_save_frames:
+        image_path = output_dir / f"frame_val_idx_{val_idx:06d}.png"
+        image_array = (image.permute(1, 2, 0).numpy() * 255).astype(
+            np.uint8
+        )  # (C, H, W) -> (H, W, C)
+        cv2.imwrite(
+            image_path, cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
+        )  # cv2 assumes BGR
+
+
+# metrics = metric.compute()
+# print(metrics["map"])
+
+print(f"Mean recall: {np.mean(recall_per_sample)}") 
+# 0.8494677009613534 @ IOU=0.1
+# 0.8033303880293905 @ IOU=0.5
+print(f"Mean precision: {np.mean(precision_per_sample)}")  
+# 0.9767496450305635 @ IOU=0.1
+# 0.929829445017168 @ IOU=0.5
+
+# %%
+# Compute metrics
+# metrics = metric.compute()
+
+# print(metrics["map"])
+
+# %%
+# Show iou > threshold for detections in first image
+# as a matrix with n_rows = n_detections, n_cols = n_gt_boxes
+
+# import matplotlib.pyplot as plt
+
+# # first image, first class
+# plt.imshow(metrics['ious'][(np.int64(0), np.int64(1))] > 0.5)
+
+# # 30th image, first class
+# plt.imshow(metrics['ious'][(np.int64(30), np.int64(1))] > 0.5)
+
+
+# %%
+# P-R curve for first IOU threshold, first class, first area (?), max detections = 1000
+import matplotlib.pyplot as plt
+
+plt.scatter(
+    x=np.arange(0, 1.01, 0.01),  # recall
+    y=metrics_one_frame["precision"][0, :, 0, 0, -1],  # precision
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Format detections as a movement dataset
+
+# Get params for array dimensions
+max_detections_per_frame = max(
+    [
+        dets["bbox_centroids"].shape[0]
+        for dets in detections_per_validation_sample.values()
+    ]
+)
+n_keypoints = 1
+total_n_frames = len(val_dataset)
+
+# Initialise position and confidence arrays
+position_array = np.full(
+    (total_n_frames, 2, n_keypoints, max_detections_per_frame),
+    np.nan,
+)  # (n_frames, n_space, n_keypoints, n_individuals)
+confidence_array = np.full(
+    (total_n_frames, n_keypoints, max_detections_per_frame),
+    np.nan,
+)  # (n_frames, n_keypoints, n_individuals)
+
+# Fill in values
+for frame_idx, dets in detections_per_validation_sample.items():
+    position_array[frame_idx, :, :, : dets["bbox_centroids"].shape[0]] = (
+        np.transpose(dets["bbox_centroids"][None], (-1, 0, 1))
+    )
+    confidence_array[frame_idx, :, : dets["bbox_centroids"].shape[0]] = dets[
+        "bbox_confidences"
+    ][None, None]
+
+# format as movement dataset
+ds = load_poses.from_numpy(
+    position_array=position_array,
+    confidence_array=confidence_array,
+    individual_names=[
+        f"untracked_{i}" for i in range(max_detections_per_frame)
+    ],
+    keypoint_names=["centroid"],
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Export movement dataset as a SLEAP analysis file (.h5)
+# in the future: export as VIA tracks file (after PR merged!)
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+filename = Path(
+    f"{mlflow_params['run_name']}_detections_val_set_{timestamp}.h5"
+)
+save_poses.to_sleap_analysis_file(ds, output_parent_dir / filename)
+
+# %%
+
+# %%

From 1c3fbe9e407f49ade11840f8fcd41651c592125e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 26 Jun 2025 17:18:06 +0000
Subject: [PATCH 06/72] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 notebook_run_detection_on_dataset.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/notebook_run_detection_on_dataset.py b/notebook_run_detection_on_dataset.py
index eede1ce4..5ac51feb 100644
--- a/notebook_run_detection_on_dataset.py
+++ b/notebook_run_detection_on_dataset.py
@@ -215,7 +215,9 @@ def safe_eval_string(s):
 metric_per_frame = MeanAveragePrecision(
     box_format="xyxy",
     iou_type="bbox",
-    iou_thresholds=[0.1], # 0.5  # If set to None [0.5,...,0.95] with step 0.05
+    iou_thresholds=[
+        0.1
+    ],  # 0.5  # If set to None [0.5,...,0.95] with step 0.05
     rec_thresholds=None,  # If set to None [0,...,1] with step 0.01 -- these are the interpolation points
     max_detection_thresholds=[10, 100, 1000],
     extended_summary=True,
@@ -259,7 +261,9 @@ def safe_eval_string(s):
         # map is area under P-C curve, averaged over all IOU thresholds?
         recall_one_frame = metrics_one_frame["recall"][0, 0, 0, -1].item()
 
-        precision_values = metrics_one_frame["precision"][
+        precision_values = metrics_one_frame[
+            "precision"
+        ][
             0, :, 0, 0, -1
         ].numpy()  # first IOU threshold, first class, first area (?), max detections = 1000
         idcs_precision_non_zero = np.nonzero(precision_values)[0]
@@ -271,7 +275,6 @@ def safe_eval_string(s):
         recall_per_sample.append(recall_one_frame)
         precision_per_sample.append(precision_one_frame)
         map_per_sample.append(map_one_frame)
-        
 
         print(
             f"Validation sample {val_idx}, "
@@ -309,10 +312,10 @@ def safe_eval_string(s):
 # metrics = metric.compute()
 # print(metrics["map"])
 
-print(f"Mean recall: {np.mean(recall_per_sample)}") 
+print(f"Mean recall: {np.mean(recall_per_sample)}")
 # 0.8494677009613534 @ IOU=0.1
 # 0.8033303880293905 @ IOU=0.5
-print(f"Mean precision: {np.mean(precision_per_sample)}")  
+print(f"Mean precision: {np.mean(precision_per_sample)}")
 # 0.9767496450305635 @ IOU=0.1
 # 0.929829445017168 @ IOU=0.5
 

From 0e6fe5c49a97dd5f82d4cffffe5d9335dbaa3c98 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 30 Jun 2025 16:19:59 +0100
Subject: [PATCH 07/72] exploring pycoco tools

---
 notebook_run_detection_on_dataset.py | 101 ++++-----------------------
 1 file changed, 13 insertions(+), 88 deletions(-)

diff --git a/notebook_run_detection_on_dataset.py b/notebook_run_detection_on_dataset.py
index eede1ce4..790287ec 100644
--- a/notebook_run_detection_on_dataset.py
+++ b/notebook_run_detection_on_dataset.py
@@ -9,14 +9,12 @@
 from datetime import datetime
 from pathlib import Path
 
-import cv2
 import numpy as np
 import torch
 import torchvision.transforms.v2 as transforms
 from mlflow.tracking import MlflowClient
 from movement.io import load_poses, save_poses
 from torch.utils.data import random_split
-from torchmetrics.detection import MeanAveragePrecision
 from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
 from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
 
@@ -48,6 +46,8 @@
     else "cpu"
 )
 
+print(f"Using device: {device}")
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Retrieve model config and CLI args from mlflow
 
@@ -126,6 +126,8 @@ def safe_eval_string(s):
     ]
 )
 
+# Sanitize bounding boxes?
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Build Pytorch dataset
 seed_n = cli_args["seed_n"]
@@ -199,38 +201,6 @@ def safe_eval_string(s):
 
 # TODO: use dataloader for efficiency?
 detections_per_validation_sample = {}
-annotations_per_validation_sample = {}
-
-# # initialise metric
-# metric = MeanAveragePrecision(
-#     box_format="xyxy",
-#     iou_type="bbox",
-#     iou_thresholds=None,#[0.1],  # If set to None [0.5,...,0.95] with step 0.05
-#     rec_thresholds=None,  # If set to None [0,...,1] with step 0.01
-#     max_detection_thresholds=[1, 100, 1000],
-#     extended_summary=True,
-#     average="micro",  # macro=average per class first, should be same for me?
-# )
-
-metric_per_frame = MeanAveragePrecision(
-    box_format="xyxy",
-    iou_type="bbox",
-    iou_thresholds=[0.1], # 0.5  # If set to None [0.5,...,0.95] with step 0.05
-    rec_thresholds=None,  # If set to None [0,...,1] with step 0.01 -- these are the interpolation points
-    max_detection_thresholds=[10, 100, 1000],
-    extended_summary=True,
-    average="micro",  # macro=average per class first, should be same for me?
-)
-
-# create output directory if it doesn't exist
-if flag_save_frames:
-    output_dir = output_parent_dir / f"{dataset_dir.name}_val_seed_n_{seed_n}"
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-
-recall_per_sample = []
-precision_per_sample = []
-map_per_sample = []
 
 for val_idx, (image, annotations) in enumerate(val_dataset):
     # Apply transforms to frame and place tensor on device
@@ -246,41 +216,7 @@ def safe_eval_string(s):
         # Returns: dictionary with data of the predicted bounding boxes.
         # The keys are: "boxes", "scores", and "labels". The labels
         # refer to the class of the object detected, and not its ID.
-        detections_dict = model(image_tensor)[0]
-
-        # add to metric
-        # metric.update([detections_dict], [annotations])
-
-        # add to metric per frame
-        metric_per_frame.reset()
-        metric_per_frame.update([detections_dict], [annotations])
-        metrics_one_frame = metric_per_frame.compute()
-
-        # map is area under P-C curve, averaged over all IOU thresholds?
-        recall_one_frame = metrics_one_frame["recall"][0, 0, 0, -1].item()
-
-        precision_values = metrics_one_frame["precision"][
-            0, :, 0, 0, -1
-        ].numpy()  # first IOU threshold, first class, first area (?), max detections = 1000
-        idcs_precision_non_zero = np.nonzero(precision_values)[0]
-        precision_one_frame = precision_values[idcs_precision_non_zero[-1]]
-
-        map_one_frame = metrics_one_frame["map"].item()
-
-        # add to list
-        recall_per_sample.append(recall_one_frame)
-        precision_per_sample.append(precision_one_frame)
-        map_per_sample.append(map_one_frame)
-        
-
-        print(
-            f"Validation sample {val_idx}, "
-            f"mAP: {map_one_frame}, "  # area under P-C curve, averaged over all IOU thresholds
-            f"Precision: {precision_one_frame}, "
-            f"Recall: {recall_one_frame}"
-        )
-        # recall for first IOU threshold, first class, first area (?),
-        # max detections = 1000
+        detections_dict = model(image_tensor)[0]  # (n_detections, 4)
 
     # Add to dict
     bboxes_xyxy = detections_dict["boxes"].cpu().numpy()
@@ -292,29 +228,18 @@ def safe_eval_string(s):
         "bbox_confidences": bbox_confidences,  # detection_idx, confidence
     }
 
-    # add to dict
-    # annotations_per_validation_sample[val_idx] = annotations
 
-    # Save image
-    if flag_save_frames:
-        image_path = output_dir / f"frame_val_idx_{val_idx:06d}.png"
-        image_array = (image.permute(1, 2, 0).numpy() * 255).astype(
-            np.uint8
-        )  # (C, H, W) -> (H, W, C)
-        cv2.imwrite(
-            image_path, cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
-        )  # cv2 assumes BGR
+# %%%%%%%%%%%%%%%%%%%%%%%
+# Export detections as COCO JSON
 
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Evaluate with pycocotools
+from pycocotools.coco import COCO
 
-# metrics = metric.compute()
-# print(metrics["map"])
+annType = "bbox"
+prefix = "instances"
 
-print(f"Mean recall: {np.mean(recall_per_sample)}") 
-# 0.8494677009613534 @ IOU=0.1
-# 0.8033303880293905 @ IOU=0.5
-print(f"Mean precision: {np.mean(precision_per_sample)}")  
-# 0.9767496450305635 @ IOU=0.1
-# 0.929829445017168 @ IOU=0.5
+cocoGt = COCO(str(dataset_dir / "annotations/VIA_JSON_combined_coco_gen.json"))
 
 # %%
 # Compute metrics

From 6f27465a2388645603566cfaeddc6b88222a54ca Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 1 Jul 2025 17:37:37 +0100
Subject: [PATCH 08/72] Explore formatting detections as xarray and exporting
 as COCO-annotations

---
 ethology/__init__.py                          |   6 +
 .../notebook_run_detection_on_dataset.py      | 369 ++++++++++++++++++
 2 files changed, 375 insertions(+)
 create mode 100644 notebooks/notebook_run_detection_on_dataset.py

diff --git a/ethology/__init__.py b/ethology/__init__.py
index ad7212ff..95a38c01 100644
--- a/ethology/__init__.py
+++ b/ethology/__init__.py
@@ -1,5 +1,11 @@
 from importlib.metadata import PackageNotFoundError, version
 
+import xarray as xr
+
+# Set xarray options
+# show collapsed attributes by default
+xr.set_options(display_expand_attrs=False)
+
 try:
     __version__ = version("ethology")
 except PackageNotFoundError:
diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_dataset.py
new file mode 100644
index 00000000..139a3a1a
--- /dev/null
+++ b/notebooks/notebook_run_detection_on_dataset.py
@@ -0,0 +1,369 @@
+"""Run detection on a Pytorch dataset and export results as a movement dataset.
+
+A script to run detection only (no tracking) on a Pytorch dataset and
+export the results in a format that can be loaded in movement napari widget.
+"""
+
+# %%
+import ast
+from datetime import datetime
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import torch
+import torchvision.transforms.v2 as transforms
+import xarray as xr
+from mlflow.tracking import MlflowClient
+from pycocotools.coco import COCO
+from torch.utils.data import random_split
+from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
+from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
+
+from ethology.annotations.io import save_bboxes
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Set xarray options
+xr.set_options(display_expand_attrs=False)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Input data
+dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
+
+trained_model_path = Path(
+    "/home/sminano/swc/project_crabs/ml-runs/617393114420881798/f348d9d196934073bece1b877cbc4d38/checkpoints/last.ckpt"
+)
+
+trained_model_mlflow_params_path = Path(
+    "/home/sminano/swc/project_crabs/ml-runs/617393114420881798/f348d9d196934073bece1b877cbc4d38/params"
+)  # for config
+
+
+# to save output frames and detections
+output_parent_dir = Path("/home/sminano/swc/project_ethology")
+
+flag_save_frames = False
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Set default device: CUDA if available, otherwise mps, otherwise CPU
+device = torch.device(
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+
+print(f"Using device: {device}")
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Retrieve model config and CLI args from mlflow
+
+
+def read_mlflow_params(
+    trained_model_path: str | Path, tracking_uri: str | None = None
+) -> dict:
+    """Read the parameters and run name of the MLflow run owning a checkpoint.
+
+    Expects ``<store>/<experiment_id>/<run_id>/checkpoints/<file>``; the store
+    root is used as tracking URI unless ``tracking_uri`` is given explicitly.
+    """
+    ckpt_path = Path(trained_model_path)
+    if tracking_uri is None:
+        tracking_uri = str(ckpt_path.parents[3])
+    client = MlflowClient(tracking_uri=tracking_uri)
+    run = client.get_run(ckpt_path.parents[1].stem)
+    # Copy so that adding the run name does not mutate the run's own dict
+    params = dict(run.data.params)
+    params["run_name"] = run.info.run_name
+    return params
+
+
+mlflow_params = read_mlflow_params(trained_model_path)
+config = {
+    k.removeprefix("config/"): ast.literal_eval(v)
+    for k, v in mlflow_params.items()
+    if k.startswith("config/")
+}
+
+
+def safe_eval_string(s):
+    """Try to evaluate a string as a literal, otherwise return as-is."""
+    try:
+        return ast.literal_eval(s)
+    except (ValueError, SyntaxError):
+        # not a valid Python literal (e.g. free text): keep the raw string
+        return s
+
+
+cli_args = {
+    k.removeprefix("cli_args/"): safe_eval_string(v)
+    for k, v in mlflow_params.items()
+    if k.startswith("cli_args/")
+}
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Load model
+
+# Load structure
+model = fasterrcnn_resnet50_fpn_v2(
+    weights=None,
+    weights_backbone=None,
+    num_classes=config["num_classes"],
+)
+
+# Read state dict on CPU (device-agnostic); keep weights under "model."
+state_dict = torch.load(trained_model_path, map_location="cpu")
+state_dict_model = {
+    k.removeprefix("model."): v  # exact prefix; lstrip would strip a char set
+    for k, v in state_dict["state_dict"].items()
+    if k.startswith("model.")
+}
+
+# Load weights into model and set to evaluation mode
+model.load_state_dict(state_dict_model)
+model.eval()
+model.to(device)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Define transforms to apply to input frames
+inference_transforms = transforms.Compose(
+    [
+        transforms.ToImage(),
+        transforms.ToDtype(torch.float32, scale=True),
+    ]
+)
+
+# Sanitize bounding boxes?
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Build Pytorch dataset
+seed_n = cli_args["seed_n"]
+annotations_filename = Path(cli_args["annotation_files"][0]).name
+
+# create "default" COCO dataset
+dataset_coco = CocoDetection(
+    Path(dataset_dir) / "frames",
+    Path(dataset_dir) / "annotations" / annotations_filename,
+    transforms=inference_transforms,
+)
+
+# wrap dataset for transforms v2
+dataset_transformed = wrap_dataset_for_transforms_v2(dataset_coco)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Split dataset
+# def _collate_fn(self, batch: tuple) -> tuple:
+#     """Collate function used for dataloaders.
+
+#     A custom function is needed for detection
+#     because the number of bounding boxes varies
+#     between images of the same batch.
+#     See https://pytorch.org/vision/main/auto_examples/transforms/plot_transforms_e2e.html#data-loading-and-training-loop
+
+#     Parameters
+#     ----------
+#     batch : tuple
+#         a tuple of 2 tuples, the first one holding all images in the batch,
+#         and the second one holding the corresponding annotations.
+
+#     Returns
+#     -------
+#     tuple
+#         a tuple of length = batch size, made up of (image, annotations)
+#         tuples.
+
+#     """
+#     return tuple(zip(*batch))
+
+
+# Split data into train and test-val sets
+rng_train_split = torch.Generator().manual_seed(seed_n)
+rng_val_split = torch.Generator().manual_seed(seed_n)
+
+train_dataset, test_val_dataset = random_split(
+    dataset_transformed,
+    [config["train_fraction"], 1 - config["train_fraction"]],
+    generator=rng_train_split,
+)
+
+# Split test/val sets from the remainder
+test_dataset, val_dataset = random_split(
+    test_val_dataset,
+    [
+        1 - config["val_over_test_fraction"],
+        config["val_over_test_fraction"],
+    ],
+    generator=rng_val_split,
+)
+
+print(f"Seed: {seed_n}")
+print(f"Number of training samples: {len(train_dataset)}")
+print(f"Number of validation samples: {len(val_dataset)}")
+print(f"Number of test samples: {len(test_dataset)}")
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Run detection on validation set
+
+# TODO: use dataloader for efficiency?
+detections_per_validation_sample = {}
+
+for val_idx, (image, annotations) in enumerate(val_dataset):
+    # Apply transforms to frame and place tensor on device
+    image_tensor = inference_transforms(image).to(device)[None]
+
+    # Put annotations in same device as image
+    annotations["boxes"] = annotations["boxes"].to(device)
+    annotations["labels"] = annotations["labels"].to(device)
+
+    # Run detection
+    with torch.no_grad():
+        # use [0] to select the one image in the batch
+        # Returns: dictionary with data of the predicted bounding boxes.
+        # The keys are: "boxes", "scores", and "labels". The labels
+        # refer to the class of the object detected, and not its ID.
+        detections_dict = model(image_tensor)[0]
+
+    # Add to dict
+    bboxes_xyxy = detections_dict["boxes"].cpu().numpy()
+
+    detections_per_validation_sample[val_idx] = {
+        "bbox_xyxy": bboxes_xyxy,
+        "bbox_centroids": (bboxes_xyxy[:, 0:2] + bboxes_xyxy[:, 2:4]) / 2,
+        "bbox_shapes": bboxes_xyxy[:, 2:4] - bboxes_xyxy[:, 0:2],
+        "bbox_confidences": detections_dict["scores"].cpu().numpy(),
+        "bbox_labels": detections_dict["labels"].cpu().numpy(),
+    }
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Format detections as an ethology detections dataset
+# (validate as ethology annotations dataset? or add from_numpy?)
+
+# Get params for array dimensions
+max_detections_per_frame = max(
+    [
+        dets["bbox_centroids"].shape[0]
+        for dets in detections_per_validation_sample.values()
+    ]
+)
+n_keypoints = 1
+total_n_frames = len(val_dataset)
+
+# Initialise position, shape and label arrays
+array_dict = {}
+array_dict["position_array"] = np.full(
+    (total_n_frames, 2, max_detections_per_frame),
+    np.nan,
+)  # (n_frames, n_space, n_individuals)
+array_dict["shape_array"] = np.full(
+    (total_n_frames, 2, max_detections_per_frame),
+    np.nan,
+)  # (n_frames, n_space, n_individuals)
+array_dict["category_array"] = np.full(
+    (total_n_frames, max_detections_per_frame),
+    -1,  # -1 is the default value for missing data
+)  # (n_frames, n_individuals)
+array_dict["score_array"] = np.full(
+    (total_n_frames, max_detections_per_frame),
+    np.nan,
+)  # (n_frames, n_individuals)
+
+# Fill in values
+for frame_idx, dets in detections_per_validation_sample.items():
+    array_dict["position_array"][
+        frame_idx, :, : dets["bbox_centroids"].shape[0]
+    ] = np.transpose(dets["bbox_centroids"])
+    array_dict["shape_array"][frame_idx, :, : dets["bbox_shapes"].shape[0]] = (
+        np.transpose(dets["bbox_shapes"])
+    )
+    array_dict["category_array"][frame_idx, : dets["bbox_labels"].shape[0]] = (
+        dets["bbox_labels"]
+    )
+    array_dict["score_array"][
+        frame_idx, : dets["bbox_confidences"].shape[0]
+    ] = dets["bbox_confidences"]
+
+# Format detections on validation set as ethology detections dataset
+# (detections dataset is a like ethology annotations dataset but with
+# confidence scores)
+ds = xr.Dataset(
+    data_vars=dict(
+        position=(
+            ["image_id", "space", "id"],
+            array_dict["position_array"],
+        ),
+        shape=(["image_id", "space", "id"], array_dict["shape_array"]),
+        category=(["image_id", "id"], array_dict["category_array"]),
+        confidence=(["image_id", "id"], array_dict["score_array"]),
+    ),
+    coords=dict(
+        # use image_id from ground truth annotations!
+        image_id=[
+            val_dataset[i][1]["image_id"] for i in range(total_n_frames)
+        ],
+        space=["x", "y"],
+        id=range(max_detections_per_frame),
+        # annotation ID per frame; could be consistent across frames
+        # or not
+    ),
+)
+
+print(ds)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Get map from image_id to filename from ground truth annotations
+
+cocoGt = COCO(str(dataset_dir / "annotations/VIA_JSON_combined_coco_gen.json"))
+
+# compute map from image_id to filename
+# assuming val_dataset[0][1]['image_id'] is the image_id of the first frame
+# in the validation set, for the image IDs as specified in the ground truth annotation
+# file
+df_images = pd.DataFrame(cocoGt.dataset["images"])
+map_image_id_to_filename_gt = {
+    image_id: filename
+    for image_id, filename in zip(
+        df_images["id"], df_images["file_name"], strict=False
+    )
+}
+
+# # map image_id in xarray to filename
+# map_val_image_id_to_filename = {
+#     idx: map_image_id_to_filename_gt[val_dataset[idx][1]["image_id"]]
+#     for idx in range(total_n_frames)
+# }
+
+# compute map from category_id to category_name
+df_categories = pd.DataFrame(cocoGt.dataset["categories"])
+map_category_id_to_category_name = {
+    category_id: category_name
+    for category_id, category_name in zip(
+        df_categories["id"],
+        df_categories["name"],
+        strict=True,
+    )
+}
+
+# add map to ds
+ds.attrs["map_image_id_to_filename"] = map_image_id_to_filename_gt
+ds.attrs["map_category_id_to_category"] = map_category_id_to_category_name
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Export detections on validation set as COCO JSON file
+
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+filename = Path(
+    f"{mlflow_params['run_name']}_detections_val_set_{timestamp}.json"
+)
+out_path = save_bboxes.to_COCO_file(
+    ds,
+    output_parent_dir / filename,
+)
+
+# Note: this is not an official COCO format for results
+# The format for annotations and detections is different
+# https://cocodataset.org/#format-results

From ca7b4ac2c289c3bf7b1f54ad0bc338004ed5f1f7 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 1 Jul 2025 17:39:32 +0100
Subject: [PATCH 09/72] Trying cocoeval from pycocotools (unsuccessfully)

---
 .../notebook_run_detection_on_dataset.py      | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_dataset.py
index 139a3a1a..d057b54d 100644
--- a/notebooks/notebook_run_detection_on_dataset.py
+++ b/notebooks/notebook_run_detection_on_dataset.py
@@ -16,6 +16,7 @@
 import xarray as xr
 from mlflow.tracking import MlflowClient
 from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
 from torch.utils.data import random_split
 from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
 from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
@@ -367,3 +368,49 @@ def safe_eval_string(s):
 # Note: this is not an official COCO format for results
 # The format for annotations and detections is different
 # https://cocodataset.org/#format-results
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Evaluate with pycocotools
+# from faster_coco_eval import COCO, COCOeval_faster
+
+annType = "bbox"
+prefix = "instances"
+
+
+cocoGt = COCO(
+    "/home/sminano/swc/project_ethology/sept2023_annotations.bk/VIA_JSON_combined_coco_gen.json"
+)
+
+# results file is just a list of dicts!
+cocoDet = cocoGt.loadRes(
+    "/home/sminano/swc/project_ethology/run_slurm_977884_0_detections_val_set_20250701_145412.json"
+)
+
+
+# %%
+# initialise evaluation object
+Eval = COCOeval(cocoGt, cocoDet, annType)
+
+
+# set parameters
+# https://github.com/ppwwyyxx/cocoapi/blob/8cbc887b3da6cb76c7cc5b10f8e082dd29d565cb/PythonAPI/pycocotools/cocoeval.py#L502
+# val_samples_dataset_ids = [sample[1]["image_id"] for sample in val_dataset]
+val_samples_dataset_ids = ds.image_id.values.tolist()
+
+Eval.params.imgIds = val_samples_dataset_ids
+Eval.params.iouThrs = [0.1]
+Eval.params.maxDets = [1000]
+Eval.params.areaRng = [[0**2, 1e5**2]]
+Eval.params.areaRngLbl = ["all"]
+Eval.params.useCats = 0
+Eval.params.recThrs = [0, 1]
+
+# run per image evaluation
+Eval.evaluate()
+
+print(len(Eval.evalImgs))
+
+
+# %%
+print(Eval.evalImgs[0]["image_id"])
+print(Eval.evalImgs[0]["dtMatches"])

From 9cf037227b4b20146ad2a4857dea0f24c6df587f Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 1 Jul 2025 17:43:00 +0100
Subject: [PATCH 10/72] Explore using mean_average_precision

---
 .../notebook_run_detection_on_dataset.py      | 266 +++++++-----------
 1 file changed, 99 insertions(+), 167 deletions(-)

diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_dataset.py
index d057b54d..4fc004c3 100644
--- a/notebooks/notebook_run_detection_on_dataset.py
+++ b/notebooks/notebook_run_detection_on_dataset.py
@@ -6,22 +6,20 @@
 
 # %%
 import ast
-from datetime import datetime
 from pathlib import Path
 
+import matplotlib.pyplot as plt
 import numpy as np
-import pandas as pd
 import torch
 import torchvision.transforms.v2 as transforms
 import xarray as xr
+from mean_average_precision import MetricBuilder
 from mlflow.tracking import MlflowClient
-from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
 from torch.utils.data import random_split
 from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
 from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
 
-from ethology.annotations.io import save_bboxes
+# %matplotlib widget
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Set xarray options
@@ -239,178 +237,112 @@ def safe_eval_string(s):
     }
 
 
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Format detections as an ethology detections dataset
-# (validate as ethology annotations dataset? or add from_numpy?)
-
-# Get params for array dimensions
-max_detections_per_frame = max(
-    [
-        dets["bbox_centroids"].shape[0]
-        for dets in detections_per_validation_sample.values()
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Compute precision and recall per validation sample
+
+pr_per_validation_sample = {}
+
+iou_threshold = 0.1
+recall_threshold = 0.0
+
+
+# the mean average precision package assumes class_id starts at 0!
+# so if there is only one class, it assumes its id is 0
+metric_fn = MetricBuilder.build_evaluation_metric("map_2d", num_classes=1)
+
+for idx_validation_sample in range(len(val_dataset)):
+    # Get ground truth bboxes
+    # [xmin, ymin, xmax, ymax, class_id, difficult, crowd]
+    gt_bboxes_xyxy = np.c_[
+        val_dataset[idx_validation_sample][1]["boxes"].cpu().numpy(),
+        np.zeros(
+            val_dataset[idx_validation_sample][1]["boxes"].shape[0]
+        ),  # class_id = 0
+        np.zeros(
+            val_dataset[idx_validation_sample][1]["boxes"].shape[0]
+        ),  # difficult
+        np.zeros(
+            val_dataset[idx_validation_sample][1]["boxes"].shape[0]
+        ),  # crowd
     ]
-)
-n_keypoints = 1
-total_n_frames = len(val_dataset)
-
-# Initialise position, shape and label arrays
-array_dict = {}
-array_dict["position_array"] = np.full(
-    (total_n_frames, 2, max_detections_per_frame),
-    np.nan,
-)  # (n_frames, n_space, n_individuals)
-array_dict["shape_array"] = np.full(
-    (total_n_frames, 2, max_detections_per_frame),
-    np.nan,
-)  # (n_frames, n_space, n_individuals)
-array_dict["category_array"] = np.full(
-    (total_n_frames, max_detections_per_frame),
-    -1,  # -1 is the default value for missing data
-)  # (n_frames, n_individuals)
-array_dict["score_array"] = np.full(
-    (total_n_frames, max_detections_per_frame),
-    np.nan,
-)  # (n_frames, n_individuals)
-
-# Fill in values
-for frame_idx, dets in detections_per_validation_sample.items():
-    array_dict["position_array"][
-        frame_idx, :, : dets["bbox_centroids"].shape[0]
-    ] = np.transpose(dets["bbox_centroids"])
-    array_dict["shape_array"][frame_idx, :, : dets["bbox_shapes"].shape[0]] = (
-        np.transpose(dets["bbox_shapes"])
-    )
-    array_dict["category_array"][frame_idx, : dets["bbox_labels"].shape[0]] = (
-        dets["bbox_labels"]
-    )
-    array_dict["score_array"][
-        frame_idx, : dets["bbox_confidences"].shape[0]
-    ] = dets["bbox_confidences"]
-
-# Format detections on validation set as ethology detections dataset
-# (detections dataset is a like ethology annotations dataset but with
-# confidence scores)
-ds = xr.Dataset(
-    data_vars=dict(
-        position=(
-            ["image_id", "space", "id"],
-            array_dict["position_array"],
-        ),
-        shape=(["image_id", "space", "id"], array_dict["shape_array"]),
-        category=(["image_id", "id"], array_dict["category_array"]),
-        confidence=(["image_id", "id"], array_dict["score_array"]),
-    ),
-    coords=dict(
-        # use image_id from ground truth annotations!
-        image_id=[
-            val_dataset[i][1]["image_id"] for i in range(total_n_frames)
-        ],
-        space=["x", "y"],
-        id=range(max_detections_per_frame),
-        # annotation ID per frame; could be consistent across frames
-        # or not
-    ),
-)
-
-print(ds)
 
+    # Get predicted bboxes
+    # make class_id 0-indexed!
+    # [xmin, ymin, xmax, ymax, class_id, confidence]
+    pred_bboxes_xyxy_conf = np.c_[
+        detections_per_validation_sample[idx_validation_sample]["bbox_xyxy"],
+        detections_per_validation_sample[idx_validation_sample]["bbox_labels"]
+        - 1,  # class_id is 0-indexed!
+        detections_per_validation_sample[idx_validation_sample][
+            "bbox_confidences"
+        ],
+    ]
 
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Get map from image_id to filename from ground truth annotations
-
-cocoGt = COCO(str(dataset_dir / "annotations/VIA_JSON_combined_coco_gen.json"))
-
-# compute map from image_id to filename
-# assuming val_dataset[0][1]['image_id'] is the image_id of the first frame
-# in the validation set, for the image IDs as specified in the ground truth annotation
-# file
-df_images = pd.DataFrame(cocoGt.dataset["images"])
-map_image_id_to_filename_gt = {
-    image_id: filename
-    for image_id, filename in zip(
-        df_images["id"], df_images["file_name"], strict=False
-    )
-}
-
-# # map image_id in xarray to filename
-# map_val_image_id_to_filename = {
-#     idx: map_image_id_to_filename_gt[val_dataset[idx][1]["image_id"]]
-#     for idx in range(total_n_frames)
-# }
-
-# compute map from category_id to category_name
-df_categories = pd.DataFrame(cocoGt.dataset["categories"])
-map_category_id_to_category_name = {
-    category_id: category_name
-    for category_id, category_name in zip(
-        df_categories["id"],
-        df_categories["name"],
-        strict=True,
+    # Add gt and pred bboxes to metric
+    metric_fn.reset()
+    metric_fn.add(pred_bboxes_xyxy_conf, gt_bboxes_xyxy)
+    metric = metric_fn.value(
+        iou_thresholds=[iou_threshold],
+        recall_thresholds=np.array([recall_threshold]),
+        mpolicy="soft",
     )
-}
-
-# add map to ds
-ds.attrs["map_image_id_to_filename"] = map_image_id_to_filename_gt
-ds.attrs["map_category_id_to_category"] = map_category_id_to_category_name
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Export detections on validation set as COCO JSON file
-
-timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-filename = Path(
-    f"{mlflow_params['run_name']}_detections_val_set_{timestamp}.json"
-)
-out_path = save_bboxes.to_COCO_file(
-    ds,
-    output_parent_dir / filename,
-)
-
-# Note: this is not an official COCO format for results
-# The format for annotations and detections is different
-# https://cocodataset.org/#format-results
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Evaluate with pycocotools
-# from faster_coco_eval import COCO, COCOeval_faster
-
-annType = "bbox"
-prefix = "instances"
 
+    # compute precision and recall for one frame
+    pr_per_validation_sample[idx_validation_sample] = {
+        "precision": metric[iou_threshold][0]["precision"][-1],
+        "recall": metric[iou_threshold][0]["recall"][-1],
+    }
 
-cocoGt = COCO(
-    "/home/sminano/swc/project_ethology/sept2023_annotations.bk/VIA_JSON_combined_coco_gen.json"
+# average across validation samples
+print(
+    f"Average precision: {
+        np.mean([pr['precision'] for pr in pr_per_validation_sample.values()])
+    }"
 )
-
-# results file is just a list of dicts!
-cocoDet = cocoGt.loadRes(
-    "/home/sminano/swc/project_ethology/run_slurm_977884_0_detections_val_set_20250701_145412.json"
+print(
+    f"Average recall: {
+        np.mean([pr['recall'] for pr in pr_per_validation_sample.values()])
+    }"
 )
 
-
 # %%
-# initialise evaluation object
-Eval = COCOeval(cocoGt, cocoDet, annType)
-
-
-# set parameters
-# https://github.com/ppwwyyxx/cocoapi/blob/8cbc887b3da6cb76c7cc5b10f8e082dd29d565cb/PythonAPI/pycocotools/cocoeval.py#L502
-# val_samples_dataset_ids = [sample[1]["image_id"] for sample in val_dataset]
-val_samples_dataset_ids = ds.image_id.values.tolist()
-
-Eval.params.imgIds = val_samples_dataset_ids
-Eval.params.iouThrs = [0.1]
-Eval.params.maxDets = [1000]
-Eval.params.areaRng = [[0**2, 1e5**2]]
-Eval.params.areaRngLbl = ["all"]
-Eval.params.useCats = 0
-Eval.params.recThrs = [0, 1]
-
-# run per image evaluation
-Eval.evaluate()
-
-print(len(Eval.evalImgs))
+# plot gt and pred bboxes for one frame (idx_validation_sample)
+
+fig, ax = plt.subplots()
+ax.imshow(val_dataset[idx_validation_sample][0].permute(1, 2, 0))
+for i in range(gt_bboxes_xyxy.shape[0]):
+    ax.add_patch(
+        plt.Rectangle(
+            (gt_bboxes_xyxy[i, 0], gt_bboxes_xyxy[i, 1]),
+            gt_bboxes_xyxy[i, 2] - gt_bboxes_xyxy[i, 0],  # width
+            gt_bboxes_xyxy[i, 3] - gt_bboxes_xyxy[i, 1],  # height
+            fill=False,
+            edgecolor=(0, 1, 0),
+            linewidth=2.5,
+        )
+    )
+for i in range(pred_bboxes_xyxy_conf.shape[0]):
+    ax.add_patch(
+        plt.Rectangle(
+            (pred_bboxes_xyxy_conf[i, 0], pred_bboxes_xyxy_conf[i, 1]),
+            pred_bboxes_xyxy_conf[i, 2] - pred_bboxes_xyxy_conf[i, 0],
+            pred_bboxes_xyxy_conf[i, 3] - pred_bboxes_xyxy_conf[i, 1],
+            fill=False,
+            edgecolor="red",
+            linewidth=2,
+        )
+    )
+plt.show()
 
 
 # %%
-print(Eval.evalImgs[0]["image_id"])
-print(Eval.evalImgs[0]["dtMatches"])
+# plot precision and recall for one iou threshold and last frame
+plt.plot(
+    metric[iou_threshold][0]["recall"],
+    metric[iou_threshold][0]["precision"],
+    ".-",
+)
+plt.xlabel("Recall")
+plt.ylabel("Precision")
+plt.title("Precision-Recall Curve")
+plt.show()

From c52918dbfa1a70784cf35107a580abe7c4cf8eba Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 1 Jul 2025 17:44:31 +0100
Subject: [PATCH 11/72] Move notebooks

---
 MANIFEST.in                                   |   2 +-
 notebook_run_detection_on_dataset.py          | 325 ------------------
 .../notebook_run_detection_on_video.py        |   0
 3 files changed, 1 insertion(+), 326 deletions(-)
 delete mode 100644 notebook_run_detection_on_dataset.py
 rename notebook_run_detection_on_video.py => notebooks/notebook_run_detection_on_video.py (100%)

diff --git a/MANIFEST.in b/MANIFEST.in
index 4388fd3b..39095be6 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -16,4 +16,4 @@ recursive-include ethology/annotations/json_schemas/schemas *.md
 
 
 # Temporarily include notebooks
-include *.py
+recursive-include notebooks *.py
diff --git a/notebook_run_detection_on_dataset.py b/notebook_run_detection_on_dataset.py
deleted file mode 100644
index 790287ec..00000000
--- a/notebook_run_detection_on_dataset.py
+++ /dev/null
@@ -1,325 +0,0 @@
-"""Run detection on a Pytorch dataset and export results as a movement dataset.
-
-A script to run detection only (no tracking) on a Pytorch dataset and
-export the results in a format that can be loaded in movement napari widget.
-"""
-
-# %%
-import ast
-from datetime import datetime
-from pathlib import Path
-
-import numpy as np
-import torch
-import torchvision.transforms.v2 as transforms
-from mlflow.tracking import MlflowClient
-from movement.io import load_poses, save_poses
-from torch.utils.data import random_split
-from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
-from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Input data
-dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
-
-trained_model_path = Path(
-    "/home/sminano/swc/project_crabs/ml-runs/617393114420881798/f348d9d196934073bece1b877cbc4d38/checkpoints/last.ckpt"
-)
-
-trained_model_mlflow_params_path = Path(
-    "/home/sminano/swc/project_crabs/ml-runs/617393114420881798/f348d9d196934073bece1b877cbc4d38/params"
-)  # for config
-
-
-# to save output frames and detections
-output_parent_dir = Path("/home/sminano/swc/project_ethology")
-
-flag_save_frames = False
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Set default device: CUDA if available, otherwise mps, otherwise CPU
-device = torch.device(
-    "cuda"
-    if torch.cuda.is_available()
-    else "mps"
-    if torch.backends.mps.is_available()
-    else "cpu"
-)
-
-print(f"Using device: {device}")
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Retrieve model config and CLI args from mlflow
-
-
-def read_mlflow_params(
-    trained_model_path: str, tracking_uri: str = None
-) -> dict:
-    """Read parameters for a specific MLflow run."""
-    # Create MLflow client
-    mlruns_path = str(Path(trained_model_path).parents[3])
-    client = MlflowClient(tracking_uri=mlruns_path)
-
-    # Get the run
-    runID = Path(trained_model_path).parents[1].stem
-    run = client.get_run(runID)
-
-    # Access parameters
-    params = run.data.params
-    params["run_name"] = run.info.run_name
-
-    return params
-
-
-mlflow_params = read_mlflow_params(trained_model_path)
-config = {
-    k.removeprefix("config/"): ast.literal_eval(v)
-    for k, v in mlflow_params.items()
-    if k.startswith("config/")
-}
-
-
-def safe_eval_string(s):
-    """Try to evaluate a string as a literal, otherwise return as-is."""
-    try:
-        return ast.literal_eval(s)
-    except (ValueError, SyntaxError):
-        # return as-is if not a valid literal
-        return s
-
-
-cli_args = {
-    k.removeprefix("cli_args/"): safe_eval_string(v)
-    for k, v in mlflow_params.items()
-    if k.startswith("cli_args/")
-}
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Load model
-
-# Load structure
-model = fasterrcnn_resnet50_fpn_v2(
-    weights=None,
-    weights_backbone=None,
-    num_classes=config["num_classes"],
-)
-
-# Read state dict
-state_dict = torch.load(trained_model_path)
-state_dict_model = {
-    k.lstrip("model."): v
-    for k, v in state_dict["state_dict"].items()
-    if k.startswith("model.")
-}
-
-# Load weights into model and set to evaluation mode
-model.load_state_dict(state_dict_model)
-model.eval()
-model.to(device)
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Define transforms to apply to input frames
-inference_transforms = transforms.Compose(
-    [
-        transforms.ToImage(),
-        transforms.ToDtype(torch.float32, scale=True),
-    ]
-)
-
-# Sanitize bounding boxes?
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Build Pytorch dataset
-seed_n = cli_args["seed_n"]
-annotations_filename = Path(cli_args["annotation_files"][0]).name
-
-# create "default" COCO dataset
-dataset_coco = CocoDetection(
-    Path(dataset_dir) / "frames",
-    Path(dataset_dir) / "annotations" / annotations_filename,
-    transforms=inference_transforms,
-)
-
-# wrap dataset for transforms v2
-dataset_transformed = wrap_dataset_for_transforms_v2(dataset_coco)
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Split dataset
-# def _collate_fn(self, batch: tuple) -> tuple:
-#     """Collate function used for dataloaders.
-
-#     A custom function is needed for detection
-#     because the number of bounding boxes varies
-#     between images of the same batch.
-#     See https://pytorch.org/vision/main/auto_examples/transforms/plot_transforms_e2e.html#data-loading-and-training-loop
-
-#     Parameters
-#     ----------
-#     batch : tuple
-#         a tuple of 2 tuples, the first one holding all images in the batch,
-#         and the second one holding the corresponding annotations.
-
-#     Returns
-#     -------
-#     tuple
-#         a tuple of length = batch size, made up of (image, annotations)
-#         tuples.
-
-#     """
-#     return tuple(zip(*batch))
-
-
-# Split data into train and test-val sets
-rng_train_split = torch.Generator().manual_seed(seed_n)
-rng_val_split = torch.Generator().manual_seed(seed_n)
-
-train_dataset, test_val_dataset = random_split(
-    dataset_transformed,
-    [config["train_fraction"], 1 - config["train_fraction"]],
-    generator=rng_train_split,
-)
-
-# Split test/val sets from the remainder
-test_dataset, val_dataset = random_split(
-    test_val_dataset,
-    [
-        1 - config["val_over_test_fraction"],
-        config["val_over_test_fraction"],
-    ],
-    generator=rng_val_split,
-)
-
-print(f"Seed: {seed_n}")
-print(f"Number of training samples: {len(train_dataset)}")
-print(f"Number of validation samples: {len(val_dataset)}")
-print(f"Number of test samples: {len(test_dataset)}")
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Run detection on validation set
-
-# TODO: use dataloader for efficiency?
-detections_per_validation_sample = {}
-
-for val_idx, (image, annotations) in enumerate(val_dataset):
-    # Apply transforms to frame and place tensor on device
-    image_tensor = inference_transforms(image).to(device)[None]
-
-    # Put annotations in same device as image
-    annotations["boxes"] = annotations["boxes"].to(device)
-    annotations["labels"] = annotations["labels"].to(device)
-
-    # Run detection
-    with torch.no_grad():
-        # use [0] to select the one image in the batch
-        # Returns: dictionary with data of the predicted bounding boxes.
-        # The keys are: "boxes", "scores", and "labels". The labels
-        # refer to the class of the object detected, and not its ID.
-        detections_dict = model(image_tensor)[0]  # (n_detections, 4)
-
-    # Add to dict
-    bboxes_xyxy = detections_dict["boxes"].cpu().numpy()
-    bbox_confidences = detections_dict["scores"].cpu().numpy()
-    bbox_centroids = (bboxes_xyxy[:, 0:2] + bboxes_xyxy[:, 2:4]) / 2
-
-    detections_per_validation_sample[val_idx] = {
-        "bbox_centroids": bbox_centroids,  # detection_idx, x, y
-        "bbox_confidences": bbox_confidences,  # detection_idx, confidence
-    }
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%
-# Export detections as COCO JSON
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Evaluate with pycocotools
-from pycocotools.coco import COCO
-
-annType = "bbox"
-prefix = "instances"
-
-cocoGt = COCO(str(dataset_dir / "annotations/VIA_JSON_combined_coco_gen.json"))
-
-# %%
-# Compute metrics
-# metrics = metric.compute()
-
-# print(metrics["map"])
-
-# %%
-# Show iou > threshold for detections in first image
-# as a matrix with n_rows = n_detections, n_cols = n_gt_boxes
-
-# import matplotlib.pyplot as plt
-
-# # first image, first class
-# plt.imshow(metrics['ious'][(np.int64(0), np.int64(1))] > 0.5)
-
-# # 30th image, first class
-# plt.imshow(metrics['ious'][(np.int64(30), np.int64(1))] > 0.5)
-
-
-# %%
-# P-R curve for first IOU threshold, first class, first area (?), max detections = 1000
-import matplotlib.pyplot as plt
-
-plt.scatter(
-    x=np.arange(0, 1.01, 0.01),  # recall
-    y=metrics_one_frame["precision"][0, :, 0, 0, -1],  # precision
-)
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Format detections as a movement dataset
-
-# Get params for array dimensions
-max_detections_per_frame = max(
-    [
-        dets["bbox_centroids"].shape[0]
-        for dets in detections_per_validation_sample.values()
-    ]
-)
-n_keypoints = 1
-total_n_frames = len(val_dataset)
-
-# Initialise position and confidence arrays
-position_array = np.full(
-    (total_n_frames, 2, n_keypoints, max_detections_per_frame),
-    np.nan,
-)  # (n_frames, n_space, n_keypoints, n_individuals)
-confidence_array = np.full(
-    (total_n_frames, n_keypoints, max_detections_per_frame),
-    np.nan,
-)  # (n_frames, n_keypoints, n_individuals)
-
-# Fill in values
-for frame_idx, dets in detections_per_validation_sample.items():
-    position_array[frame_idx, :, :, : dets["bbox_centroids"].shape[0]] = (
-        np.transpose(dets["bbox_centroids"][None], (-1, 0, 1))
-    )
-    confidence_array[frame_idx, :, : dets["bbox_centroids"].shape[0]] = dets[
-        "bbox_confidences"
-    ][None, None]
-
-# format as movement dataset
-ds = load_poses.from_numpy(
-    position_array=position_array,
-    confidence_array=confidence_array,
-    individual_names=[
-        f"untracked_{i}" for i in range(max_detections_per_frame)
-    ],
-    keypoint_names=["centroid"],
-)
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Export movement dataset as .slp file
-# in the future: export as VIA tracks file (after PR merged!)
-timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-filename = Path(
-    f"{mlflow_params['run_name']}_detections_val_set_{timestamp}.h5"
-)
-save_poses.to_sleap_analysis_file(ds, output_parent_dir / filename)
-
-# %%
-
-# %%
diff --git a/notebook_run_detection_on_video.py b/notebooks/notebook_run_detection_on_video.py
similarity index 100%
rename from notebook_run_detection_on_video.py
rename to notebooks/notebook_run_detection_on_video.py

From 7765bdc2ac9c7595b5fb300eca5bca36e81f7024 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 2 Jul 2025 23:44:04 +0100
Subject: [PATCH 12/72] Evaluate detections and plot histograms

---
 .../notebook_run_detection_on_dataset.py      | 442 ++++++++++++++----
 1 file changed, 349 insertions(+), 93 deletions(-)

diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_dataset.py
index 4fc004c3..03892295 100644
--- a/notebooks/notebook_run_detection_on_dataset.py
+++ b/notebooks/notebook_run_detection_on_dataset.py
@@ -10,11 +10,13 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
 import torch
+import torchvision.ops as ops
 import torchvision.transforms.v2 as transforms
 import xarray as xr
-from mean_average_precision import MetricBuilder
 from mlflow.tracking import MlflowClient
+from scipy.optimize import linear_sum_assignment
 from torch.utils.data import random_split
 from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
 from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
@@ -25,6 +27,161 @@
 # Set xarray options
 xr.set_options(display_expand_attrs=False)
 
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Detection evaluation function
+
+
+def evaluate_detections(pred_bboxes, gt_bboxes, iou_threshold=0.5):
+    """Evaluate detection performance using IoU-based matching.
+
+    Parameters
+    ----------
+    pred_bboxes : np.ndarray
+        Array of predicted bounding boxes with columns [x1, y1, x2, y2, confidence]
+    gt_bboxes : np.ndarray
+        Array of ground truth bounding boxes with columns [x1, y1, x2, y2]
+    iou_threshold : float, optional
+        IoU threshold for considering a detection as true positive, default 0.5
+
+    Returns
+    -------
+    tuple
+        (true_positives, false_positives, missed_detections) where each is a boolean array
+        - true_positives: column vector with True for each predicted bbox that is a true positive
+        - false_positives: column vector with True for each predicted bbox that is a false positive
+        - missed_detections: column vector with True for each ground truth bbox that is missed
+
+    """
+    # Initialize output arrays
+    true_positives = np.zeros(len(pred_bboxes), dtype=bool)
+    false_positives = np.zeros(len(pred_bboxes), dtype=bool)
+    missed_detections = np.zeros(len(gt_bboxes), dtype=bool)
+
+    if len(pred_bboxes) > 0 and len(gt_bboxes) > 0:
+        # Sort predictions by confidence (descending)
+        sorted_indices = np.argsort(pred_bboxes[:, 4])[::-1]
+        pred_bboxes_sorted = pred_bboxes[sorted_indices]
+
+        # Track which ground truth boxes have been matched
+        gt_matched = np.zeros(len(gt_bboxes), dtype=bool)
+
+        # For each prediction, find the best matching ground truth
+        for i, pred_bbox in enumerate(pred_bboxes_sorted):
+            best_iou = 0
+            best_gt_idx = -1
+
+            # Calculate IoU with all unmatched ground truth boxes
+            for j, gt_bbox in enumerate(gt_bboxes):
+                if gt_matched[j]:
+                    continue
+
+                # Calculate IoU using torchvision.ops.box_iou
+                pred_tensor = torch.tensor(
+                    pred_bbox[:4], dtype=torch.float32
+                ).unsqueeze(0)
+                gt_tensor = torch.tensor(
+                    gt_bbox, dtype=torch.float32
+                ).unsqueeze(0)
+                iou = ops.box_iou(pred_tensor, gt_tensor).item()
+
+                if iou > best_iou:
+                    best_iou = iou
+                    best_gt_idx = j
+
+            # Determine if this prediction is a true positive or false positive
+            pred_idx_in_original = sorted_indices[i]
+
+            if best_iou >= iou_threshold and best_gt_idx >= 0:
+                # True positive
+                true_positives[pred_idx_in_original] = True
+                gt_matched[best_gt_idx] = True
+            else:
+                # False positive
+                false_positives[pred_idx_in_original] = True
+
+        # Mark unmatched ground truth as missed detections
+        missed_detections = ~gt_matched
+
+    elif len(pred_bboxes) == 0 and len(gt_bboxes) > 0:
+        # No predictions, all ground truth are missed
+        missed_detections[:] = True
+    elif len(pred_bboxes) > 0 and len(gt_bboxes) == 0:
+        # No ground truth, all predictions are false positives
+        false_positives[:] = True
+
+    return true_positives, false_positives, missed_detections
+
+
+def evaluate_detections_hungarian(
+    pred_bboxes: np.ndarray, gt_bboxes: np.ndarray, iou_threshold: float
+) -> dict:
+    """Evaluate detection performance using Hungarian algorithm for matching.
+
+    Parameters
+    ----------
+    pred_bboxes : list
+        A list of prediction bounding boxes with columns [x1, y1, x2, y2, confidence]
+    gt_bboxes : list
+        A list of ground truth bounding boxes with columns [x1, y1, x2, y2]
+    iou_threshold : float
+        IoU threshold for considering a detection as true positive
+
+    Returns
+    -------
+    tuple
+        (true_positives, false_positives, missed_detections) where each is a boolean array
+        - true_positives: column vector with True for each predicted bbox that is a true positive
+        - false_positives: column vector with True for each predicted bbox that is a false positive
+        - missed_detections: column vector with True for each ground truth bbox that is missed
+
+    """
+    # Initialize output arrays
+    true_positives = np.zeros(len(pred_bboxes), dtype=bool)
+    false_positives = np.zeros(len(pred_bboxes), dtype=bool)
+    matched_gts = np.zeros(len(gt_bboxes), dtype=bool)
+    missed_detections = np.zeros(len(gt_bboxes), dtype=bool)  # unmatched gts
+
+    if len(pred_bboxes) > 0 and len(gt_bboxes) > 0:
+        # Compute IoU matrix (pred_bboxes x gt_bboxes)
+        iou_matrix = (
+            ops.box_iou(
+                torch.tensor(pred_bboxes[:, :4], dtype=torch.float32),
+                torch.tensor(gt_bboxes, dtype=torch.float32),
+            )
+            .cpu()
+            .numpy()
+        )
+
+        # Use Hungarian algorithm to find optimal assignment
+        pred_indices, gt_indices = linear_sum_assignment(
+            iou_matrix, maximize=True
+        )
+
+        # Mark true positives and false positives based on optimal assignment
+        for pred_idx, gt_idx in zip(pred_indices, gt_indices, strict=True):
+            if iou_matrix[pred_idx, gt_idx] > iou_threshold:
+                true_positives[pred_idx] = True
+                matched_gts[gt_idx] = True
+            else:
+                false_positives[pred_idx] = True
+
+        # Mark unmatched predictions as false positives
+        false_positives[~true_positives] = True
+
+        # Mark unmatched ground truth as missed detections
+        missed_detections[~matched_gts] = True
+
+    elif len(pred_bboxes) == 0 and len(gt_bboxes) > 0:
+        # No predictions, all ground truth are missed
+        missed_detections[:] = True
+    elif len(pred_bboxes) > 0 and len(gt_bboxes) == 0:
+        # No ground truth, all predictions are false positives
+        false_positives[:] = True
+
+    # Return sum as a dict
+    return true_positives, false_positives, missed_detections
+
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Input data
 dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
@@ -238,111 +395,210 @@ def safe_eval_string(s):
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Compute precision and recall per validation sample
-
-pr_per_validation_sample = {}
+# Evaluate detections using Hungarian algorithm and create dataframes
 
 iou_threshold = 0.1
-recall_threshold = 0.0
-
-
-# the mean average precision package assumes class_id starts at 0!
-# so if there is only one class, it assumes its id is 0
-metric_fn = MetricBuilder.build_evaluation_metric("map_2d", num_classes=1)
-
-for idx_validation_sample in range(len(val_dataset)):
-    # Get ground truth bboxes
-    # [xmin, ymin, xmax, ymax, class_id, difficult, crowd]
-    gt_bboxes_xyxy = np.c_[
-        val_dataset[idx_validation_sample][1]["boxes"].cpu().numpy(),
-        np.zeros(
-            val_dataset[idx_validation_sample][1]["boxes"].shape[0]
-        ),  # class_id = 0
-        np.zeros(
-            val_dataset[idx_validation_sample][1]["boxes"].shape[0]
-        ),  # difficult
-        np.zeros(
-            val_dataset[idx_validation_sample][1]["boxes"].shape[0]
-        ),  # crowd
-    ]
 
-    # Get predicted bboxes
-    # make class_id 0-indexed!
-    # [xmin, ymin, xmax, ymax, class_id, confidence]
-    pred_bboxes_xyxy_conf = np.c_[
-        detections_per_validation_sample[idx_validation_sample]["bbox_xyxy"],
-        detections_per_validation_sample[idx_validation_sample]["bbox_labels"]
-        - 1,  # class_id is 0-indexed!
-        detections_per_validation_sample[idx_validation_sample][
-            "bbox_confidences"
-        ],
-    ]
+# Collect all data efficiently
+list_pred_subtables = []
+list_gt_subtables = []
+
 
-    # Add gt and pred bboxes to metric
-    metric_fn.reset()
-    metric_fn.add(pred_bboxes_xyxy_conf, gt_bboxes_xyxy)
-    metric = metric_fn.value(
-        iou_thresholds=[iou_threshold],
-        recall_thresholds=np.array([recall_threshold]),
-        mpolicy="soft",
+# Loop over validation set
+for val_idx, (image, annotations) in enumerate(val_dataset):
+    # Get predictions for this image
+    pred_dict = detections_per_validation_sample[val_idx]
+    pred_bboxes = np.column_stack(
+        [pred_dict["bbox_xyxy"], pred_dict["bbox_confidences"]]
     )
 
-    # compute precision and recall for one frame
-    pr_per_validation_sample[idx_validation_sample] = {
-        "precision": metric[iou_threshold][0]["precision"][-1],
-        "recall": metric[iou_threshold][0]["recall"][-1],
+    # Get ground truth
+    gt_bboxes = annotations["boxes"].cpu().numpy()
+
+    # Evaluate detections
+    tp, fp, md = evaluate_detections_hungarian(
+        pred_bboxes, gt_bboxes, iou_threshold
+    )
+
+    # Calculate bboxes areas
+    pred_bboxes_width = pred_bboxes[:, 2] - pred_bboxes[:, 0]
+    pred_bboxes_height = pred_bboxes[:, 3] - pred_bboxes[:, 1]
+    pred_areas = pred_bboxes_width * pred_bboxes_height
+
+    gt_bboxes_width = gt_bboxes[:, 2] - gt_bboxes[:, 0]
+    gt_bboxes_height = gt_bboxes[:, 3] - gt_bboxes[:, 1]
+    gt_areas = gt_bboxes_width * gt_bboxes_height
+
+    # Create prediction subtable
+    pred_data = {
+        "prediction_ID": [
+            f"pred_{val_idx}_{i}" for i in range(len(pred_bboxes))
+        ],
+        "image_ID": annotations["image_id"],
+        "confidence": pred_dict["bbox_confidences"],
+        "TP": tp,
+        "FP": fp,
+        "bbox_area": pred_areas,
     }
+    list_pred_subtables.append(pd.DataFrame(pred_data))
 
-# average across validation samples
-print(
-    f"Average precision: {
-        np.mean([pr['precision'] for pr in pr_per_validation_sample.values()])
-    }"
+    # Create ground truth subtable
+    gt_data = {
+        "gt_annotation_ID": [
+            f"gt_{val_idx}_{i}" for i in range(len(gt_bboxes))
+        ],
+        "image_ID": annotations["image_id"],
+        "missed_detection": md,
+        "bbox_area": gt_areas,
+    }
+    list_gt_subtables.append(pd.DataFrame(gt_data))
+
+# Concatenate all dataframes
+predictions_df = pd.concat(list_pred_subtables, ignore_index=True)
+gt_annotations_df = pd.concat(list_gt_subtables, ignore_index=True)
+
+
+# %%
+gt_area_percentiles = np.percentile(
+    gt_annotations_df["bbox_area"], np.arange(0, 105, 5)
 )
-print(
-    f"Average recall: {
-        np.mean([pr['recall'] for pr in pr_per_validation_sample.values()])
-    }"
+
+bin_labels = [
+    f"{gt_area_percentiles[i]:.0f}-{gt_area_percentiles[i + 1]:.0f}"
+    for i in range(gt_area_percentiles.shape[0] - 1)
+]
+
+
+predictions_df["area_bins"] = pd.cut(
+    predictions_df["bbox_area"],
+    bins=gt_area_percentiles,  # same bins for predictions and gt
+    labels=bin_labels,
+    include_lowest=True,
+    right=False,
 )
 
-# %%
-# plot gt and pred bboxes for one frame (idx_validation_sample)
-
-fig, ax = plt.subplots()
-ax.imshow(val_dataset[idx_validation_sample][0].permute(1, 2, 0))
-for i in range(gt_bboxes_xyxy.shape[0]):
-    ax.add_patch(
-        plt.Rectangle(
-            (gt_bboxes_xyxy[i, 0], gt_bboxes_xyxy[i, 1]),
-            gt_bboxes_xyxy[i, 2] - gt_bboxes_xyxy[i, 0],  # width
-            gt_bboxes_xyxy[i, 3] - gt_bboxes_xyxy[i, 1],  # height
-            fill=False,
-            edgecolor=(0, 1, 0),
-            linewidth=2.5,
-        )
-    )
-for i in range(pred_bboxes_xyxy_conf.shape[0]):
-    ax.add_patch(
-        plt.Rectangle(
-            (pred_bboxes_xyxy_conf[i, 0], pred_bboxes_xyxy_conf[i, 1]),
-            pred_bboxes_xyxy_conf[i, 2] - pred_bboxes_xyxy_conf[i, 0],
-            pred_bboxes_xyxy_conf[i, 3] - pred_bboxes_xyxy_conf[i, 1],
-            fill=False,
-            edgecolor="red",
-            linewidth=2,
-        )
-    )
+gt_annotations_df["area_bins"] = pd.cut(
+    gt_annotations_df["bbox_area"],
+    bins=gt_area_percentiles,  # same bins for predictions and gt
+    labels=bin_labels,
+    include_lowest=True,
+    right=False,
+)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Count detections in each bin
+# Is GT really that balanced??
+predictions_per_area_bin = (
+    predictions_df["area_bins"].value_counts().sort_index()
+)
+gt_per_area_bin = gt_annotations_df["area_bins"].value_counts().sort_index()
+
+comparison_df = pd.DataFrame(
+    {"Predictions": predictions_per_area_bin, "Ground Truth": gt_per_area_bin}
+)
+
+# Plot as bar chart
+plt.figure(figsize=(10, 6))
+comparison_df.plot(
+    kind="bar",
+    figsize=(12, 6),
+    color=["skyblue", "lightcoral"],
+    stacked=False,
+)
+plt.title("Detection Counts by Area Bins Validation Set")
+plt.xlabel("Area Range (pixels^2)")
+plt.ylabel("Number of Detections")
+plt.xticks(rotation=45)
+plt.grid(True, alpha=0.3)
+plt.tight_layout()
 plt.show()
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Count true positives per bin
+
+true_positives_counts = pd.DataFrame(
+    {
+        "Predictions": predictions_per_area_bin,
+        # "Ground Truth": gt_per_area_bin,
+        "True Positives": predictions_df.loc[predictions_df["TP"], "area_bins"]
+        .value_counts()
+        .sort_index(),
+    }
+)
 
+# Plot as bar chart
+true_positives_counts.plot(
+    kind="bar",
+    figsize=(12, 6),
+    color=["skyblue", "blue"],
+    stacked=False,
+)
+plt.title("Counts per Area Bin Validation Set")
+plt.xlabel("Bbox area (pixels^2)")
+plt.ylabel("Number of Detections")
+plt.xticks(rotation=45)
+plt.grid(True, alpha=0.3)
+plt.tight_layout()
+plt.show()
 
-# %%
-# plot precision and recall for one iou threshold and last frame
-plt.plot(
-    metric[iou_threshold][0]["recall"],
-    metric[iou_threshold][0]["precision"],
-    ".-",
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Count missed detections per bin
+
+missed_detections_counts = pd.DataFrame(
+    {
+        "Ground Truth": gt_per_area_bin,
+        "Matched Ground Truth": gt_annotations_df.loc[
+            ~gt_annotations_df["missed_detection"], "area_bins"
+        ]
+        .value_counts()
+        .sort_index(),
+    }
+)
+
+# Plot as bar chart
+missed_detections_counts.plot(
+    kind="bar",
+    figsize=(12, 6),
+    color=["lightcoral", "green"],
+    stacked=False,
+)
+plt.title("Counts per Area Bin Validation Set")
+plt.xlabel("Area Range (pixels^2)")
+plt.ylabel("Number of Detections")
+plt.xticks(rotation=45)
+plt.grid(True, alpha=0.3)
+plt.tight_layout()
+plt.show()
+
+
+# %%%%%%%%%%%
+# Image id histogram
+
+detections_per_image_id = pd.DataFrame(
+    {
+        "Predictions": predictions_df.groupby("image_ID").count()[
+            "prediction_ID"
+        ],
+        "Ground Truth": gt_annotations_df.groupby("image_ID").count()[
+            "gt_annotation_ID"
+        ],
+        "True Positives": predictions_df.groupby("image_ID")["TP"].sum(),
+    }
+)
+
+# Plot as bar chart
+plt.figure(figsize=(10, 6))
+detections_per_image_id.plot(
+    kind="bar",
+    figsize=(12, 6),
+    color=["skyblue", "lightcoral", "green"],
+    stacked=False,
 )
-plt.xlabel("Recall")
-plt.ylabel("Precision")
-plt.title("Precision-Recall Curve")
+plt.title("Detections per Image ID")
+plt.xlabel("Image ID")
+plt.ylabel("Number of Detections")
+plt.xticks(rotation=45)
+plt.grid(True, alpha=0.3)
+plt.tight_layout()
 plt.show()

From 831583c0eaf9646fc0a18576aa9993370e53f9d2 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 3 Jul 2025 14:49:31 +0100
Subject: [PATCH 13/72] Update dependencies

---
 pyproject.toml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index db4af857..80577d4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,10 +20,7 @@ classifiers = [
 ]
 dependencies = [
   "movement",
-  "boxmot",
   "mlflow-skinny",
-  "pycocotools",
-  "torchmetrics"
 ]
 
 [project.urls]

From 9a05c952aaed322c2db48f72966fd633ddb097db Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 3 Jul 2025 17:26:02 +0100
Subject: [PATCH 14/72] Add proto utilities

---
 ethology/datasets/create.py              | 74 ++++++++++++++++++++
 ethology/detectors/evaluate.py           | 79 +++++++++++++++++++++
 ethology/detectors/inference.py          | 89 ++++++++++++++++++++++++
 ethology/detectors/inference/__init__.py |  0
 ethology/detectors/load.py               | 40 +++++++++++
 ethology/detectors/models/__init__.py    |  0
 ethology/detectors/test/__init__.py      |  0
 ethology/detectors/train/__init__.py     |  0
 ethology/detectors/val/__init__.py       |  0
 ethology/mlflow.py                       | 54 ++++++++++++++
 10 files changed, 336 insertions(+)
 create mode 100644 ethology/datasets/create.py
 create mode 100644 ethology/detectors/evaluate.py
 create mode 100644 ethology/detectors/inference.py
 delete mode 100644 ethology/detectors/inference/__init__.py
 create mode 100644 ethology/detectors/load.py
 delete mode 100644 ethology/detectors/models/__init__.py
 delete mode 100644 ethology/detectors/test/__init__.py
 delete mode 100644 ethology/detectors/train/__init__.py
 delete mode 100644 ethology/detectors/val/__init__.py
 create mode 100644 ethology/mlflow.py

diff --git a/ethology/datasets/create.py b/ethology/datasets/create.py
new file mode 100644
index 00000000..bcb10677
--- /dev/null
+++ b/ethology/datasets/create.py
@@ -0,0 +1,74 @@
+"""Utilities for creating datasets."""
+
+from pathlib import Path
+
+import torch
+import torchvision.transforms.v2 as transforms
+from loguru import logger
+from torch.utils.data import random_split
+from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
+
+
+def create_coco_dataset(
+    images_dir: str | Path,
+    annotations_file: str | Path,
+    composed_transform: transforms.Compose,
+) -> CocoDetection:
+    """Create a COCO dataset for object detection.
+
+    Note: transforms are applied to the full dataset. If the dataset
+    is later split, all splits will have the same transforms.
+    """
+    dataset_coco = CocoDetection(
+        root=images_dir,
+        annFile=annotations_file,
+        transforms=composed_transform,
+    )
+
+    # wrap dataset for transforms v2
+    dataset_transformed = wrap_dataset_for_transforms_v2(dataset_coco)
+
+    return dataset_transformed
+
+
+def split_dataset(
+    dataset: torch.utils.data.Dataset,
+    train_val_test_fractions: list[float],
+    seed: int,
+) -> tuple[
+    torch.utils.data.Dataset,
+    torch.utils.data.Dataset,
+    torch.utils.data.Dataset,
+]:
+    """Split a dataset into train, validation, and test sets.
+
+    Note: transforms are already applied to the input dataset.
+    """
+    # Check that the fractions sum to 1
+    if sum(train_val_test_fractions) != 1:
+        raise ValueError("The split fractions must sum to 1.")
+
+    # Log transforms applied to the dataset
+    logger.info(
+        f"Dataset transforms (propagated to all splits): {dataset.transforms}"
+    )
+
+    # Create random number generator for reproducibility if seed is provided
+    rng_split = None
+    if seed is not None:
+        rng_split = torch.Generator().manual_seed(seed)
+
+    # Split dataset
+    train_dataset, test_dataset, val_dataset = random_split(
+        dataset,
+        train_val_test_fractions,
+        generator=rng_split,
+    )
+
+    # Print number of samples in each split
+    logger.info(f"Seed: {seed}")
+    logger.info(f"Number of training samples: {len(train_dataset)}")
+    logger.info(f"Number of validation samples: {len(val_dataset)}")
+    logger.info(f"Number of test samples: {len(test_dataset)}")
+
+    return train_dataset, test_dataset, val_dataset
diff --git a/ethology/detectors/evaluate.py b/ethology/detectors/evaluate.py
new file mode 100644
index 00000000..d07baba8
--- /dev/null
+++ b/ethology/detectors/evaluate.py
@@ -0,0 +1,79 @@
+"""Utilities for evaluating detectors."""
+
+import numpy as np
+import torch
+import torchvision.ops as ops
+from scipy.optimize import linear_sum_assignment
+
+
+def evaluate_detections_hungarian(
+    pred_bboxes: np.ndarray, gt_bboxes: np.ndarray, iou_threshold: float
+) -> dict:
+    """Compute true positives, false positives, and missed detections.
+
+    Uses Hungarian algorithm for matching.
+
+    Parameters
+    ----------
+    pred_bboxes : list
+        A list of prediction bounding boxes with the first four columns being
+        [x1, y1, x2, y2]
+    gt_bboxes : list
+        A list of ground truth bounding boxes with the first four columns being
+        [x1, y1, x2, y2]
+    iou_threshold : float
+        IoU threshold for considering a detection as true positive
+
+    Returns
+    -------
+    tuple
+        A tuple of three boolean arrays:
+        - true_positives: True for each predicted bbox that is a true positive
+        - false_positives: True for each predicted bbox that is a false positive
+        - missed_detections: True for each ground truth bbox that is missed
+
+    """
+    # Initialize output arrays
+    true_positives = np.zeros(len(pred_bboxes), dtype=bool)
+    false_positives = np.zeros(len(pred_bboxes), dtype=bool)
+    matched_gts = np.zeros(len(gt_bboxes), dtype=bool)
+    missed_detections = np.zeros(len(gt_bboxes), dtype=bool)  # unmatched gts
+
+    if len(pred_bboxes) > 0 and len(gt_bboxes) > 0:
+        # Compute IoU matrix (pred_bboxes x gt_bboxes)
+        iou_matrix = (
+            ops.box_iou(
+                torch.tensor(pred_bboxes[:, :4], dtype=torch.float32),
+                torch.tensor(gt_bboxes, dtype=torch.float32),
+            )
+            .cpu()
+            .numpy()
+        )
+
+        # Use Hungarian algorithm to find optimal assignment
+        pred_indices, gt_indices = linear_sum_assignment(
+            iou_matrix, maximize=True
+        )
+
+        # Mark true positives and false positives based on optimal assignment
+        for pred_idx, gt_idx in zip(pred_indices, gt_indices, strict=True):
+            if iou_matrix[pred_idx, gt_idx] > iou_threshold:
+                true_positives[pred_idx] = True
+                matched_gts[gt_idx] = True
+            else:
+                false_positives[pred_idx] = True
+
+        # Mark unmatched predictions as false positives
+        false_positives[~true_positives] = True
+
+        # Mark unmatched ground truth as missed detections
+        missed_detections[~matched_gts] = True
+
+    elif len(pred_bboxes) == 0 and len(gt_bboxes) > 0:
+        # No predictions, all ground truth are missed
+        missed_detections[:] = True
+    elif len(pred_bboxes) > 0 and len(gt_bboxes) == 0:
+        # No ground truth, all predictions are false positives
+        false_positives[:] = True
+
+    return true_positives, false_positives, missed_detections
diff --git a/ethology/detectors/inference.py b/ethology/detectors/inference.py
new file mode 100644
index 00000000..d32417fb
--- /dev/null
+++ b/ethology/detectors/inference.py
@@ -0,0 +1,89 @@
+"""Inference utilities for detectors."""
+
+import numpy as np
+import torch
+
+
+def run_detector_on_dataset(
+    model: torch.nn.Module,
+    dataset: torch.utils.data.Dataset,  # dataloader instead?
+    device: torch.device,
+) -> dict:
+    """Run detection on each sample of a dataset.
+
+    Note that the dataset transforms are applied to the sampled images.
+    The output is a dictionary with the detections per sample as a dictionary.
+    The detections dictionary has the following keys:
+    - "boxes": tensor of shape [N, 4]
+    - "scores": tensor of shape [N]
+    - "labels": tensor of shape [N]
+    """
+    # Ensure model is in evaluation mode
+    model.eval()
+
+    # Run detection
+    detections_per_sample = {}
+    for idx, (image, annotations) in enumerate(dataset):
+        # Place image tensor on device and add batch dimension
+        image = image.to(device)[None]  # [1, C, H, W]
+
+        # Run detection
+        with torch.no_grad():
+            detections = model(image)[0]  # select single batch dimension
+
+        # Add to dict
+        detections_per_sample[idx] = detections
+
+    return detections_per_sample
+
+
+def run_detector_on_dataloader(
+    model: torch.nn.Module,
+    dataloader: torch.utils.data.DataLoader,
+    device: torch.device,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Run detection on a dataloader.
+
+    The output is a list of dictionary with the detections per batch.
+    The detections dictionary has the following keys:
+    - "boxes": tensor of shape [N, 4]
+    - "scores": tensor of shape [N]
+    - "labels": tensor of shape [N]
+    """
+    # Ensure model is in evaluation mode
+    model.eval()
+
+    # Compute detections per batch
+    detections_per_batch = {}
+    for batch_idx, (image_batch, _annotations_batch) in enumerate(dataloader):
+        # Place batch of images on device
+        image_batch = image_batch.to(device)  # [B, C, H, W]
+
+        # Run detection
+        with torch.no_grad():
+            detections_batch = model(
+                image_batch
+            )  # list of n-batch dictionaries
+
+        # Add to dict
+        detections_per_batch[batch_idx] = detections_batch
+
+    return detections_per_batch
+
+
+# def run_detector_on_image(
+#     model: torch.nn.Module,
+#     image: torch.Tensor,
+#     device: torch.device,
+# ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+#     """Run detection on an image."""
+#     pass
+
+
+# def run_detector_on_video(
+#     model: torch.nn.Module,
+#     video_path: str,
+#     device: torch.device,
+# ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+#     """Run detection on a video."""
+#     pass
diff --git a/ethology/detectors/inference/__init__.py b/ethology/detectors/inference/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/ethology/detectors/load.py b/ethology/detectors/load.py
new file mode 100644
index 00000000..55a2804d
--- /dev/null
+++ b/ethology/detectors/load.py
@@ -0,0 +1,40 @@
+"""Utilities for loading object detectors."""
+
+import torch
+import torchvision
+
+
+def load_fasterrcnn_resnet50_fpn_v2(
+    trained_model_path: str,
+    num_classes: int,
+    device: torch.device | None = None,
+) -> torch.nn.Module:
+    """Load a Faster R-CNN ResNet50 FPN v2 detector."""
+    # initialize model
+    model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(
+        weights=None,
+        weights_backbone=None,
+        num_classes=num_classes,
+    )
+
+    # load state dict
+    checkpoint = torch.load(trained_model_path)
+
+    # if model is saved with model. prefix, remove it
+    if any([ky.startswith("model.") for ky in checkpoint["state_dict"]]):
+        model_weights = {
+            k.lstrip("model."): v
+            for k, v in checkpoint["state_dict"].items()
+            if k.startswith("model.")
+        }
+    else:
+        model_weights = checkpoint["state_dict"]  # ok?
+
+    # Load weights into model
+    model.load_state_dict(model_weights)
+
+    # Put model on device if provided
+    if device:
+        model.to(device)
+
+    return model
diff --git a/ethology/detectors/models/__init__.py b/ethology/detectors/models/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/ethology/detectors/test/__init__.py b/ethology/detectors/test/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/ethology/detectors/train/__init__.py b/ethology/detectors/train/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/ethology/detectors/val/__init__.py b/ethology/detectors/val/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/ethology/mlflow.py b/ethology/mlflow.py
new file mode 100644
index 00000000..0e4391c2
--- /dev/null
+++ b/ethology/mlflow.py
@@ -0,0 +1,54 @@
+"""Utilities for MLflow."""
+
+import ast
+from pathlib import Path
+
+from mlflow.tracking import MlflowClient
+
+
+def read_mlflow_params(
+    trained_model_path: str, tracking_uri: str = None
+) -> dict:
+    """Read parameters for a specific MLflow run."""
+    # Create MLflow client
+    mlruns_path = str(Path(trained_model_path).parents[3])
+    client = MlflowClient(tracking_uri=mlruns_path)
+
+    # Get the run
+    runID = Path(trained_model_path).parents[1].stem
+    run = client.get_run(runID)
+
+    # Access parameters
+    params = run.data.params
+    params["run_name"] = run.info.run_name
+
+    return params
+
+
+def read_config_from_mlflow_params(mlflow_params: dict) -> dict:
+    """Read config from MLflow parameters."""
+    config = {
+        k.removeprefix("config/"): ast.literal_eval(v)
+        for k, v in mlflow_params.items()
+        if k.startswith("config/")
+    }
+    return config
+
+
+def read_cli_args_from_mlflow_params(mlflow_params: dict) -> dict:
+    """Read CLI arguments from MLflow parameters."""
+    cli_args = {
+        k.removeprefix("cli_args/"): safe_eval_string(v)
+        for k, v in mlflow_params.items()
+        if k.startswith("cli_args/")
+    }
+    return cli_args
+
+
+def safe_eval_string(s):
+    """Try to evaluate a string as a literal, otherwise return as-is."""
+    try:
+        return ast.literal_eval(s)
+    except (ValueError, SyntaxError):
+        # return as-is if not a valid literal
+        return s

From 23145482a8214dbca5516866fcf218b81fb7fb29 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 3 Jul 2025 17:47:50 +0100
Subject: [PATCH 15/72] Refactor notebook using new utils. Add precision and
 recall to plots

---
 .../notebook_run_detection_on_dataset.py      | 463 ++++++------------
 1 file changed, 153 insertions(+), 310 deletions(-)

diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_dataset.py
index 03892295..b37e0aea 100644
--- a/notebooks/notebook_run_detection_on_dataset.py
+++ b/notebooks/notebook_run_detection_on_dataset.py
@@ -5,200 +5,41 @@
 """
 
 # %%
-import ast
 from pathlib import Path
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import torch
-import torchvision.ops as ops
 import torchvision.transforms.v2 as transforms
 import xarray as xr
-from mlflow.tracking import MlflowClient
-from scipy.optimize import linear_sum_assignment
 from torch.utils.data import random_split
-from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
-from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
 
-# %matplotlib widget
+from ethology.datasets.create import create_coco_dataset
+from ethology.detectors.evaluate import evaluate_detections_hungarian
+from ethology.detectors.inference import run_detector_on_dataset
+from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
+from ethology.mlflow import (
+    read_cli_args_from_mlflow_params,
+    read_config_from_mlflow_params,
+    read_mlflow_params,
+)
 
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Set xarray options
 xr.set_options(display_expand_attrs=False)
 
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Detection evaluation function
-
-
-def evaluate_detections(pred_bboxes, gt_bboxes, iou_threshold=0.5):
-    """Evaluate detection performance using IoU-based matching.
-
-    Parameters
-    ----------
-    pred_bboxes : np.ndarray
-        Array of predicted bounding boxes with columns [x1, y1, x2, y2, confidence]
-    gt_bboxes : np.ndarray
-        Array of ground truth bounding boxes with columns [x1, y1, x2, y2]
-    iou_threshold : float, optional
-        IoU threshold for considering a detection as true positive, default 0.5
-
-    Returns
-    -------
-    tuple
-        (true_positives, false_positives, missed_detections) where each is a boolean array
-        - true_positives: column vector with True for each predicted bbox that is a true positive
-        - false_positives: column vector with True for each predicted bbox that is a false positive
-        - missed_detections: column vector with True for each ground truth bbox that is missed
-
-    """
-    # Initialize output arrays
-    true_positives = np.zeros(len(pred_bboxes), dtype=bool)
-    false_positives = np.zeros(len(pred_bboxes), dtype=bool)
-    missed_detections = np.zeros(len(gt_bboxes), dtype=bool)
-
-    if len(pred_bboxes) > 0 and len(gt_bboxes) > 0:
-        # Sort predictions by confidence (descending)
-        sorted_indices = np.argsort(pred_bboxes[:, 4])[::-1]
-        pred_bboxes_sorted = pred_bboxes[sorted_indices]
-
-        # Track which ground truth boxes have been matched
-        gt_matched = np.zeros(len(gt_bboxes), dtype=bool)
-
-        # For each prediction, find the best matching ground truth
-        for i, pred_bbox in enumerate(pred_bboxes_sorted):
-            best_iou = 0
-            best_gt_idx = -1
-
-            # Calculate IoU with all unmatched ground truth boxes
-            for j, gt_bbox in enumerate(gt_bboxes):
-                if gt_matched[j]:
-                    continue
-
-                # Calculate IoU using torchvision.ops.box_iou
-                pred_tensor = torch.tensor(
-                    pred_bbox[:4], dtype=torch.float32
-                ).unsqueeze(0)
-                gt_tensor = torch.tensor(
-                    gt_bbox, dtype=torch.float32
-                ).unsqueeze(0)
-                iou = ops.box_iou(pred_tensor, gt_tensor).item()
-
-                if iou > best_iou:
-                    best_iou = iou
-                    best_gt_idx = j
-
-            # Determine if this prediction is a true positive or false positive
-            pred_idx_in_original = sorted_indices[i]
-
-            if best_iou >= iou_threshold and best_gt_idx >= 0:
-                # True positive
-                true_positives[pred_idx_in_original] = True
-                gt_matched[best_gt_idx] = True
-            else:
-                # False positive
-                false_positives[pred_idx_in_original] = True
-
-        # Mark unmatched ground truth as missed detections
-        missed_detections = ~gt_matched
-
-    elif len(pred_bboxes) == 0 and len(gt_bboxes) > 0:
-        # No predictions, all ground truth are missed
-        missed_detections[:] = True
-    elif len(pred_bboxes) > 0 and len(gt_bboxes) == 0:
-        # No ground truth, all predictions are false positives
-        false_positives[:] = True
-
-    return true_positives, false_positives, missed_detections
-
-
-def evaluate_detections_hungarian(
-    pred_bboxes: np.ndarray, gt_bboxes: np.ndarray, iou_threshold: float
-) -> dict:
-    """Evaluate detection performance using Hungarian algorithm for matching.
-
-    Parameters
-    ----------
-    pred_bboxes : list
-        A list of prediction bounding boxes with columns [x1, y1, x2, y2, confidence]
-    gt_bboxes : list
-        A list of ground truth bounding boxes with columns [x1, y1, x2, y2]
-    iou_threshold : float
-        IoU threshold for considering a detection as true positive
-
-    Returns
-    -------
-    tuple
-        (true_positives, false_positives, missed_detections) where each is a boolean array
-        - true_positives: column vector with True for each predicted bbox that is a true positive
-        - false_positives: column vector with True for each predicted bbox that is a false positive
-        - missed_detections: column vector with True for each ground truth bbox that is missed
-
-    """
-    # Initialize output arrays
-    true_positives = np.zeros(len(pred_bboxes), dtype=bool)
-    false_positives = np.zeros(len(pred_bboxes), dtype=bool)
-    matched_gts = np.zeros(len(gt_bboxes), dtype=bool)
-    missed_detections = np.zeros(len(gt_bboxes), dtype=bool)  # unmatched gts
-
-    if len(pred_bboxes) > 0 and len(gt_bboxes) > 0:
-        # Compute IoU matrix (pred_bboxes x gt_bboxes)
-        iou_matrix = (
-            ops.box_iou(
-                torch.tensor(pred_bboxes[:, :4], dtype=torch.float32),
-                torch.tensor(gt_bboxes, dtype=torch.float32),
-            )
-            .cpu()
-            .numpy()
-        )
-
-        # Use Hungarian algorithm to find optimal assignment
-        pred_indices, gt_indices = linear_sum_assignment(
-            iou_matrix, maximize=True
-        )
-
-        # Mark true positives and false positives based on optimal assignment
-        for pred_idx, gt_idx in zip(pred_indices, gt_indices, strict=True):
-            if iou_matrix[pred_idx, gt_idx] > iou_threshold:
-                true_positives[pred_idx] = True
-                matched_gts[gt_idx] = True
-            else:
-                false_positives[pred_idx] = True
-
-        # Mark unmatched predictions as false positives
-        false_positives[~true_positives] = True
-
-        # Mark unmatched ground truth as missed detections
-        missed_detections[~matched_gts] = True
-
-    elif len(pred_bboxes) == 0 and len(gt_bboxes) > 0:
-        # No predictions, all ground truth are missed
-        missed_detections[:] = True
-    elif len(pred_bboxes) > 0 and len(gt_bboxes) == 0:
-        # No ground truth, all predictions are false positives
-        false_positives[:] = True
-
-    # Return sum as a dict
-    return true_positives, false_positives, missed_detections
-
+# %matplotlib widget
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Input data
+# Input data - in domain
 dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
 
-trained_model_path = Path(
-    "/home/sminano/swc/project_crabs/ml-runs/617393114420881798/f348d9d196934073bece1b877cbc4d38/checkpoints/last.ckpt"
-)
-
-trained_model_mlflow_params_path = Path(
-    "/home/sminano/swc/project_crabs/ml-runs/617393114420881798/f348d9d196934073bece1b877cbc4d38/params"
-)  # for config
-
 
-# to save output frames and detections
-output_parent_dir = Path("/home/sminano/swc/project_ethology")
-
-flag_save_frames = False
+trained_model_dict = {
+    "all_data_augm_seed_42": Path(
+        "/home/sminano/swc/project_crabs/ml-runs/617393114420881798/f348d9d196934073bece1b877cbc4d38/checkpoints/last.ckpt"
+    )
+}
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Set default device: CUDA if available, otherwise mps, otherwise CPU
@@ -215,74 +56,26 @@ def evaluate_detections_hungarian(
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Retrieve model config and CLI args from mlflow
 
-
-def read_mlflow_params(
-    trained_model_path: str, tracking_uri: str = None
-) -> dict:
-    """Read parameters for a specific MLflow run."""
-    # Create MLflow client
-    mlruns_path = str(Path(trained_model_path).parents[3])
-    client = MlflowClient(tracking_uri=mlruns_path)
-
-    # Get the run
-    runID = Path(trained_model_path).parents[1].stem
-    run = client.get_run(runID)
-
-    # Access parameters
-    params = run.data.params
-    params["run_name"] = run.info.run_name
-
-    return params
-
-
 mlflow_params = read_mlflow_params(trained_model_path)
-config = {
-    k.removeprefix("config/"): ast.literal_eval(v)
-    for k, v in mlflow_params.items()
-    if k.startswith("config/")
-}
-
-
-def safe_eval_string(s):
-    """Try to evaluate a string as a literal, otherwise return as-is."""
-    try:
-        return ast.literal_eval(s)
-    except (ValueError, SyntaxError):
-        # return as-is if not a valid literal
-        return s
-
-
-cli_args = {
-    k.removeprefix("cli_args/"): safe_eval_string(v)
-    for k, v in mlflow_params.items()
-    if k.startswith("cli_args/")
-}
+config = read_config_from_mlflow_params(mlflow_params)
+cli_args = read_cli_args_from_mlflow_params(mlflow_params)
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Load model
 
-# Load structure
-model = fasterrcnn_resnet50_fpn_v2(
-    weights=None,
-    weights_backbone=None,
+model = load_fasterrcnn_resnet50_fpn_v2(
+    trained_model_path,
     num_classes=config["num_classes"],
+    device=device,
 )
 
-# Read state dict
-state_dict = torch.load(trained_model_path)
-state_dict_model = {
-    k.lstrip("model."): v
-    for k, v in state_dict["state_dict"].items()
-    if k.startswith("model.")
-}
-
-# Load weights into model and set to evaluation mode
-model.load_state_dict(state_dict_model)
+# Set to evaluation mode
 model.eval()
-model.to(device)
 
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Define transforms to apply to input frames
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Create COCO dataset
+annotations_filename = Path(cli_args["annotation_files"][0]).name
 inference_transforms = transforms.Compose(
     [
         transforms.ToImage(),
@@ -290,56 +83,25 @@ def safe_eval_string(s):
     ]
 )
 
-# Sanitize bounding boxes?
 
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Build Pytorch dataset
-seed_n = cli_args["seed_n"]
-annotations_filename = Path(cli_args["annotation_files"][0]).name
-
-# create "default" COCO dataset
-dataset_coco = CocoDetection(
-    Path(dataset_dir) / "frames",
-    Path(dataset_dir) / "annotations" / annotations_filename,
-    transforms=inference_transforms,
+dataset_coco = create_coco_dataset(
+    images_dir=Path(dataset_dir) / "frames",
+    annotations_file=Path(dataset_dir) / "annotations" / annotations_filename,
+    composed_transform=inference_transforms,
 )
 
-# wrap dataset for transforms v2
-dataset_transformed = wrap_dataset_for_transforms_v2(dataset_coco)
-
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Split dataset
-# def _collate_fn(self, batch: tuple) -> tuple:
-#     """Collate function used for dataloaders.
-
-#     A custom function is needed for detection
-#     because the number of bounding boxes varies
-#     between images of the same batch.
-#     See https://pytorch.org/vision/main/auto_examples/transforms/plot_transforms_e2e.html#data-loading-and-training-loop
-
-#     Parameters
-#     ----------
-#     batch : tuple
-#         a tuple of 2 tuples, the first one holding all images in the batch,
-#         and the second one holding the corresponding annotations.
-
-#     Returns
-#     -------
-#     tuple
-#         a tuple of length = batch size, made up of (image, annotations)
-#         tuples.
-
-#     """
-#     return tuple(zip(*batch))
-
+# Split dataset like in crabs repo
 
 # Split data into train and test-val sets
+seed_n = cli_args["seed_n"]
 rng_train_split = torch.Generator().manual_seed(seed_n)
 rng_val_split = torch.Generator().manual_seed(seed_n)
 
+# Split train and test-val sets
 train_dataset, test_val_dataset = random_split(
-    dataset_transformed,
+    dataset_coco,
     [config["train_fraction"], 1 - config["train_fraction"]],
     generator=rng_train_split,
 )
@@ -359,30 +121,27 @@ def safe_eval_string(s):
 print(f"Number of validation samples: {len(val_dataset)}")
 print(f"Number of test samples: {len(test_dataset)}")
 
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Create dataloader
+
+# dataloader = torch.utils.data.DataLoader(
+#     val_dataset,
+#     batch_size=1,
+#     shuffle=True,
+# )
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Run detection on validation set
+detections_dict_per_sample = run_detector_on_dataset(
+    model=model,
+    dataset=val_dataset,
+    device=device,
+)
 
-# TODO: use dataloader for efficiency?
+# reshape
 detections_per_validation_sample = {}
-
-for val_idx, (image, annotations) in enumerate(val_dataset):
-    # Apply transforms to frame and place tensor on device
-    image_tensor = inference_transforms(image).to(device)[None]
-
-    # Put annotations in same device as image
-    annotations["boxes"] = annotations["boxes"].to(device)
-    annotations["labels"] = annotations["labels"].to(device)
-
-    # Run detection
-    with torch.no_grad():
-        # use [0] to select the one image in the batch
-        # Returns: dictionary with data of the predicted bounding boxes.
-        # The keys are: "boxes", "scores", and "labels". The labels
-        # refer to the class of the object detected, and not its ID.
-        detections_dict = model(image_tensor)[0]
-
-    # Add to dict
+for val_idx in range(len(val_dataset)):
+    detections_dict = detections_dict_per_sample[val_idx]
     bboxes_xyxy = detections_dict["boxes"].cpu().numpy()
 
     detections_per_validation_sample[val_idx] = {
@@ -435,6 +194,7 @@ def safe_eval_string(s):
             f"pred_{val_idx}_{i}" for i in range(len(pred_bboxes))
         ],
         "image_ID": annotations["image_id"],
+        "val_batch_idx": val_idx,
         "confidence": pred_dict["bbox_confidences"],
         "TP": tp,
         "FP": fp,
@@ -448,6 +208,7 @@ def safe_eval_string(s):
             f"gt_{val_idx}_{i}" for i in range(len(gt_bboxes))
         ],
         "image_ID": annotations["image_id"],
+        "val_batch_idx": val_idx,
         "missed_detection": md,
         "bbox_area": gt_areas,
     }
@@ -457,8 +218,45 @@ def safe_eval_string(s):
 predictions_df = pd.concat(list_pred_subtables, ignore_index=True)
 gt_annotations_df = pd.concat(list_gt_subtables, ignore_index=True)
 
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Check average precision and recall on validation set
+
+
+precision_recall_df = pd.DataFrame(
+    {
+        "TP": predictions_df.groupby("image_ID")["TP"].sum(),
+        "FP": predictions_df.groupby("image_ID")["FP"].sum(),
+        "MD": gt_annotations_df.groupby("image_ID")["missed_detection"].sum(),
+        "GT": gt_annotations_df.groupby("image_ID").count()[
+            "gt_annotation_ID"
+        ],
+        "val_batch_idx": predictions_df.groupby("image_ID")[
+            "val_batch_idx"
+        ].first(),
+    }
+)
+
+# sort by val_batch_idx
+precision_recall_df = precision_recall_df.sort_values(by="val_batch_idx")
+precision_recall_df = precision_recall_df.reset_index()
+
+precision_recall_df["precision"] = precision_recall_df["TP"] / (
+    precision_recall_df["TP"] + precision_recall_df["FP"]
+)
+precision_recall_df["recall"] = (
+    precision_recall_df["TP"] / precision_recall_df["GT"]
+)
+
+print(precision_recall_df)
+print(f"Average precision: {precision_recall_df['precision'].mean()}")
+print(f"Average recall: {precision_recall_df['recall'].mean()}")
+
+# Average precision: 0.9456786718983294
+# Average recall: 0.8494677009613534
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Discretize annotations based on area
 
-# %%
 gt_area_percentiles = np.percentile(
     gt_annotations_df["bbox_area"], np.arange(0, 105, 5)
 )
@@ -487,7 +285,7 @@ def safe_eval_string(s):
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Count detections in each bin
+# Count detections in each area bin
 # Is GT really that balanced??
 predictions_per_area_bin = (
     predictions_df["area_bins"].value_counts().sort_index()
@@ -495,7 +293,10 @@ def safe_eval_string(s):
 gt_per_area_bin = gt_annotations_df["area_bins"].value_counts().sort_index()
 
 comparison_df = pd.DataFrame(
-    {"Predictions": predictions_per_area_bin, "Ground Truth": gt_per_area_bin}
+    {
+        "Predictions": predictions_per_area_bin,
+        "Ground Truth": gt_per_area_bin,
+    }
 )
 
 # Plot as bar chart
@@ -519,29 +320,50 @@ def safe_eval_string(s):
 true_positives_counts = pd.DataFrame(
     {
         "Predictions": predictions_per_area_bin,
-        # "Ground Truth": gt_per_area_bin,
         "True Positives": predictions_df.loc[predictions_df["TP"], "area_bins"]
         .value_counts()
         .sort_index(),
     }
 )
 
+true_positives_counts["precision"] = (
+    true_positives_counts["True Positives"]
+    / true_positives_counts["Predictions"]
+)
+
 # Plot as bar chart
-true_positives_counts.plot(
+fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+true_positives_counts.loc[:, ["Predictions", "True Positives"]].plot(
     kind="bar",
+    ax=ax,
     figsize=(12, 6),
     color=["skyblue", "blue"],
     stacked=False,
 )
-plt.title("Counts per Area Bin Validation Set")
-plt.xlabel("Bbox area (pixels^2)")
-plt.ylabel("Number of Detections")
-plt.xticks(rotation=45)
-plt.grid(True, alpha=0.3)
+ax.set_title("Counts per Area Bin Validation Set")
+ax.set_xlabel("Bbox area (pixels^2)")
+ax.set_ylabel("Number of Detections")
+ax.tick_params(axis="x", rotation=45)
+ax.grid(True, alpha=0.3)
+
+
+# add line plot for precision on right y-axis
+ax2 = ax.twinx()
+ax2.plot(
+    range(len(true_positives_counts)),
+    true_positives_counts["precision"],
+    color="red",
+    marker="o",
+    label="Precision",
+    linewidth=2,
+)
+ax2.set_ylabel("Precision", color="red")
+ax2.tick_params(axis="y", labelcolor="red")
+ax2.set_ylim(0.4, 1.01)  # Precision is between 0 and 1
+
 plt.tight_layout()
 plt.show()
 
-
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Count missed detections per bin
 
@@ -556,23 +378,44 @@ def safe_eval_string(s):
     }
 )
 
+missed_detections_counts["recall"] = (
+    missed_detections_counts["Matched Ground Truth"]
+    / missed_detections_counts["Ground Truth"]
+)
+
 # Plot as bar chart
-missed_detections_counts.plot(
+fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+missed_detections_counts.loc[:, ["Ground Truth", "Matched Ground Truth"]].plot(
     kind="bar",
+    ax=ax,
     figsize=(12, 6),
     color=["lightcoral", "green"],
     stacked=False,
 )
-plt.title("Counts per Area Bin Validation Set")
-plt.xlabel("Area Range (pixels^2)")
-plt.ylabel("Number of Detections")
-plt.xticks(rotation=45)
-plt.grid(True, alpha=0.3)
+ax.set_title("Counts per Area Bin Validation Set")
+ax.set_xlabel("Area Range (pixels^2)")
+ax.set_ylabel("Number of Detections")
+ax.tick_params(axis="x", rotation=45)
+ax.grid(True, alpha=0.3)
+
+
+# add line plot for recall on right y-axis
+ax2 = ax.twinx()
+ax2.plot(
+    range(len(missed_detections_counts)),
+    missed_detections_counts["recall"],
+    color="blue",
+    marker="o",
+    linewidth=2,
+)
+ax2.tick_params(axis="y", labelcolor="blue")
+ax2.set_ylabel("Recall", color="blue")
+ax2.set_ylim(0.4, 1.01)  # Recall is between 0 and 1
+
 plt.tight_layout()
 plt.show()
 
-
-# %%%%%%%%%%%
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Image id histogram
 
 detections_per_image_id = pd.DataFrame(

From af631b1adb6a998d551010bc726d323d222e72db Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 3 Jul 2025 18:48:56 +0100
Subject: [PATCH 16/72] Change notebook to binning by diagonal

---
 .../notebook_run_detection_on_dataset.py      | 142 ++++++++++++------
 1 file changed, 95 insertions(+), 47 deletions(-)

diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_dataset.py
index b37e0aea..264b751a 100644
--- a/notebooks/notebook_run_detection_on_dataset.py
+++ b/notebooks/notebook_run_detection_on_dataset.py
@@ -13,6 +13,7 @@
 import torch
 import torchvision.transforms.v2 as transforms
 import xarray as xr
+from pycocotools.coco import COCO
 from torch.utils.data import random_split
 
 from ethology.datasets.create import create_coco_dataset
@@ -35,12 +36,54 @@
 dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
 
 
-trained_model_dict = {
-    "all_data_augm_seed_42": Path(
-        "/home/sminano/swc/project_crabs/ml-runs/617393114420881798/f348d9d196934073bece1b877cbc4d38/checkpoints/last.ckpt"
-    )
+experiment_ID = "617393114420881798"
+ml_runs_experiment_dir = (
+    Path("/home/sminano/swc/project_crabs/ml-runs") / experiment_ID
+)
+annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
+
+
+# percentile is of bbox diagonal!
+models_dict = {
+    "above_0th_percentile_seed_42": (
+        ml_runs_experiment_dir
+        / "f348d9d196934073bece1b877cbc4d38"
+        / "checkpoints"
+        / "last.ckpt"
+    ),
+    "above_5th_percentile_seed_42": (
+        ml_runs_experiment_dir
+        / "e72e53b23df142ae859dd590798b4162"
+        / "checkpoints"
+        / "last.ckpt"
+    ),
 }
 
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+full_gt_annotations_file = annotations_dir / "VIA_JSON_combined_coco_gen.json"
+coco_full_gt = COCO(str(full_gt_annotations_file))
+
+# compute diagonal percentiles for full gt
+gt_bboxes_diagonals = [
+    np.sqrt(
+        annot["bbox"][2] ** 2 + annot["bbox"][3] ** 2
+    )  # bbox is xywh in COCO
+    for annot in coco_full_gt.dataset["annotations"]
+]
+gt_diagonal_percentiles = np.percentile(
+    gt_bboxes_diagonals, np.arange(0, 105, 5)
+)
+
+
+bin_labels = [
+    f"{gt_diagonal_percentiles[i]:.0f}-{gt_diagonal_percentiles[i + 1]:.0f}"
+    for i in range(gt_diagonal_percentiles.shape[0] - 1)
+]
+
+print(bin_labels)
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Set default device: CUDA if available, otherwise mps, otherwise CPU
 device = torch.device(
@@ -56,6 +99,8 @@
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Retrieve model config and CLI args from mlflow
 
+trained_model_path = str(models_dict["above_5th_percentile_seed_42"])
+
 mlflow_params = read_mlflow_params(trained_model_path)
 config = read_config_from_mlflow_params(mlflow_params)
 cli_args = read_cli_args_from_mlflow_params(mlflow_params)
@@ -76,6 +121,8 @@
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Create COCO dataset
 annotations_filename = Path(cli_args["annotation_files"][0]).name
+print(annotations_filename)
+
 inference_transforms = transforms.Compose(
     [
         transforms.ToImage(),
@@ -86,7 +133,7 @@
 
 dataset_coco = create_coco_dataset(
     images_dir=Path(dataset_dir) / "frames",
-    annotations_file=Path(dataset_dir) / "annotations" / annotations_filename,
+    annotations_file=annotations_dir / annotations_filename,
     composed_transform=inference_transforms,
 )
 
@@ -117,9 +164,9 @@
 )
 
 print(f"Seed: {seed_n}")
-print(f"Number of training samples: {len(train_dataset)}")
-print(f"Number of validation samples: {len(val_dataset)}")
-print(f"Number of test samples: {len(test_dataset)}")
+print(f"Number of training samples: {len(train_dataset)}")  # images
+print(f"Number of validation samples: {len(val_dataset)}")  # images
+print(f"Number of test samples: {len(test_dataset)}")  # images
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Create dataloader
@@ -179,14 +226,14 @@
         pred_bboxes, gt_bboxes, iou_threshold
     )
 
-    # Calculate bboxes areas
+    # Calculate bboxes diagonals
     pred_bboxes_width = pred_bboxes[:, 2] - pred_bboxes[:, 0]
     pred_bboxes_height = pred_bboxes[:, 3] - pred_bboxes[:, 1]
-    pred_areas = pred_bboxes_width * pred_bboxes_height
+    pred_diagonals = np.sqrt(pred_bboxes_width**2 + pred_bboxes_height**2)
 
     gt_bboxes_width = gt_bboxes[:, 2] - gt_bboxes[:, 0]
     gt_bboxes_height = gt_bboxes[:, 3] - gt_bboxes[:, 1]
-    gt_areas = gt_bboxes_width * gt_bboxes_height
+    gt_diagonals = np.sqrt(gt_bboxes_width**2 + gt_bboxes_height**2)
 
     # Create prediction subtable
     pred_data = {
@@ -198,7 +245,7 @@
         "confidence": pred_dict["bbox_confidences"],
         "TP": tp,
         "FP": fp,
-        "bbox_area": pred_areas,
+        "bbox_diagonal": pred_diagonals,
     }
     list_pred_subtables.append(pd.DataFrame(pred_data))
 
@@ -210,7 +257,7 @@
         "image_ID": annotations["image_id"],
         "val_batch_idx": val_idx,
         "missed_detection": md,
-        "bbox_area": gt_areas,
+        "bbox_diagonal": gt_diagonals,
     }
     list_gt_subtables.append(pd.DataFrame(gt_data))
 
@@ -251,33 +298,26 @@
 print(f"Average precision: {precision_recall_df['precision'].mean()}")
 print(f"Average recall: {precision_recall_df['recall'].mean()}")
 
+
+# all annotations:
 # Average precision: 0.9456786718983294
 # Average recall: 0.8494677009613534
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Discretize annotations based on area
+# Discretize annotations based on diagonal
 
-gt_area_percentiles = np.percentile(
-    gt_annotations_df["bbox_area"], np.arange(0, 105, 5)
-)
 
-bin_labels = [
-    f"{gt_area_percentiles[i]:.0f}-{gt_area_percentiles[i + 1]:.0f}"
-    for i in range(gt_area_percentiles.shape[0] - 1)
-]
-
-
-predictions_df["area_bins"] = pd.cut(
-    predictions_df["bbox_area"],
-    bins=gt_area_percentiles,  # same bins for predictions and gt
+predictions_df["diagonal_bins"] = pd.cut(
+    predictions_df["bbox_diagonal"],
+    bins=gt_diagonal_percentiles,  # same bins for predictions and gt
     labels=bin_labels,
     include_lowest=True,
     right=False,
 )
 
-gt_annotations_df["area_bins"] = pd.cut(
-    gt_annotations_df["bbox_area"],
-    bins=gt_area_percentiles,  # same bins for predictions and gt
+gt_annotations_df["diagonal_bins"] = pd.cut(
+    gt_annotations_df["bbox_diagonal"],
+    bins=gt_diagonal_percentiles,  # same bins for predictions and gt
     labels=bin_labels,
     include_lowest=True,
     right=False,
@@ -285,17 +325,19 @@
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Count detections in each area bin
+# Count detections in each diagonal bin
 # Is GT really that balanced??
-predictions_per_area_bin = (
-    predictions_df["area_bins"].value_counts().sort_index()
+predictions_per_diagonal_bin = (
+    predictions_df["diagonal_bins"].value_counts().sort_index()
+)
+gt_per_diagonal_bin = (
+    gt_annotations_df["diagonal_bins"].value_counts().sort_index()
 )
-gt_per_area_bin = gt_annotations_df["area_bins"].value_counts().sort_index()
 
 comparison_df = pd.DataFrame(
     {
-        "Predictions": predictions_per_area_bin,
-        "Ground Truth": gt_per_area_bin,
+        "Predictions": predictions_per_diagonal_bin,
+        "Ground Truth": gt_per_diagonal_bin,
     }
 )
 
@@ -307,8 +349,8 @@
     color=["skyblue", "lightcoral"],
     stacked=False,
 )
-plt.title("Detection Counts by Area Bins Validation Set")
-plt.xlabel("Area Range (pixels^2)")
+plt.title("Detection Counts by Diagonal Bins Validation Set")
+plt.xlabel("Diagonal (pixels)")
 plt.ylabel("Number of Detections")
 plt.xticks(rotation=45)
 plt.grid(True, alpha=0.3)
@@ -319,8 +361,10 @@
 
 true_positives_counts = pd.DataFrame(
     {
-        "Predictions": predictions_per_area_bin,
-        "True Positives": predictions_df.loc[predictions_df["TP"], "area_bins"]
+        "Predictions": predictions_per_diagonal_bin,
+        "True Positives": predictions_df.loc[
+            predictions_df["TP"], "diagonal_bins"
+        ]
         .value_counts()
         .sort_index(),
     }
@@ -340,10 +384,11 @@
     color=["skyblue", "blue"],
     stacked=False,
 )
-ax.set_title("Counts per Area Bin Validation Set")
-ax.set_xlabel("Bbox area (pixels^2)")
+ax.set_title("Counts per Diagonal Bin Validation Set")
+ax.set_xlabel("Diagonal (pixels)")
 ax.set_ylabel("Number of Detections")
 ax.tick_params(axis="x", rotation=45)
+ax.set_ylim(0.0, 325)
 ax.grid(True, alpha=0.3)
 
 
@@ -359,7 +404,7 @@
 )
 ax2.set_ylabel("Precision", color="red")
 ax2.tick_params(axis="y", labelcolor="red")
-ax2.set_ylim(0.4, 1.01)  # Precision is between 0 and 1
+ax2.set_ylim(0.0, 1.00)  # Precision is between 0 and 1
 
 plt.tight_layout()
 plt.show()
@@ -369,9 +414,9 @@
 
 missed_detections_counts = pd.DataFrame(
     {
-        "Ground Truth": gt_per_area_bin,
+        "Ground Truth": gt_per_diagonal_bin,
         "Matched Ground Truth": gt_annotations_df.loc[
-            ~gt_annotations_df["missed_detection"], "area_bins"
+            ~gt_annotations_df["missed_detection"], "diagonal_bins"
         ]
         .value_counts()
         .sort_index(),
@@ -392,10 +437,11 @@
     color=["lightcoral", "green"],
     stacked=False,
 )
-ax.set_title("Counts per Area Bin Validation Set")
-ax.set_xlabel("Area Range (pixels^2)")
+ax.set_title("Counts per Diagonal Bin Validation Set")
+ax.set_xlabel("Diagonal (pixels)")
 ax.set_ylabel("Number of Detections")
 ax.tick_params(axis="x", rotation=45)
+ax.set_ylim(0.0, 325)
 ax.grid(True, alpha=0.3)
 
 
@@ -410,7 +456,7 @@
 )
 ax2.tick_params(axis="y", labelcolor="blue")
 ax2.set_ylabel("Recall", color="blue")
-ax2.set_ylim(0.4, 1.01)  # Recall is between 0 and 1
+ax2.set_ylim(0.0, 1.00)  # Recall is between 0 and 1
 
 plt.tight_layout()
 plt.show()
@@ -445,3 +491,5 @@
 plt.grid(True, alpha=0.3)
 plt.tight_layout()
 plt.show()
+
+# %%

From 8a46f658aa8b46410edbd8aff89bb780300d6bb6 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 3 Jul 2025 21:30:42 +0100
Subject: [PATCH 17/72] Add calibration plot

---
 .../notebook_run_detection_on_dataset.py      | 125 +++++++++++++-----
 1 file changed, 93 insertions(+), 32 deletions(-)

diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_dataset.py
index 264b751a..714dbb79 100644
--- a/notebooks/notebook_run_detection_on_dataset.py
+++ b/notebooks/notebook_run_detection_on_dataset.py
@@ -84,6 +84,40 @@
 
 print(bin_labels)
 
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+def split_dataset_crab_repo(dataset_coco, seed_n):
+    """Split a dataset into (train, val, test) subsets like in crabs repo."""
+    # Same seed for both generators; fractions come from the global `config`
+    rng_train_split = torch.Generator().manual_seed(seed_n)
+    rng_val_split = torch.Generator().manual_seed(seed_n)
+
+    # Split train and test-val sets
+    train_dataset, test_val_dataset = random_split(
+        dataset_coco,
+        [config["train_fraction"], 1 - config["train_fraction"]],
+        generator=rng_train_split,
+    )
+
+    # Split test/val sets from the remainder
+    test_dataset, val_dataset = random_split(
+        test_val_dataset,
+        [
+            1 - config["val_over_test_fraction"],
+            config["val_over_test_fraction"],
+        ],
+        generator=rng_val_split,
+    )
+
+    print(f"Seed: {seed_n}")
+    print(f"Number of training samples: {len(train_dataset)}")  # images
+    print(f"Number of validation samples: {len(val_dataset)}")  # images
+    print(f"Number of test samples: {len(test_dataset)}")  # images
+
+    return train_dataset, val_dataset, test_dataset
+
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Set default device: CUDA if available, otherwise mps, otherwise CPU
 device = torch.device(
@@ -99,7 +133,7 @@
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Retrieve model config and CLI args from mlflow
 
-trained_model_path = str(models_dict["above_5th_percentile_seed_42"])
+trained_model_path = str(models_dict["above_0th_percentile_seed_42"])
 
 mlflow_params = read_mlflow_params(trained_model_path)
 config = read_config_from_mlflow_params(mlflow_params)
@@ -138,35 +172,12 @@
 )
 
 
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Split dataset like in crabs repo
-
-# Split data into train and test-val sets
-seed_n = cli_args["seed_n"]
-rng_train_split = torch.Generator().manual_seed(seed_n)
-rng_val_split = torch.Generator().manual_seed(seed_n)
-
-# Split train and test-val sets
-train_dataset, test_val_dataset = random_split(
+train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
     dataset_coco,
-    [config["train_fraction"], 1 - config["train_fraction"]],
-    generator=rng_train_split,
-)
-
-# Split test/val sets from the remainder
-test_dataset, val_dataset = random_split(
-    test_val_dataset,
-    [
-        1 - config["val_over_test_fraction"],
-        config["val_over_test_fraction"],
-    ],
-    generator=rng_val_split,
+    seed_n=cli_args["seed_n"],
 )
 
-print(f"Seed: {seed_n}")
-print(f"Number of training samples: {len(train_dataset)}")  # images
-print(f"Number of validation samples: {len(val_dataset)}")  # images
-print(f"Number of test samples: {len(test_dataset)}")  # images
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Create dataloader
@@ -185,6 +196,8 @@
     device=device,
 )
 
+# save as movement bbox dataset / netcdf?
+
 # reshape
 detections_per_validation_sample = {}
 for val_idx in range(len(val_dataset)):
@@ -211,12 +224,10 @@
 
 
 # Loop over validation set
-for val_idx, (image, annotations) in enumerate(val_dataset):
+for val_idx, (_img, annotations) in enumerate(val_dataset):
     # Get predictions for this image
     pred_dict = detections_per_validation_sample[val_idx]
-    pred_bboxes = np.column_stack(
-        [pred_dict["bbox_xyxy"], pred_dict["bbox_confidences"]]
-    )
+    pred_bboxes = pred_dict["bbox_xyxy"]
 
     # Get ground truth
     gt_bboxes = annotations["boxes"].cpu().numpy()
@@ -268,7 +279,6 @@
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Check average precision and recall on validation set
 
-
 precision_recall_df = pd.DataFrame(
     {
         "TP": predictions_df.groupby("image_ID")["TP"].sum(),
@@ -303,6 +313,57 @@
 # Average precision: 0.9456786718983294
 # Average recall: 0.8494677009613534
 
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Discretize predictions based on confidence
+bin_edges = np.arange(0, 1.01, 0.05)
+predictions_df["confidence_bins"] = pd.cut(
+    predictions_df["confidence"],
+    bins=bin_edges,
+)
+
+precision_per_confidence_bin = predictions_df.groupby(
+    "confidence_bins", observed=False
+)["TP"].sum()
+total_detections_per_confidence_bin = (
+    predictions_df["confidence_bins"].value_counts().sort_index()
+)
+
+calibration_df = pd.DataFrame(
+    {
+        "precision": precision_per_confidence_bin
+        / total_detections_per_confidence_bin,
+        "total_detections": total_detections_per_confidence_bin,
+        "TP": precision_per_confidence_bin,
+    }
+)
+
+# Plot as bar chart
+fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+calibration_df["precision"].plot(
+    kind="bar",
+    figsize=(12, 6),
+    color=["skyblue"],
+    ax=ax,
+)
+
+ax.plot(
+    np.arange(len(calibration_df)),  # bin indices
+    (bin_edges[:-1] + bin_edges[1:]) / 2,  # perfect calibration
+    color="red",
+    linewidth=2,
+    marker="o",
+    label="Perfect calibration",
+)
+
+ax.set_title(f"Calibration curve (n={precision_per_confidence_bin.sum()})")
+ax.set_xlabel("confidence")
+ax.set_ylabel("Precision")
+ax.tick_params(axis="x", rotation=45)
+ax.grid(True, alpha=0.3)
+
+plt.tight_layout()
+plt.show()
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Discretize annotations based on diagonal
 
@@ -311,7 +372,7 @@
     predictions_df["bbox_diagonal"],
     bins=gt_diagonal_percentiles,  # same bins for predictions and gt
     labels=bin_labels,
-    include_lowest=True,
+    include_lowest=True,  # --- change
     right=False,
 )
 

From c2a7282e9efa05a7bafda68389520f61759671fe Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 14 Jul 2025 10:27:55 +0100
Subject: [PATCH 18/72] Rename detection notebook to notebook_bin_detections.py

---
 ...ction_on_dataset.py => notebook_bin_detections.py} | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)
 rename notebooks/{notebook_run_detection_on_dataset.py => notebook_bin_detections.py} (98%)

diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_bin_detections.py
similarity index 98%
rename from notebooks/notebook_run_detection_on_dataset.py
rename to notebooks/notebook_bin_detections.py
index 714dbb79..1465b6d0 100644
--- a/notebooks/notebook_run_detection_on_dataset.py
+++ b/notebooks/notebook_bin_detections.py
@@ -34,13 +34,12 @@
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Input data - in domain
 dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
-
+annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
 
 experiment_ID = "617393114420881798"
 ml_runs_experiment_dir = (
     Path("/home/sminano/swc/project_crabs/ml-runs") / experiment_ID
 )
-annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
 
 
 # percentile is of bbox diagonal!
@@ -61,22 +60,26 @@
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Compute bins using full GT annotations
+# We bin the size of the bbox diagonal
 
 full_gt_annotations_file = annotations_dir / "VIA_JSON_combined_coco_gen.json"
 coco_full_gt = COCO(str(full_gt_annotations_file))
 
-# compute diagonal percentiles for full gt
+# compute diagonals for each gt annotation
 gt_bboxes_diagonals = [
     np.sqrt(
         annot["bbox"][2] ** 2 + annot["bbox"][3] ** 2
     )  # bbox is xywh in COCO
     for annot in coco_full_gt.dataset["annotations"]
 ]
+
+# compute percentiles of diagonals
 gt_diagonal_percentiles = np.percentile(
     gt_bboxes_diagonals, np.arange(0, 105, 5)
 )
 
-
+# define labels for bins
 bin_labels = [
     f"{gt_diagonal_percentiles[i]:.0f}-{gt_diagonal_percentiles[i + 1]:.0f}"
     for i in range(gt_diagonal_percentiles.shape[0] - 1)

From b9b5a99e5f08015eb7fcf7ac26d38a9abb67b138 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 14 Jul 2025 13:36:32 +0100
Subject: [PATCH 19/72] Return detections from pytorch dataset as xarray ds

---
 ethology/detectors/inference.py | 152 +++++++++++++++++++++++++++++++-
 pyproject.toml                  |   1 +
 2 files changed, 149 insertions(+), 4 deletions(-)

diff --git a/ethology/detectors/inference.py b/ethology/detectors/inference.py
index d32417fb..6ea34337 100644
--- a/ethology/detectors/inference.py
+++ b/ethology/detectors/inference.py
@@ -2,6 +2,116 @@
 
 import numpy as np
 import torch
+import xarray as xr
+
+
+def _pad_sequence_along_detections_dim(
+    array: np.ndarray, max_n_detections_per_image: int
+) -> tuple:
+    """Return sequence for padding input array along detections dimension."""
+    pad_detections_per_image = max_n_detections_per_image - array.shape[0]
+    return tuple(
+        (0, pad_detections_per_image) if i == 0 else (0, 0)
+        for i in range(array.ndim)
+    )
+
+
+def _detections_per_sample_as_ds(
+    detections_per_sample: dict,
+) -> xr.Dataset:
+    """Reshape detections per sample as xarray dataset."""
+    # Get coordinates
+    list_image_id_coords = list(detections_per_sample.keys())
+    list_space_coords = ["x", "y"]
+    max_n_detections_per_image = max(
+        [
+            detections["boxes"].shape[0]
+            for detections in detections_per_sample.values()
+        ]
+    )
+
+    list_id_coords = list(range(max_n_detections_per_image))  # per frame
+    coords_dict = {
+        "image_id": list_image_id_coords,
+        "space": list_space_coords,
+        "id": list_id_coords,  # per frame
+    }
+    coords_dict_no_space = coords_dict.copy()
+    del coords_dict_no_space["space"]
+
+    # Get lists of data arrays
+    list_centroid_arrays = [
+        (
+            detections["boxes"].cpu().numpy()[:, 0:2]
+            + detections["boxes"].cpu().numpy()[:, 2:4]
+        )
+        * 0.5
+        for detections in detections_per_sample.values()
+    ]
+
+    list_shape_arrays = [
+        detections["boxes"].cpu().numpy()[:, 2:4]
+        - detections["boxes"].cpu().numpy()[:, 0:2]
+        for detections in detections_per_sample.values()
+    ]
+
+    list_confidence_arrays = [
+        detections["scores"].cpu().numpy()  # .reshape(-1, 1)
+        for detections in detections_per_sample.values()
+    ]
+
+    list_label_arrays = [
+        detections["labels"].cpu().numpy()  # .reshape(-1, 1)
+        for detections in detections_per_sample.values()
+    ]
+
+    # Define arrays to create
+    arrays_dict = {
+        "centroids": {
+            "data": list_centroid_arrays,
+            "coords": coords_dict,
+            "pad_value": np.nan,
+        },
+        "shape": {
+            "data": list_shape_arrays,
+            "coords": coords_dict,
+            "pad_value": np.nan,
+        },
+        "confidence": {
+            "data": list_confidence_arrays,
+            "coords": coords_dict_no_space,
+            "pad_value": np.nan,
+        },
+        "label": {
+            "data": list_label_arrays,
+            "coords": coords_dict_no_space,
+            "pad_value": -1,
+        },
+    }
+
+    # Create all DataArrays in a loop
+    data_arrays = {}
+    for name in arrays_dict:
+        data_arrays[name] = xr.DataArray(
+            data=np.stack(
+                [
+                    np.pad(
+                        array,
+                        _pad_sequence_along_detections_dim(
+                            array, max_n_detections_per_image
+                        ),
+                        mode="constant",
+                        constant_values=arrays_dict[name]["pad_value"],
+                    ).T
+                    for array in arrays_dict[name]["data"]
+                ],
+                axis=0,  # need to pad with nans for constant shape
+            ),
+            dims=list(arrays_dict[name]["coords"].keys()),
+            coords=arrays_dict[name]["coords"],
+        )
+
+    return xr.Dataset(data_vars=data_arrays)
 
 
 def run_detector_on_dataset(
@@ -23,7 +133,7 @@ def run_detector_on_dataset(
 
     # Run detection
     detections_per_sample = {}
-    for idx, (image, annotations) in enumerate(dataset):
+    for idx, (image, _annotations) in enumerate(dataset):
         # Place image tensor on device and add batch dimension
         image = image.to(device)[None]  # [1, C, H, W]
 
@@ -34,7 +144,17 @@ def run_detector_on_dataset(
         # Add to dict
         detections_per_sample[idx] = detections
 
-    return detections_per_sample
+    # Format as xarray dataset
+    detections_dataset = _detections_per_sample_as_ds(detections_per_sample)
+
+    return detections_dataset
+
+
+def _detections_per_batch_as_ds(
+    detections_per_batch: dict,
+) -> xr.Dataset:
+    """Reshape detections per batch as xarray dataset."""
+    pass
 
 
 def run_detector_on_dataloader(
@@ -44,7 +164,7 @@ def run_detector_on_dataloader(
 ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Run detection on a dataloader.
 
-    The output is a list of dictionary with the detections per batch.
+    The output is a dictionary with the detections per batch as a list.
     The detections dictionary has the following keys:
     - "boxes": tensor of shape [N, 4]
     - "scores": tensor of shape [N]
@@ -57,7 +177,7 @@ def run_detector_on_dataloader(
     detections_per_batch = {}
     for batch_idx, (image_batch, _annotations_batch) in enumerate(dataloader):
         # Place batch of images on device
-        image_batch = image_batch.to(device)  # [B, C, H, W]
+        image_batch = [img.to(device) for img in image_batch]  # [B, C, H, W]
 
         # Run detection
         with torch.no_grad():
@@ -71,6 +191,30 @@ def run_detector_on_dataloader(
     return detections_per_batch
 
 
+def collate_fn_varying_n_bboxes(batch: tuple) -> tuple:
+    """Collate function for dataloader with varying number of bounding boxes.
+
+    A custom function is needed for detection
+    because the number of bounding boxes varies
+    between images of the same batch.
+    See https://pytorch.org/vision/main/auto_examples/transforms/plot_transforms_e2e.html#data-loading-and-training-loop
+
+    Parameters
+    ----------
+    batch : tuple
+        a tuple of 2 tuples, the first one holding all images in the batch,
+        and the second one holding the corresponding annotations.
+
+    Returns
+    -------
+    tuple
+        a tuple of length = batch size, made up of (image, annotations)
+        tuples.
+
+    """
+    return tuple(zip(*batch, strict=False))
+
+
 # def run_detector_on_image(
 #     model: torch.nn.Module,
 #     image: torch.Tensor,
diff --git a/pyproject.toml b/pyproject.toml
index 80577d4b..7616a0ac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ classifiers = [
 dependencies = [
   "movement",
   "mlflow-skinny",
+  "netCDF4",
 ]
 
 [project.urls]

From c9b8effcee519eaffe82831e4182dcb58358431a Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 14 Jul 2025 13:37:25 +0100
Subject: [PATCH 20/72] Save detections as xarray datasets

---
 .../notebook_run_detection_on_dataset.py      | 217 ++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 notebooks/notebook_run_detection_on_dataset.py

diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_dataset.py
new file mode 100644
index 00000000..cc8e0619
--- /dev/null
+++ b/notebooks/notebook_run_detection_on_dataset.py
@@ -0,0 +1,217 @@
+"""Run detection on a Pytorch dataset and export results as a movement dataset.
+
+A script to run detection only (no tracking) on a Pytorch dataset and
+export the results in a format that can be loaded in movement napari widget.
+"""
+
+# %%
+import json
+from datetime import datetime
+from pathlib import Path
+
+import torch
+import torchvision.transforms.v2 as transforms
+import xarray as xr
+from torch.utils.data import random_split
+
+from ethology.datasets.create import create_coco_dataset
+from ethology.detectors.inference import run_detector_on_dataset
+from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
+from ethology.mlflow import (
+    read_cli_args_from_mlflow_params,
+    read_config_from_mlflow_params,
+    read_mlflow_params,
+)
+
+# Set xarray options
+xr.set_options(display_expand_attrs=False)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Input data - in domain
+dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
+annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
+
+experiment_ID = "617393114420881798"
+ml_runs_experiment_dir = (
+    Path("/home/sminano/swc/project_crabs/ml-runs") / experiment_ID
+)
+
+# percentile is of bbox diagonal!
+models_dict = {
+    "above_0th": (
+        ml_runs_experiment_dir
+        / "f348d9d196934073bece1b877cbc4d38"
+        / "checkpoints"
+        / "last.ckpt"
+    ),
+    "above_5th": (
+        ml_runs_experiment_dir
+        / "e72e53b23df142ae859dd590798b4162"
+        / "checkpoints"
+        / "last.ckpt"
+    ),
+}
+
+output_dir = Path(
+    "/home/sminano/swc/project_ethology/remove_small_bboxes_inD_output"
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Set default device: CUDA if available, otherwise mps, otherwise CPU
+device = torch.device(
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+
+print(f"Using device: {device}")
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Helper functions
+
+
+def split_dataset_crab_repo(dataset_coco, seed_n, config):
+    """Split dataset like in crabs repo."""
+    # Split data into train and test-val sets
+    rng_train_split = torch.Generator().manual_seed(seed_n)
+    rng_val_split = torch.Generator().manual_seed(seed_n)
+
+    # Split train and test-val sets
+    train_dataset, test_val_dataset = random_split(
+        dataset_coco,
+        [config["train_fraction"], 1 - config["train_fraction"]],
+        generator=rng_train_split,
+    )
+
+    # Split test/val sets from the remainder
+    test_dataset, val_dataset = random_split(
+        test_val_dataset,
+        [
+            1 - config["val_over_test_fraction"],
+            config["val_over_test_fraction"],
+        ],
+        generator=rng_val_split,
+    )
+
+    print(f"Seed: {seed_n}")
+    print(f"Number of training samples: {len(train_dataset)}")  # images
+    print(f"Number of validation samples: {len(val_dataset)}")  # images
+    print(f"Number of test samples: {len(test_dataset)}")  # images
+
+    return train_dataset, val_dataset, test_dataset
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Compute detections for each model
+
+for model_key in models_dict:
+    # Retrieve model config and CLI args from mlflow
+    trained_model_path = str(models_dict["above_0th"])
+
+    mlflow_params = read_mlflow_params(trained_model_path)
+    config = read_config_from_mlflow_params(mlflow_params)
+    cli_args = read_cli_args_from_mlflow_params(mlflow_params)
+
+    # ------------------------------------
+    # Load model
+    model = load_fasterrcnn_resnet50_fpn_v2(
+        trained_model_path,
+        num_classes=config["num_classes"],
+        device=device,
+    )
+    model.eval()
+
+    # ------------------------------------
+    # Create COCO dataset
+    annotations_filename = Path(cli_args["annotation_files"][0]).name
+    print(annotations_filename)
+
+    # Define transforms for inference
+    inference_transforms = transforms.Compose(
+        [
+            transforms.ToImage(),
+            transforms.ToDtype(torch.float32, scale=True),
+        ]
+    )
+
+    # Create COCO dataset and split
+    dataset_coco = create_coco_dataset(
+        images_dir=Path(dataset_dir) / "frames",
+        annotations_file=annotations_dir / annotations_filename,
+        composed_transform=inference_transforms,
+    )
+
+    # Split dataset like in crabs repo
+    train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
+        dataset_coco,
+        seed_n=cli_args["seed_n"],
+        config=config,
+    )
+
+    # ------------------------------------
+
+    # Run detection on validation dataset
+    detections_ds = run_detector_on_dataset(
+        model=model,
+        dataset=val_dataset,
+        device=device,
+    )
+
+    # ------------------------------------
+    # Add attributes
+    detections_ds.attrs["model_name"] = "fasterrcnn_resnet50_fpn_v2"
+    detections_ds.attrs["model_path"] = trained_model_path
+    detections_ds.attrs["config"] = json.dumps(config, indent=2)
+    detections_ds.attrs["cli_args"] = json.dumps(cli_args, indent=2)
+    detections_ds.attrs["dataset_dir"] = str(dataset_dir)
+    detections_ds.attrs["annotations_file"] = str(
+        annotations_dir / annotations_filename
+    )
+    detections_ds.attrs["seed_n"] = cli_args["seed_n"]
+    detections_ds.attrs["coco_crabs_dataset_split"] = "val"
+
+    # ------------------------------------
+    # Save dataset
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    detections_ds.to_netcdf(
+        output_dir
+        / f"{model_key}_detections_val_set_seed_{cli_args['seed_n']}_{timestamp}.nc"
+    )
+
+
+# %%
+# # reshape
+# detections_per_validation_sample = {}
+# for val_idx in range(len(val_dataset)):
+#     detections_dict = detections_dict_per_sample[val_idx]
+#     bboxes_xyxy = detections_dict["boxes"].cpu().numpy()
+
+#     detections_per_validation_sample[val_idx] = {
+#         "bbox_xyxy": bboxes_xyxy,
+#         "bbox_centroids": (bboxes_xyxy[:, 0:2] + bboxes_xyxy[:, 2:4]) / 2,
+#         "bbox_shapes": bboxes_xyxy[:, 2:4] - bboxes_xyxy[:, 0:2],
+#         "bbox_confidences": detections_dict["scores"].cpu().numpy(),
+#         "bbox_labels": detections_dict["labels"].cpu().numpy(),
+#     }
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%
+# %%time
+# Use dataloader to run detection on validation set
+# val_dataloader = torch.utils.data.DataLoader(
+#     val_dataset,
+#     batch_size=8,
+#     shuffle=False,
+#     collate_fn=collate_fn_varying_n_bboxes,
+# )
+
+# # Run detection on dataloader
+# detections_dict_per_batch = run_detector_on_dataloader(
+#     model=model,
+#     dataloader=val_dataloader,
+#     device=device,
+# )

From 6af309dede068923ea9570b692f46d008ca6eefd Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 17 Jul 2025 17:21:18 +0100
Subject: [PATCH 21/72] Key detections dict by image_id, rather than index in
 input dataset

---
 ethology/detectors/inference.py               | 32 ++++++------
 .../notebook_run_detection_on_dataset.py      | 51 +++++++++++--------
 2 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/ethology/detectors/inference.py b/ethology/detectors/inference.py
index 6ea34337..463989a2 100644
--- a/ethology/detectors/inference.py
+++ b/ethology/detectors/inference.py
@@ -16,17 +16,17 @@ def _pad_sequence_along_detections_dim(
     )
 
 
-def _detections_per_sample_as_ds(
-    detections_per_sample: dict,
+def _detections_per_image_id_as_ds(
+    detections_per_image_id: dict,
 ) -> xr.Dataset:
     """Reshape detections per sample as xarray dataset."""
     # Get coordinates
-    list_image_id_coords = list(detections_per_sample.keys())
+    list_image_id_coords = list(detections_per_image_id.keys())
     list_space_coords = ["x", "y"]
     max_n_detections_per_image = max(
         [
             detections["boxes"].shape[0]
-            for detections in detections_per_sample.values()
+            for detections in detections_per_image_id.values()
         ]
     )
 
@@ -46,28 +46,28 @@ def _detections_per_sample_as_ds(
             + detections["boxes"].cpu().numpy()[:, 2:4]
         )
         * 0.5
-        for detections in detections_per_sample.values()
+        for detections in detections_per_image_id.values()
     ]
 
     list_shape_arrays = [
         detections["boxes"].cpu().numpy()[:, 2:4]
         - detections["boxes"].cpu().numpy()[:, 0:2]
-        for detections in detections_per_sample.values()
+        for detections in detections_per_image_id.values()
     ]
 
     list_confidence_arrays = [
         detections["scores"].cpu().numpy()  # .reshape(-1, 1)
-        for detections in detections_per_sample.values()
+        for detections in detections_per_image_id.values()
     ]
 
     list_label_arrays = [
         detections["labels"].cpu().numpy()  # .reshape(-1, 1)
-        for detections in detections_per_sample.values()
+        for detections in detections_per_image_id.values()
     ]
 
     # Define arrays to create
     arrays_dict = {
-        "centroids": {
+        "centroids": {  # --> change to position
             "data": list_centroid_arrays,
             "coords": coords_dict,
             "pad_value": np.nan,
@@ -122,7 +122,7 @@ def run_detector_on_dataset(
     """Run detection on each sample of a dataset.
 
     Note that the dataset transforms are applied to the sampled images.
-    The output is a dictionary with the detections per sample as a dictionary.
+    The output is a dictionary with the detections per image_id as a dictionary.
     The detections dictionary has the following keys:
     - "boxes": tensor of shape [N, 4]
     - "scores": tensor of shape [N]
@@ -132,8 +132,8 @@ def run_detector_on_dataset(
     model.eval()
 
     # Run detection
-    detections_per_sample = {}
-    for idx, (image, _annotations) in enumerate(dataset):
+    detections_per_image_id = {}
+    for image, annotations in dataset:
         # Place image tensor on device and add batch dimension
         image = image.to(device)[None]  # [1, C, H, W]
 
@@ -141,11 +141,13 @@ def run_detector_on_dataset(
         with torch.no_grad():
             detections = model(image)[0]  # select single batch dimension
 
-        # Add to dict
-        detections_per_sample[idx] = detections
+        # Add to dict with key = image_id
+        detections_per_image_id[annotations["image_id"]] = detections
 
     # Format as xarray dataset
-    detections_dataset = _detections_per_sample_as_ds(detections_per_sample)
+    detections_dataset = _detections_per_image_id_as_ds(
+        detections_per_image_id
+    )
 
     return detections_dataset
 
diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_dataset.py
index cc8e0619..8e6adc5f 100644
--- a/notebooks/notebook_run_detection_on_dataset.py
+++ b/notebooks/notebook_run_detection_on_dataset.py
@@ -37,25 +37,21 @@
     Path("/home/sminano/swc/project_crabs/ml-runs") / experiment_ID
 )
 
-# percentile is of bbox diagonal!
+# I pick seed 42 for each set of models
 models_dict = {
-    "above_0th": (
-        ml_runs_experiment_dir
-        / "f348d9d196934073bece1b877cbc4d38"
-        / "checkpoints"
-        / "last.ckpt"
-    ),
-    "above_5th": (
-        ml_runs_experiment_dir
-        / "e72e53b23df142ae859dd590798b4162"
-        / "checkpoints"
-        / "last.ckpt"
-    ),
+    "above_0th": ml_runs_experiment_dir / "f348d9d196934073bece1b877cbc4d38",
+    "above_1st": ml_runs_experiment_dir / "879d2f77e2b24adcb06b87d2fede6a04",
+    "above_5th": ml_runs_experiment_dir / "75583ec227e3444ab692b99c64795325",
+    "above_10th": ml_runs_experiment_dir / "4acc37206b1e4f679d535c837bee2c2f",
+    "above_25th": ml_runs_experiment_dir / "fdcf88fcbcc84fbeb94b45ca6b6f8914",
+    "above_50th": ml_runs_experiment_dir / "daa05ded0ea047388c9134bf044061c5",
 }
 
 output_dir = Path(
     "/home/sminano/swc/project_ethology/remove_small_bboxes_inD_output"
 )
+# create output dir if it doesn't exist
+output_dir.mkdir(parents=True, exist_ok=True)
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Set default device: CUDA if available, otherwise mps, otherwise CPU
@@ -108,9 +104,13 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Compute detections for each model
 
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
 for model_key in models_dict:
     # Retrieve model config and CLI args from mlflow
-    trained_model_path = str(models_dict["above_0th"])
+    trained_model_path = str(
+        models_dict[model_key] / "checkpoints" / "last.ckpt"
+    )
 
     mlflow_params = read_mlflow_params(trained_model_path)
     config = read_config_from_mlflow_params(mlflow_params)
@@ -126,10 +126,6 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
     model.eval()
 
     # ------------------------------------
-    # Create COCO dataset
-    annotations_filename = Path(cli_args["annotation_files"][0]).name
-    print(annotations_filename)
-
     # Define transforms for inference
     inference_transforms = transforms.Compose(
         [
@@ -138,7 +134,11 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
         ]
     )
 
-    # Create COCO dataset and split
+    # Create COCO dataset
+    annotations_filename = Path(cli_args["annotation_files"][0]).name
+    print(annotations_filename)
+    print(f"Seed: {cli_args['seed_n']}")
+
     dataset_coco = create_coco_dataset(
         images_dir=Path(dataset_dir) / "frames",
         annotations_file=annotations_dir / annotations_filename,
@@ -162,7 +162,7 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
     )
 
     # ------------------------------------
-    # Add attributes
+    # Add attributes to detections dataset
     detections_ds.attrs["model_name"] = "fasterrcnn_resnet50_fpn_v2"
     detections_ds.attrs["model_path"] = trained_model_path
     detections_ds.attrs["config"] = json.dumps(config, indent=2)
@@ -175,13 +175,20 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
     detections_ds.attrs["coco_crabs_dataset_split"] = "val"
 
     # ------------------------------------
-    # Save dataset
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    # Save detections dataset and evaluation dataset
+    # Save detections dataset
     detections_ds.to_netcdf(
         output_dir
         / f"{model_key}_detections_val_set_seed_{cli_args['seed_n']}_{timestamp}.nc"
     )
 
+    # # Save evaluation dataset with pickle
+    # with open(
+    #     output_dir
+    #     / f"{model_key}_evaluation_val_set_seed_{cli_args['seed_n']}_{timestamp}.pkl",
+    #     "wb",
+    # ) as f:
+    #     pickle.dump(val_dataset, f)
 
 # %%
 # # reshape

From 66dd2cf287c516af2c97457fb4da174d6d79ed8b Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 17 Jul 2025 17:23:03 +0100
Subject: [PATCH 22/72] Remove bin-detections notebook; add evaluation and
 mlflow plot notebooks

---
 notebooks/notebook_bin_detections.py          | 559 ----------------
 .../notebook_evaluate_binned_performance.py   | 620 ++++++++++++++++++
 notebooks/notebook_mlflow_plots.py            | 214 ++++++
 3 files changed, 834 insertions(+), 559 deletions(-)
 delete mode 100644 notebooks/notebook_bin_detections.py
 create mode 100644 notebooks/notebook_evaluate_binned_performance.py
 create mode 100644 notebooks/notebook_mlflow_plots.py

diff --git a/notebooks/notebook_bin_detections.py b/notebooks/notebook_bin_detections.py
deleted file mode 100644
index 1465b6d0..00000000
--- a/notebooks/notebook_bin_detections.py
+++ /dev/null
@@ -1,559 +0,0 @@
-"""Run detection on a Pytorch dataset and export results as a movement dataset.
-
-A script to run detection only (no tracking) on a Pytorch dataset and
-export the results in a format that can be loaded in movement napari widget.
-"""
-
-# %%
-from pathlib import Path
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import torch
-import torchvision.transforms.v2 as transforms
-import xarray as xr
-from pycocotools.coco import COCO
-from torch.utils.data import random_split
-
-from ethology.datasets.create import create_coco_dataset
-from ethology.detectors.evaluate import evaluate_detections_hungarian
-from ethology.detectors.inference import run_detector_on_dataset
-from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
-from ethology.mlflow import (
-    read_cli_args_from_mlflow_params,
-    read_config_from_mlflow_params,
-    read_mlflow_params,
-)
-
-# Set xarray options
-xr.set_options(display_expand_attrs=False)
-
-# %matplotlib widget
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Input data - in domain
-dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
-annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
-
-experiment_ID = "617393114420881798"
-ml_runs_experiment_dir = (
-    Path("/home/sminano/swc/project_crabs/ml-runs") / experiment_ID
-)
-
-
-# percentile is of bbox diagonal!
-models_dict = {
-    "above_0th_percentile_seed_42": (
-        ml_runs_experiment_dir
-        / "f348d9d196934073bece1b877cbc4d38"
-        / "checkpoints"
-        / "last.ckpt"
-    ),
-    "above_5th_percentile_seed_42": (
-        ml_runs_experiment_dir
-        / "e72e53b23df142ae859dd590798b4162"
-        / "checkpoints"
-        / "last.ckpt"
-    ),
-}
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Compute bins using full GT annotations
-# We bin the size of the bbox diagonal
-
-full_gt_annotations_file = annotations_dir / "VIA_JSON_combined_coco_gen.json"
-coco_full_gt = COCO(str(full_gt_annotations_file))
-
-# compute diagonals for each gt annotation
-gt_bboxes_diagonals = [
-    np.sqrt(
-        annot["bbox"][2] ** 2 + annot["bbox"][3] ** 2
-    )  # bbox is xywh in COCO
-    for annot in coco_full_gt.dataset["annotations"]
-]
-
-# compute percentiles of diagonals
-gt_diagonal_percentiles = np.percentile(
-    gt_bboxes_diagonals, np.arange(0, 105, 5)
-)
-
-# define labels for bins
-bin_labels = [
-    f"{gt_diagonal_percentiles[i]:.0f}-{gt_diagonal_percentiles[i + 1]:.0f}"
-    for i in range(gt_diagonal_percentiles.shape[0] - 1)
-]
-
-print(bin_labels)
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-def split_dataset_crab_repo(dataset_coco, seed_n):
-    """Split dataset like in crabs repo."""
-    # Split data into train and test-val sets
-    rng_train_split = torch.Generator().manual_seed(seed_n)
-    rng_val_split = torch.Generator().manual_seed(seed_n)
-
-    # Split train and test-val sets
-    train_dataset, test_val_dataset = random_split(
-        dataset_coco,
-        [config["train_fraction"], 1 - config["train_fraction"]],
-        generator=rng_train_split,
-    )
-
-    # Split test/val sets from the remainder
-    test_dataset, val_dataset = random_split(
-        test_val_dataset,
-        [
-            1 - config["val_over_test_fraction"],
-            config["val_over_test_fraction"],
-        ],
-        generator=rng_val_split,
-    )
-
-    print(f"Seed: {seed_n}")
-    print(f"Number of training samples: {len(train_dataset)}")  # images
-    print(f"Number of validation samples: {len(val_dataset)}")  # images
-    print(f"Number of test samples: {len(test_dataset)}")  # images
-
-    return train_dataset, val_dataset, test_dataset
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Set default device: CUDA if available, otherwise mps, otherwise CPU
-device = torch.device(
-    "cuda"
-    if torch.cuda.is_available()
-    else "mps"
-    if torch.backends.mps.is_available()
-    else "cpu"
-)
-
-print(f"Using device: {device}")
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Retrieve model config and CLI args from mlflow
-
-trained_model_path = str(models_dict["above_0th_percentile_seed_42"])
-
-mlflow_params = read_mlflow_params(trained_model_path)
-config = read_config_from_mlflow_params(mlflow_params)
-cli_args = read_cli_args_from_mlflow_params(mlflow_params)
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Load model
-
-model = load_fasterrcnn_resnet50_fpn_v2(
-    trained_model_path,
-    num_classes=config["num_classes"],
-    device=device,
-)
-
-# Set to evaluation mode
-model.eval()
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Create COCO dataset
-annotations_filename = Path(cli_args["annotation_files"][0]).name
-print(annotations_filename)
-
-inference_transforms = transforms.Compose(
-    [
-        transforms.ToImage(),
-        transforms.ToDtype(torch.float32, scale=True),
-    ]
-)
-
-
-dataset_coco = create_coco_dataset(
-    images_dir=Path(dataset_dir) / "frames",
-    annotations_file=annotations_dir / annotations_filename,
-    composed_transform=inference_transforms,
-)
-
-
-# Split dataset like in crabs repo
-train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
-    dataset_coco,
-    seed_n=cli_args["seed_n"],
-)
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Create dataloader
-
-# dataloader = torch.utils.data.DataLoader(
-#     val_dataset,
-#     batch_size=1,
-#     shuffle=True,
-# )
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Run detection on validation set
-detections_dict_per_sample = run_detector_on_dataset(
-    model=model,
-    dataset=val_dataset,
-    device=device,
-)
-
-# save as movement bbox dataset / netcdf?
-
-# reshape
-detections_per_validation_sample = {}
-for val_idx in range(len(val_dataset)):
-    detections_dict = detections_dict_per_sample[val_idx]
-    bboxes_xyxy = detections_dict["boxes"].cpu().numpy()
-
-    detections_per_validation_sample[val_idx] = {
-        "bbox_xyxy": bboxes_xyxy,
-        "bbox_centroids": (bboxes_xyxy[:, 0:2] + bboxes_xyxy[:, 2:4]) / 2,
-        "bbox_shapes": bboxes_xyxy[:, 2:4] - bboxes_xyxy[:, 0:2],
-        "bbox_confidences": detections_dict["scores"].cpu().numpy(),
-        "bbox_labels": detections_dict["labels"].cpu().numpy(),
-    }
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Evaluate detections using Hungarian algorithm and create dataframes
-
-iou_threshold = 0.1
-
-# Collect all data efficiently
-list_pred_subtables = []
-list_gt_subtables = []
-
-
-# Loop over validation set
-for val_idx, (_img, annotations) in enumerate(val_dataset):
-    # Get predictions for this image
-    pred_dict = detections_per_validation_sample[val_idx]
-    pred_bboxes = pred_dict["bbox_xyxy"]
-
-    # Get ground truth
-    gt_bboxes = annotations["boxes"].cpu().numpy()
-
-    # Evaluate detections
-    tp, fp, md = evaluate_detections_hungarian(
-        pred_bboxes, gt_bboxes, iou_threshold
-    )
-
-    # Calculate bboxes diagonals
-    pred_bboxes_width = pred_bboxes[:, 2] - pred_bboxes[:, 0]
-    pred_bboxes_height = pred_bboxes[:, 3] - pred_bboxes[:, 1]
-    pred_diagonals = np.sqrt(pred_bboxes_width**2 + pred_bboxes_height**2)
-
-    gt_bboxes_width = gt_bboxes[:, 2] - gt_bboxes[:, 0]
-    gt_bboxes_height = gt_bboxes[:, 3] - gt_bboxes[:, 1]
-    gt_diagonals = np.sqrt(gt_bboxes_width**2 + gt_bboxes_height**2)
-
-    # Create prediction subtable
-    pred_data = {
-        "prediction_ID": [
-            f"pred_{val_idx}_{i}" for i in range(len(pred_bboxes))
-        ],
-        "image_ID": annotations["image_id"],
-        "val_batch_idx": val_idx,
-        "confidence": pred_dict["bbox_confidences"],
-        "TP": tp,
-        "FP": fp,
-        "bbox_diagonal": pred_diagonals,
-    }
-    list_pred_subtables.append(pd.DataFrame(pred_data))
-
-    # Create ground truth subtable
-    gt_data = {
-        "gt_annotation_ID": [
-            f"gt_{val_idx}_{i}" for i in range(len(gt_bboxes))
-        ],
-        "image_ID": annotations["image_id"],
-        "val_batch_idx": val_idx,
-        "missed_detection": md,
-        "bbox_diagonal": gt_diagonals,
-    }
-    list_gt_subtables.append(pd.DataFrame(gt_data))
-
-# Concatenate all dataframes
-predictions_df = pd.concat(list_pred_subtables, ignore_index=True)
-gt_annotations_df = pd.concat(list_gt_subtables, ignore_index=True)
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Check average precision and recall on validation set
-
-precision_recall_df = pd.DataFrame(
-    {
-        "TP": predictions_df.groupby("image_ID")["TP"].sum(),
-        "FP": predictions_df.groupby("image_ID")["FP"].sum(),
-        "MD": gt_annotations_df.groupby("image_ID")["missed_detection"].sum(),
-        "GT": gt_annotations_df.groupby("image_ID").count()[
-            "gt_annotation_ID"
-        ],
-        "val_batch_idx": predictions_df.groupby("image_ID")[
-            "val_batch_idx"
-        ].first(),
-    }
-)
-
-# sort by val_batch_idx
-precision_recall_df = precision_recall_df.sort_values(by="val_batch_idx")
-precision_recall_df = precision_recall_df.reset_index()
-
-precision_recall_df["precision"] = precision_recall_df["TP"] / (
-    precision_recall_df["TP"] + precision_recall_df["FP"]
-)
-precision_recall_df["recall"] = (
-    precision_recall_df["TP"] / precision_recall_df["GT"]
-)
-
-print(precision_recall_df)
-print(f"Average precision: {precision_recall_df['precision'].mean()}")
-print(f"Average recall: {precision_recall_df['recall'].mean()}")
-
-
-# all annotations:
-# Average precision: 0.9456786718983294
-# Average recall: 0.8494677009613534
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Discretize predictions based on confidence
-bin_edges = np.arange(0, 1.01, 0.05)
-predictions_df["confidence_bins"] = pd.cut(
-    predictions_df["confidence"],
-    bins=bin_edges,
-)
-
-precision_per_confidence_bin = predictions_df.groupby(
-    "confidence_bins", observed=False
-)["TP"].sum()
-total_detections_per_confidence_bin = (
-    predictions_df["confidence_bins"].value_counts().sort_index()
-)
-
-calibration_df = pd.DataFrame(
-    {
-        "precision": precision_per_confidence_bin
-        / total_detections_per_confidence_bin,
-        "total_detections": total_detections_per_confidence_bin,
-        "TP": precision_per_confidence_bin,
-    }
-)
-
-# Plot as bar chart
-fig, ax = plt.subplots(1, 1, figsize=(10, 6))
-calibration_df["precision"].plot(
-    kind="bar",
-    figsize=(12, 6),
-    color=["skyblue"],
-    ax=ax,
-)
-
-ax.plot(
-    np.arange(len(calibration_df)),  # bin indices
-    (bin_edges[:-1] + bin_edges[1:]) / 2,  # perfect calibration
-    color="red",
-    linewidth=2,
-    marker="o",
-    label="Perfect calibration",
-)
-
-ax.set_title(f"Calibration curve (n={precision_per_confidence_bin.sum()})")
-ax.set_xlabel("confidence")
-ax.set_ylabel("Precision")
-ax.tick_params(axis="x", rotation=45)
-ax.grid(True, alpha=0.3)
-
-plt.tight_layout()
-plt.show()
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Discretize annotations based on diagonal
-
-
-predictions_df["diagonal_bins"] = pd.cut(
-    predictions_df["bbox_diagonal"],
-    bins=gt_diagonal_percentiles,  # same bins for predictions and gt
-    labels=bin_labels,
-    include_lowest=True,  # --- change
-    right=False,
-)
-
-gt_annotations_df["diagonal_bins"] = pd.cut(
-    gt_annotations_df["bbox_diagonal"],
-    bins=gt_diagonal_percentiles,  # same bins for predictions and gt
-    labels=bin_labels,
-    include_lowest=True,
-    right=False,
-)
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Count detections in each diagonal bin
-# Is GT really that balanced??
-predictions_per_diagonal_bin = (
-    predictions_df["diagonal_bins"].value_counts().sort_index()
-)
-gt_per_diagonal_bin = (
-    gt_annotations_df["diagonal_bins"].value_counts().sort_index()
-)
-
-comparison_df = pd.DataFrame(
-    {
-        "Predictions": predictions_per_diagonal_bin,
-        "Ground Truth": gt_per_diagonal_bin,
-    }
-)
-
-# Plot as bar chart
-plt.figure(figsize=(10, 6))
-comparison_df.plot(
-    kind="bar",
-    figsize=(12, 6),
-    color=["skyblue", "lightcoral"],
-    stacked=False,
-)
-plt.title("Detection Counts by Diagonal Bins Validation Set")
-plt.xlabel("Diagonal (pixels)")
-plt.ylabel("Number of Detections")
-plt.xticks(rotation=45)
-plt.grid(True, alpha=0.3)
-plt.tight_layout()
-plt.show()
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Count true positives per bin
-
-true_positives_counts = pd.DataFrame(
-    {
-        "Predictions": predictions_per_diagonal_bin,
-        "True Positives": predictions_df.loc[
-            predictions_df["TP"], "diagonal_bins"
-        ]
-        .value_counts()
-        .sort_index(),
-    }
-)
-
-true_positives_counts["precision"] = (
-    true_positives_counts["True Positives"]
-    / true_positives_counts["Predictions"]
-)
-
-# Plot as bar chart
-fig, ax = plt.subplots(1, 1, figsize=(10, 6))
-true_positives_counts.loc[:, ["Predictions", "True Positives"]].plot(
-    kind="bar",
-    ax=ax,
-    figsize=(12, 6),
-    color=["skyblue", "blue"],
-    stacked=False,
-)
-ax.set_title("Counts per Diagonal Bin Validation Set")
-ax.set_xlabel("Diagonal (pixels)")
-ax.set_ylabel("Number of Detections")
-ax.tick_params(axis="x", rotation=45)
-ax.set_ylim(0.0, 325)
-ax.grid(True, alpha=0.3)
-
-
-# add line plot for precision on right y-axis
-ax2 = ax.twinx()
-ax2.plot(
-    range(len(true_positives_counts)),
-    true_positives_counts["precision"],
-    color="red",
-    marker="o",
-    label="Precision",
-    linewidth=2,
-)
-ax2.set_ylabel("Precision", color="red")
-ax2.tick_params(axis="y", labelcolor="red")
-ax2.set_ylim(0.0, 1.00)  # Precision is between 0 and 1
-
-plt.tight_layout()
-plt.show()
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Count missed detections per bin
-
-missed_detections_counts = pd.DataFrame(
-    {
-        "Ground Truth": gt_per_diagonal_bin,
-        "Matched Ground Truth": gt_annotations_df.loc[
-            ~gt_annotations_df["missed_detection"], "diagonal_bins"
-        ]
-        .value_counts()
-        .sort_index(),
-    }
-)
-
-missed_detections_counts["recall"] = (
-    missed_detections_counts["Matched Ground Truth"]
-    / missed_detections_counts["Ground Truth"]
-)
-
-# Plot as bar chart
-fig, ax = plt.subplots(1, 1, figsize=(10, 6))
-missed_detections_counts.loc[:, ["Ground Truth", "Matched Ground Truth"]].plot(
-    kind="bar",
-    ax=ax,
-    figsize=(12, 6),
-    color=["lightcoral", "green"],
-    stacked=False,
-)
-ax.set_title("Counts per Diagonal Bin Validation Set")
-ax.set_xlabel("Diagonal (pixels)")
-ax.set_ylabel("Number of Detections")
-ax.tick_params(axis="x", rotation=45)
-ax.set_ylim(0.0, 325)
-ax.grid(True, alpha=0.3)
-
-
-# add line plot for recall on right y-axis
-ax2 = ax.twinx()
-ax2.plot(
-    range(len(missed_detections_counts)),
-    missed_detections_counts["recall"],
-    color="blue",
-    marker="o",
-    linewidth=2,
-)
-ax2.tick_params(axis="y", labelcolor="blue")
-ax2.set_ylabel("Recall", color="blue")
-ax2.set_ylim(0.0, 1.00)  # Recall is between 0 and 1
-
-plt.tight_layout()
-plt.show()
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Image id histogram
-
-detections_per_image_id = pd.DataFrame(
-    {
-        "Predictions": predictions_df.groupby("image_ID").count()[
-            "prediction_ID"
-        ],
-        "Ground Truth": gt_annotations_df.groupby("image_ID").count()[
-            "gt_annotation_ID"
-        ],
-        "True Positives": predictions_df.groupby("image_ID")["TP"].sum(),
-    }
-)
-
-# Plot as bar chart
-plt.figure(figsize=(10, 6))
-detections_per_image_id.plot(
-    kind="bar",
-    figsize=(12, 6),
-    color=["skyblue", "lightcoral", "green"],
-    stacked=False,
-)
-plt.title("Detections per Image ID")
-plt.xlabel("Image ID")
-plt.ylabel("Number of Detections")
-plt.xticks(rotation=45)
-plt.grid(True, alpha=0.3)
-plt.tight_layout()
-plt.show()
-
-# %%
diff --git a/notebooks/notebook_evaluate_binned_performance.py b/notebooks/notebook_evaluate_binned_performance.py
new file mode 100644
index 00000000..9f914e29
--- /dev/null
+++ b/notebooks/notebook_evaluate_binned_performance.py
@@ -0,0 +1,620 @@
+"""Evaluate saved detections per bounding-box-diagonal bin.
+
+A script to load detections exported as netCDF files for several models and
+plot precision and recall on the validation set, binned by bbox diagonal.
+"""
+
+# %%
+import pickle
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import torch
+import torchvision.transforms.v2 as transforms
+import xarray as xr
+from pycocotools.coco import COCO
+from torch.utils.data import random_split
+
+from ethology.datasets.create import create_coco_dataset
+from ethology.detectors.evaluate import evaluate_detections_hungarian
+from ethology.mlflow import (
+    read_cli_args_from_mlflow_params,
+    read_config_from_mlflow_params,
+    read_mlflow_params,
+)
+from torchvision.utils import draw_bounding_boxes
+
+# Set xarray options
+xr.set_options(display_expand_attrs=False)
+
+# %matplotlib widget
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Input data - in domain
+dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
+annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
+
+
+list_models = [
+    "above_0th",
+    "above_1st",
+    "above_5th",
+    "above_10th",
+    "above_25th",
+    "above_50th",
+]
+timestamp_ref = "20250717_115247"
+predictions_dir = Path(
+    "/home/sminano/swc/project_ethology/remove_small_bboxes_inD_output"
+)
+
+flag_use_full_gt = True
+full_gt_annotations_file = (
+    annotations_dir / "VIA_JSON_combined_coco_gen_sorted_imageIDs.json"
+)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Helper functions
+
+
+def split_dataset_crab_repo(dataset_coco, seed_n, config):
+    """Split dataset into train, val and test subsets like in crabs repo."""
+    # Create one generator per split, both seeded with seed_n
+    rng_train_split = torch.Generator().manual_seed(seed_n)
+    rng_val_split = torch.Generator().manual_seed(seed_n)
+
+    # Split train and test-val sets using config["train_fraction"]
+    train_dataset, test_val_dataset = random_split(
+        dataset_coco,
+        [config["train_fraction"], 1 - config["train_fraction"]],
+        generator=rng_train_split,
+    )
+
+    # Split test/val sets from the remainder (test first, then val)
+    test_dataset, val_dataset = random_split(
+        test_val_dataset,
+        [
+            1 - config["val_over_test_fraction"],
+            config["val_over_test_fraction"],
+        ],
+        generator=rng_val_split,
+    )
+
+    print(f"Seed: {seed_n}")
+    print(f"Number of training samples: {len(train_dataset)}")  # images
+    print(f"Number of validation samples: {len(val_dataset)}")  # images
+    print(f"Number of test samples: {len(test_dataset)}")  # images
+
+    return train_dataset, val_dataset, test_dataset
+
+
+def compute_pred_gt_tables(iou_threshold, ds_predictions, val_dataset):
+    """Build prediction and ground-truth tables for the validation set."""
+    list_pred_subtables = []
+    list_gt_subtables = []
+    # Loop over validation set
+    for val_idx, (_img, annotations) in enumerate(val_dataset):
+        # Get image_id from validation set
+        image_id = annotations["image_id"]
+
+        # Get predictions by position (assumes same order as val_dataset)
+        centroids = ds_predictions.centroids.isel(image_id=val_idx).T.values
+        shape = ds_predictions.shape.isel(image_id=val_idx).T.values
+        confidence = ds_predictions.confidence.isel(image_id=val_idx).T.values
+        slc_non_nan = ~np.isnan(centroids).any(axis=1)  # rows without NaN
+
+        # format predictions as xyxy (centroid -/+ half the shape)
+        pred_bboxes = np.concatenate(
+            [
+                centroids[slc_non_nan] - (shape[slc_non_nan] / 2),
+                centroids[slc_non_nan] + (shape[slc_non_nan] / 2),
+            ],
+            axis=1,
+        )
+
+        # Get ground truth from input dataset
+        gt_bboxes = annotations["boxes"].cpu().numpy()
+
+        # Evaluate detections: flags per prediction (tp, fp) and per GT (md)
+        tp, fp, md = evaluate_detections_hungarian(
+            pred_bboxes, gt_bboxes, iou_threshold
+        )
+
+        # Calculate bboxes diagonals (both sets of boxes treated as xyxy)
+        pred_bboxes_width = pred_bboxes[:, 2] - pred_bboxes[:, 0]
+        pred_bboxes_height = pred_bboxes[:, 3] - pred_bboxes[:, 1]
+        pred_diagonals = np.sqrt(pred_bboxes_width**2 + pred_bboxes_height**2)
+
+        gt_bboxes_width = gt_bboxes[:, 2] - gt_bboxes[:, 0]
+        gt_bboxes_height = gt_bboxes[:, 3] - gt_bboxes[:, 1]
+        gt_diagonals = np.sqrt(gt_bboxes_width**2 + gt_bboxes_height**2)
+
+        # Create prediction subtable (one row per non-NaN prediction)
+        pred_data = {
+            "prediction_ID": [
+                f"pred_{val_idx}_{i}" for i in range(len(pred_bboxes))
+            ],
+            "image_ID": image_id,
+            "val_batch_idx": val_idx,
+            "confidence": confidence[slc_non_nan],
+            "TP": tp,
+            "FP": fp,
+            "bbox_diagonal": pred_diagonals,
+        }
+        list_pred_subtables.append(pd.DataFrame(pred_data))
+
+        # Create ground truth subtable (one row per GT annotation)
+        gt_data = {
+            "gt_annotation_ID": [
+                f"gt_{val_idx}_{i}" for i in range(len(gt_bboxes))
+            ],
+            "image_ID": image_id,
+            "val_batch_idx": val_idx,
+            "missed_detection": md,
+            "bbox_diagonal": gt_diagonals,
+        }
+        list_gt_subtables.append(pd.DataFrame(gt_data))
+
+    # Concatenate all dataframes
+    predictions_df = pd.concat(list_pred_subtables, ignore_index=True)
+    gt_annotations_df = pd.concat(list_gt_subtables, ignore_index=True)
+
+    return predictions_df, gt_annotations_df
+
+
+def compute_average_precision_recall_per_image_id(
+    predictions_df, gt_annotations_df
+):
+    """Compute precision and recall per image and print their means."""
+    precision_recall_df = pd.DataFrame(
+        {
+            "TP": predictions_df.groupby("image_ID")["TP"].sum(),
+            "FP": predictions_df.groupby("image_ID")["FP"].sum(),
+            "MD": gt_annotations_df.groupby("image_ID")[
+                "missed_detection"
+            ].sum(),
+            "GT": gt_annotations_df.groupby("image_ID").count()[
+                "gt_annotation_ID"
+            ],
+            "val_batch_idx": predictions_df.groupby("image_ID")[
+                "val_batch_idx"
+            ].first(),
+        }
+    )
+
+    # sort by val_batch_idx
+    precision_recall_df = precision_recall_df.sort_values(by="val_batch_idx")
+    precision_recall_df = precision_recall_df.reset_index()
+
+    precision_recall_df["precision"] = precision_recall_df["TP"] / (
+        precision_recall_df["TP"] + precision_recall_df["FP"]
+    )
+    precision_recall_df["recall"] = (
+        precision_recall_df["TP"] / precision_recall_df["GT"]
+    )
+    # NOTE: images with no predictions give NaN here; mean() skips them
+    print(f"Average precision: {precision_recall_df['precision'].mean()}")
+    print(f"Average recall: {precision_recall_df['recall'].mean()}")
+
+    return precision_recall_df
+
+
+def discretize_based_on_bbox_diagonal(
+    predictions_df,
+    gt_annotations_df,
+    gt_diagonal_percentile_values,
+    bin_labels,
+):
+    """Add a "diagonal_bins" column to both dataframes, in place.
+
+    The "bbox_diagonal" column of each dataframe is cut with the same bin
+    edges (gt_diagonal_percentile_values) and labels (bin_labels). Bins are
+    right-closed and the first bin also includes its left edge, so that a
+    diagonal equal to the lowest edge is binned rather than set to NaN.
+    Both (modified) dataframes are returned.
+    """
+    for df in (predictions_df, gt_annotations_df):
+        df["diagonal_bins"] = pd.cut(
+            df["bbox_diagonal"],
+            bins=gt_diagonal_percentile_values,
+            labels=bin_labels,
+            include_lowest=True,
+            right=True,
+        )
+    return predictions_df, gt_annotations_df
+
+
+def plot_true_positives_per_bin(predictions_df, predictions_per_diagonal_bin, model_key):
+    """Plot prediction and true-positive counts, and precision, per bin."""
+    true_positives_counts = pd.DataFrame(
+        {
+            "Predictions": predictions_per_diagonal_bin,
+            "True Positives": predictions_df.loc[
+                predictions_df["TP"], "diagonal_bins"
+            ]
+            .value_counts()
+            .sort_index(),
+        }
+    )
+
+    true_positives_counts["precision"] = (
+        true_positives_counts["True Positives"]
+        / true_positives_counts["Predictions"]
+    )
+
+    # Plot as bar chart
+    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+    true_positives_counts.loc[:, ["Predictions", "True Positives"]].plot(
+        kind="bar",
+        ax=ax,
+        figsize=(12, 6),
+        color=["skyblue", "blue"],
+        stacked=False,
+    )
+    ax.set_title(f"model trained on annotations {model_key} percentile")
+    ax.set_xlabel("diagonal (pixels)")
+    ax.set_ylabel("count")
+    ax.tick_params(axis="x", rotation=45)
+    ax.set_ylim(0.0, 425)
+    ax.grid(True, alpha=0.3)
+
+
+    # add line plot for precision on right y-axis
+    ax2 = ax.twinx()
+    ax2.plot(
+        range(len(true_positives_counts)),
+        true_positives_counts["precision"],
+        color="red",
+        marker="o",
+        label="precision",
+        linewidth=2,
+    )
+    ax2.set_ylabel("precision", color="red")
+    ax2.tick_params(axis="y", labelcolor="red")
+    ax2.set_ylim(0.0, 1.00)  # Precision is between 0 and 1
+
+    # add reference line at 0.96 (hard-coded value; TODO document its origin)
+    ax2.axhline(y=0.96, color="red", linestyle="--", linewidth=1)
+
+    # put legend on left
+    ax.legend(loc="upper left")
+    plt.tight_layout()
+    plt.show()
+
+
+def plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model_key):
+    """Plot GT and matched-GT counts, and recall, per diagonal bin."""
+    missed_detections_counts = pd.DataFrame(
+        {
+            "Ground Truth": gt_per_diagonal_bin,
+            "Matched Ground Truth": gt_annotations_df.loc[
+                ~gt_annotations_df["missed_detection"], "diagonal_bins"
+            ]
+            .value_counts()
+            .sort_index(),
+        }
+    )
+
+    missed_detections_counts["recall"] = (
+        missed_detections_counts["Matched Ground Truth"]
+        / missed_detections_counts["Ground Truth"]
+    )
+
+    # Plot as bar chart
+    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+    missed_detections_counts.loc[:, ["Ground Truth", "Matched Ground Truth"]].plot(
+        kind="bar",
+        ax=ax,
+        figsize=(12, 6),
+        color=["lightcoral", "green"],
+        stacked=False,
+    )
+    ax.set_title(f"model trained on annotations {model_key} percentile")
+    ax.set_xlabel("Diagonal (pixels)")
+    ax.set_ylabel("Number of Detections")
+    ax.tick_params(axis="x", rotation=45)
+    ax.set_ylim(0.0, 400)
+    ax.grid(True, alpha=0.3)
+
+
+    # add line plot for recall on right y-axis
+    ax2 = ax.twinx()
+    ax2.plot(
+        range(len(missed_detections_counts)),
+        missed_detections_counts["recall"],
+        color="blue",
+        marker="o",
+        linewidth=2,
+    )
+    ax2.tick_params(axis="y", labelcolor="blue")
+    ax2.set_ylabel("Recall", color="blue")
+    ax2.set_ylim(0.0, 1.00)  # Recall is between 0 and 1
+
+
+    # add reference line at 0.85 (hard-coded value; TODO document its origin)
+    ax2.axhline(y=0.85, color="blue", linestyle="--", linewidth=1)
+
+    plt.tight_layout()
+    plt.show()
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Compute bins using full GT annotations
+# We bin the size of the bbox diagonal
+
+coco_full_gt = COCO(str(full_gt_annotations_file))
+
+# compute diagonals for each gt annotation
+gt_bboxes_diagonals = np.array(
+    [
+        np.sqrt(
+            annot["bbox"][2] ** 2 + annot["bbox"][3] ** 2
+        )  # bbox is xywh in COCO
+        for annot in coco_full_gt.dataset["annotations"]
+    ]
+)
+
+# compute percentiles of diagonals
+gt_diagonal_percentiles = np.arange(0, 105, 5)
+gt_diagonal_percentile_values = np.percentile(
+    gt_bboxes_diagonals, gt_diagonal_percentiles
+)
+
+# define labels for bins
+bin_labels = [
+    f"{gt_diagonal_percentile_values[i]:.0f}-{gt_diagonal_percentile_values[i + 1]:.0f}"
+    for i in range(gt_diagonal_percentile_values.shape[0] - 1)
+]
+
+print(gt_diagonal_percentiles)
+print(gt_diagonal_percentile_values)
+print(bin_labels)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Plot histogram of gt bboxes diagonals with bins
+
+fig, ax1 = plt.subplots(figsize=(10, 6))
+
+# histogram
+ax1.hist(gt_bboxes_diagonals, bins=100, color="skyblue")
+
+# add a vertical line at each bin's lower edge, labelled with its percentile
+for i, bin_label in enumerate(bin_labels):
+    ax1.axvline(x=gt_diagonal_percentile_values[i], color="red", linestyle="-")
+    ax1.text(
+        gt_diagonal_percentile_values[i],
+        2125,
+        gt_diagonal_percentiles[i],
+        color="red",
+        ha="left",
+        va="bottom",
+        rotation=45,
+        fontsize=6.5,
+    )
+
+# ax1.set_title("GT bboxes diagonals")
+ax1.set_xlabel("diagonal (pixels)")
+ax1.set_ylabel("count")
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Evaluate all models
+
+
+inference_transforms = transforms.Compose(
+    [
+        transforms.ToImage(),
+        transforms.ToDtype(torch.float32, scale=True),
+    ]
+)
+
+default_dataset_coco = create_coco_dataset(
+    images_dir=Path(dataset_dir) / "frames",
+    annotations_file=full_gt_annotations_file,  # annotations_dir / annotations_filename,
+    composed_transform=inference_transforms,
+)
+
+iou_threshold = 0.1
+
+for model_key in list_models:
+    # ---------------------------------------
+    # Load predictions
+    ds_predictions = xr.open_dataset(
+        predictions_dir
+        / f"{model_key}_detections_val_set_seed_42_{timestamp_ref}.nc"
+    )
+
+    # ---------------------------------------
+    # Define GT annotations
+
+    trained_model_path = ds_predictions.attrs["model_path"]
+    mlflow_params = read_mlflow_params(trained_model_path)
+    config = read_config_from_mlflow_params(mlflow_params)
+    cli_args = read_cli_args_from_mlflow_params(mlflow_params)
+
+
+    # Create COCO dataset
+    # Fix for model trained on all annotations
+    # (VIA_JSON_combined_coco_gen has different image IDs than the rest)
+    if (
+        Path(cli_args["annotation_files"][0]).name
+        == "VIA_JSON_combined_coco_gen.json"
+    ):
+        dataset_coco = create_coco_dataset(
+            images_dir=Path(dataset_dir) / "frames",
+            annotations_file=annotations_dir / "VIA_JSON_combined_coco_gen.json",
+            composed_transform=inference_transforms,
+        )
+    else:
+        dataset_coco = default_dataset_coco
+
+
+    # Split dataset like in crabs repo
+    train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
+        dataset_coco,
+        seed_n=cli_args["seed_n"],
+        config=config,
+    )
+
+    # ---------------------------------------
+    # Evaluate detections using Hungarian algorithm and create dataframes
+
+    predictions_df, gt_annotations_df = compute_pred_gt_tables(
+        iou_threshold, ds_predictions, val_dataset
+    )
+
+    # ---------------------------------------
+    # Check average precision and recall on validation set
+    precision_recall_df = compute_average_precision_recall_per_image_id(
+        predictions_df, gt_annotations_df
+    )
+
+    # all annotations:
+    # Average precision: 0.9456786718983294
+    # Average recall: 0.8494677009613534
+
+    # ---------------------------------------
+    # Discretize annotations based on bbox diagonal
+    predictions_df, gt_annotations_df = discretize_based_on_bbox_diagonal(
+        predictions_df,
+        gt_annotations_df,
+        gt_diagonal_percentile_values,
+        bin_labels,
+    )
+
+
+    #---------------------------------------
+    # Plot number of predicted and GT boxes per diagonal bin in validation set
+    predictions_per_diagonal_bin = (
+        predictions_df["diagonal_bins"].value_counts().sort_index()
+    )
+    gt_per_diagonal_bin = (
+        gt_annotations_df["diagonal_bins"].value_counts().sort_index()
+    )
+
+    comparison_df = pd.DataFrame(
+        {
+            "Predictions": predictions_per_diagonal_bin,
+            "Ground Truth": gt_per_diagonal_bin,
+        }
+    )
+
+    # Plot as bar chart
+    plt.figure(figsize=(10, 6))
+    comparison_df.plot(
+        kind="bar",
+        figsize=(12, 6),
+        color=["skyblue", "lightcoral"],
+        stacked=False,
+    )
+    plt.ylim(0.0, 400)
+    plt.title(f"model trained on annotations {model_key} percentile")
+    plt.xlabel("diagonal (pixels)")
+    plt.ylabel("count")
+    plt.xticks(rotation=45)
+    plt.grid(True, alpha=0.3)
+    plt.tight_layout()
+    plt.show()
+
+    #---------------------------------------
+    # Plot true positives per bin
+    plot_true_positives_per_bin(predictions_df, predictions_per_diagonal_bin, model_key)
+    
+
+    #---------------------------------------
+    # Plot missed detections per bin
+    plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model_key)
+
+
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Discretize predictions based on confidence (last model evaluated above)
+bin_edges = np.arange(0, 1.01, 0.05)
+predictions_df["confidence_bins"] = pd.cut(
+    predictions_df["confidence"],
+    bins=bin_edges,
+)
+
+precision_per_confidence_bin = predictions_df.groupby(
+    "confidence_bins", observed=False
+)["TP"].sum()
+total_detections_per_confidence_bin = (
+    predictions_df["confidence_bins"].value_counts().sort_index()
+)
+
+calibration_df = pd.DataFrame(
+    {
+        "precision": precision_per_confidence_bin
+        / total_detections_per_confidence_bin,
+        "total_detections": total_detections_per_confidence_bin,
+        "TP": precision_per_confidence_bin,
+    }
+)
+
+# Plot as bar chart
+fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+calibration_df["precision"].plot(
+    kind="bar",
+    figsize=(12, 6),
+    color=["skyblue"],
+    ax=ax,
+)
+
+ax.plot(
+    np.arange(len(calibration_df)),  # bin indices
+    (bin_edges[:-1] + bin_edges[1:]) / 2,  # perfect calibration
+    color="red",
+    linewidth=2,
+    marker="o",
+    label="Perfect calibration",
+)
+
+ax.set_title(
+    f"{model_key} - calibration curve (n={precision_per_confidence_bin.sum()})"
+)
+ax.set_xlabel("confidence")
+ax.set_ylabel("Precision")
+ax.tick_params(axis="x", rotation=45)
+ax.grid(True, alpha=0.3)
+
+plt.tight_layout()
+plt.show()
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Image id histogram
+
+detections_per_image_id = pd.DataFrame(
+    {
+        "Predictions": predictions_df.groupby("image_ID").count()[
+            "prediction_ID"
+        ],
+        "Ground Truth": gt_annotations_df.groupby("image_ID").count()[
+            "gt_annotation_ID"
+        ],
+        "True Positives": predictions_df.groupby("image_ID")["TP"].sum(),
+    }
+)
+
+# Plot as bar chart
+plt.figure(figsize=(10, 6))
+detections_per_image_id.plot(
+    kind="bar",
+    figsize=(12, 6),
+    color=["skyblue", "lightcoral", "green"],
+    stacked=False,
+)
+plt.title("Detections per Image ID")
+plt.xlabel("Image ID")
+plt.ylabel("Number of Detections")
+plt.xticks(rotation=45)
+plt.grid(True, alpha=0.3)
+plt.tight_layout()
+plt.show()
+
+# %%
+# %matplotlib widget
+# %%
diff --git a/notebooks/notebook_mlflow_plots.py b/notebooks/notebook_mlflow_plots.py
new file mode 100644
index 00000000..9a2ba9d7
--- /dev/null
+++ b/notebooks/notebook_mlflow_plots.py
@@ -0,0 +1,214 @@
+"""Plot bbox-size percentiles and precision/recall per trained model.
+
+A script to compute percentiles of the ground-truth bbox diagonals and to
+plot precision and recall per model from a CSV file of evaluation results.
+"""
+
+# %%
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import xarray as xr
+from pycocotools.coco import COCO
+
+# Set xarray options
+xr.set_options(display_expand_attrs=False)
+
+# %matplotlib widget
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Input data - in domain
+dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
+annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
+
+
+map_run_slurm_977884_jobID_to_percentile = {
+    "0": "0",
+    "1": "0",
+    "2": "0",
+    "3": "1",
+    "4": "1",
+    "5": "1",
+    "6": "5",
+    "7": "5",
+    "8": "5",
+    "9": "10",
+    "10": "10",
+    "11": "10",
+    "12": "25",
+    "13": "25",
+    "14": "25",
+    "15": "50",
+    "16": "50",
+    "17": "50",
+}
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Compute bins using full GT annotations
+# We bin the size of the bbox diagonal
+
+full_gt_annotations_file = annotations_dir / "VIA_JSON_combined_coco_gen_sorted_imageIDs.json"
+coco_full_gt = COCO(str(full_gt_annotations_file))
+
+# compute diagonals for each gt annotation
+gt_bboxes_diagonals = np.array(
+    [
+        np.sqrt(
+            annot["bbox"][2] ** 2 + annot["bbox"][3] ** 2
+        )  # bbox is xywh in COCO
+        for annot in coco_full_gt.dataset["annotations"]
+    ]
+)
+
+# compute percentiles of diagonals
+gt_diagonal_percentiles = np.arange(0, 105, 5)
+gt_diagonal_percentile_values = np.percentile(
+    gt_bboxes_diagonals, gt_diagonal_percentiles
+)
+
+# define labels for bins
+bin_labels = [
+    f"{gt_diagonal_percentile_values[i]:.0f}-{gt_diagonal_percentile_values[i + 1]:.0f}"
+    for i in range(gt_diagonal_percentile_values.shape[0] - 1)
+]
+
+print(gt_diagonal_percentiles)
+print(gt_diagonal_percentile_values)
+print(bin_labels)
+
+# %%
+# Plot histogram of gt bboxes diagonals
+
+fig, ax1 = plt.subplots(figsize=(10, 6))
+
+# histogram
+ax1.hist(gt_bboxes_diagonals, bins=100, color="skyblue")
+
+# add vertical lines for a subset of the percentiles
+percentile_subset = [gt_diagonal_percentiles[i] for i in [0, 1, 2, 5, 10]]
+percentile_subset_values = [
+    gt_diagonal_percentile_values[i] for i in [0, 1, 2, 5, 10]
+]
+for i, percentile in enumerate(percentile_subset_values):
+    ax1.axvline(x=percentile, color="red", linestyle="-")
+    ax1.text(
+        percentile,
+        2125,
+        f"{percentile_subset[i]}%",
+        color="red",
+        ha="left",
+        va="bottom",
+    )
+
+# manually plot 1% percentile
+ax1.axvline(
+    x=np.percentile(gt_bboxes_diagonals, 1), color="red", linestyle="-"
+)
+ax1.text(
+    np.percentile(gt_bboxes_diagonals, 1),
+    2125,
+    "1%",
+    color="red",
+    ha="left",
+    va="bottom",
+)
+
+# ax1.set_title("GT bboxes diagonals")
+ax1.set_xlabel("diagonal (pixels)")
+ax1.set_ylabel("count")
+
+# Create secondary x-axis
+# ax2 = ax1.twiny()
+# ax2.tick_params(axis='x', labelcolor='r')
+# ax2.set_xticks(gt_diagonal_percentile_values)
+# ax2.set_xticklabels(gt_diagonal_percentiles)
+
+
+# %%
+# Plot P/R on validation set per "last" model
+
+csv_file = Path(
+    "/home/sminano/swc/project_ethology/figs_subset_annotations/run_slurm_1098734_0_17_val_set.csv"
+)
+
+# read csv
+df = pd.read_csv(csv_file)
+
+# add column for percentile
+if "full" in csv_file.stem:
+    df["percentile"] = [
+        map_run_slurm_977884_jobID_to_percentile[run_name.split("_")[-1]]
+        for run_name in df["trained_model/run_name"]
+    ]
+    df["percentile"] = df["percentile"].astype(int)
+
+else:
+    df["percentile"] = [
+        Path(file).stem.split("_")[-1]
+        for file in df["dataset/annotation_files"]
+    ]
+    df.loc[df["percentile"] == "gen", "percentile"] = "00"
+    df["percentile"] = df["percentile"].astype(int)
+
+# check if val or test set
+eval_set = "test" if df["cli_args/use_test_set"].all() else "val"
+print(f"Evaluating on {eval_set} set")
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# plot precision and recall on validation set
+fig, ax = plt.subplots(figsize=(10, 6))
+
+# Precision plot
+ax.scatter(
+    df["percentile"],
+    df[f"{eval_set}_precision"],
+    marker="o",
+    color="blue",
+    alpha=0.3,
+)
+half_width = 0.75
+ax.hlines(
+    df.groupby("percentile")[f"{eval_set}_precision"].mean().values,
+    df.groupby("percentile")[f"{eval_set}_precision"].mean().index
+    - half_width,
+    df.groupby("percentile")[f"{eval_set}_precision"].mean().index
+    + half_width,
+    linewidth=4,
+    color="blue",
+)
+ax.set_ylim(0.4, 1.00)
+ax.set_xlabel("model trained on bboxes > percentile")
+ax.set_ylabel(f"{eval_set} precision", color="blue")
+ax.tick_params(axis="y", labelcolor="blue")
+
+
+# Recall plot (everything is drawn on the twin axis ax2, not on ax)
+ax2 = ax.twinx()
+ax2.scatter(
+    df["percentile"],
+    df[f"{eval_set}_recall"],
+    marker="o",
+    color="red",
+    alpha=0.3,
+)
+half_width = 0.75
+ax2.hlines(
+    df.groupby("percentile")[f"{eval_set}_recall"].mean().values,
+    df.groupby("percentile")[f"{eval_set}_recall"].mean().index - half_width,
+    df.groupby("percentile")[f"{eval_set}_recall"].mean().index + half_width,
+    linewidth=4,
+    color="red",
+)
+ax2.set_ylim(0.4, 1.00)
+ax2.set_ylabel(f"{eval_set} recall", color="red")
+ax2.tick_params(axis="y", labelcolor="red")
+
+
+plt.show()
+
+
+# %%
+# %matplotlib widget
+# %%

From 60e79ec54c967242fb2c80d380eff639a257e4c4 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 17 Jul 2025 21:21:09 +0100
Subject: [PATCH 23/72] Explore ensemble

---
 ethology/detectors/inference.py          |  46 ++--
 notebooks/notebook_combine_detections.py | 265 +++++++++++++++++++++++
 pyproject.toml                           |   2 +
 3 files changed, 296 insertions(+), 17 deletions(-)
 create mode 100644 notebooks/notebook_combine_detections.py

diff --git a/ethology/detectors/inference.py b/ethology/detectors/inference.py
index 463989a2..8f990942 100644
--- a/ethology/detectors/inference.py
+++ b/ethology/detectors/inference.py
@@ -20,6 +20,25 @@ def _detections_per_image_id_as_ds(
     detections_per_image_id: dict,
 ) -> xr.Dataset:
     """Reshape detections per sample as xarray dataset."""
+    # Place tensors on cpu if required
+    if any(
+        [
+            any(
+                isinstance(detections[key], torch.Tensor) for key in detections
+            )
+            for detections in detections_per_image_id.values()
+        ]
+    ):
+        detections_per_image_id = {
+            image_id: {
+                key: value.cpu().numpy()
+                if isinstance(value, torch.Tensor)
+                else value
+                for key, value in detections.items()
+            }
+            for image_id, detections in detections_per_image_id.items()
+        }
+
     # Get coordinates
     list_image_id_coords = list(detections_per_image_id.keys())
     list_space_coords = ["x", "y"]
@@ -41,33 +60,28 @@ def _detections_per_image_id_as_ds(
 
     # Get lists of data arrays
     list_centroid_arrays = [
-        (
-            detections["boxes"].cpu().numpy()[:, 0:2]
-            + detections["boxes"].cpu().numpy()[:, 2:4]
-        )
-        * 0.5
+        (detections["boxes"][:, 0:2] + detections["boxes"][:, 2:4]) * 0.5
         for detections in detections_per_image_id.values()
     ]
 
     list_shape_arrays = [
-        detections["boxes"].cpu().numpy()[:, 2:4]
-        - detections["boxes"].cpu().numpy()[:, 0:2]
+        detections["boxes"][:, 2:4] - detections["boxes"][:, 0:2]
         for detections in detections_per_image_id.values()
     ]
 
     list_confidence_arrays = [
-        detections["scores"].cpu().numpy()  # .reshape(-1, 1)
+        detections["scores"]  # .reshape(-1, 1)
         for detections in detections_per_image_id.values()
     ]
 
     list_label_arrays = [
-        detections["labels"].cpu().numpy()  # .reshape(-1, 1)
+        detections["labels"]  # .reshape(-1, 1)
         for detections in detections_per_image_id.values()
     ]
 
     # Define arrays to create
     arrays_dict = {
-        "centroids": {  # --> change to position
+        "position": {  # --> before: centroids
             "data": list_centroid_arrays,
             "coords": coords_dict,
             "pad_value": np.nan,
@@ -152,13 +166,6 @@ def run_detector_on_dataset(
     return detections_dataset
 
 
-def _detections_per_batch_as_ds(
-    detections_per_batch: dict,
-) -> xr.Dataset:
-    """Reshape detections per batch as xarray dataset."""
-    pass
-
-
 def run_detector_on_dataloader(
     model: torch.nn.Module,
     dataloader: torch.utils.data.DataLoader,
@@ -190,6 +197,11 @@ def run_detector_on_dataloader(
         # Add to dict
         detections_per_batch[batch_idx] = detections_batch
 
+    # # Format as xarray dataset
+    # detections_dataset = _detections_per_image_id_as_ds(
+    #     detections_per_image_id
+    # )
+
     return detections_per_batch
 
 
diff --git a/notebooks/notebook_combine_detections.py b/notebooks/notebook_combine_detections.py
new file mode 100644
index 00000000..fc21a70c
--- /dev/null
+++ b/notebooks/notebook_combine_detections.py
@@ -0,0 +1,265 @@
+# Compute ensemble of detections
+# See
+# - https://docs.pytorch.org/tutorials/intermediate/ensembling.html
+# - https://discuss.pytorch.org/t/how-to-make-predictions-using-an-ensemble-of-models-in-parallel-on-a-single-gpu/202412/4
+
+# %%
+from datetime import datetime
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torchvision.transforms.v2 as transforms
+import xarray as xr
+from ensemble_boxes import weighted_boxes_fusion
+from torch.utils.data import random_split
+
+from ethology.datasets.create import create_coco_dataset
+from ethology.detectors.inference import _detections_per_image_id_as_ds
+
+xr.set_options(display_expand_attrs=False)
+
+# %matplotlib widget
+
+# %%
+# Input data
+
+
+# models
+list_models = [
+    # "above_0th", -- skip for now because diff image_ids
+    "above_1st",
+    "above_5th",
+    "above_10th",
+    "above_25th",
+    "above_50th",
+]
+timestamp_ref = "20250717_115247"
+predictions_dir = Path(
+    "/home/sminano/swc/project_ethology/remove_small_bboxes_inD_output"
+)
+model_to_path = {
+    model_key: predictions_dir
+    / f"{model_key}_detections_val_set_seed_42_{timestamp_ref}.nc"
+    for model_key in list_models
+}
+
+
+# dataset
+dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
+annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
+full_gt_annotations_file = (
+    annotations_dir / "VIA_JSON_combined_coco_gen_sorted_imageIDs.json"
+)
+image_width = 4096  # pixels
+image_height = 2160  # pixels
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Load dataset from full GT data
+
+
+def split_dataset_crab_repo(dataset_coco, seed_n, config):
+    """Split dataset into train, val and test subsets like in crabs repo."""
+    # One generator per split, both seeded with seed_n for reproducibility
+    rng_train_split = torch.Generator().manual_seed(seed_n)
+    rng_val_split = torch.Generator().manual_seed(seed_n)
+
+    # Split off the train set; the remainder holds test and val samples
+    train_dataset, test_val_dataset = random_split(
+        dataset_coco,
+        [config["train_fraction"], 1 - config["train_fraction"]],
+        generator=rng_train_split,
+    )
+
+    # Split the remainder: test comes first, val gets val_over_test_fraction
+    test_dataset, val_dataset = random_split(
+        test_val_dataset,
+        [
+            1 - config["val_over_test_fraction"],
+            config["val_over_test_fraction"],
+        ],
+        generator=rng_val_split,
+    )
+
+    print(f"Seed: {seed_n}")
+    print(f"Number of training samples: {len(train_dataset)}")  # images
+    print(f"Number of validation samples: {len(val_dataset)}")  # images
+    print(f"Number of test samples: {len(test_dataset)}")  # images
+
+    return train_dataset, val_dataset, test_dataset
+
+
+inference_transforms = transforms.Compose(
+    [
+        transforms.ToImage(),
+        transforms.ToDtype(torch.float32, scale=True),
+    ]
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Create COCO dataset
+dataset_coco = create_coco_dataset(
+    images_dir=Path(dataset_dir) / "frames",
+    annotations_file=full_gt_annotations_file,
+    composed_transform=inference_transforms,
+)
+
+# Split dataset like in crabs repo
+train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
+    dataset_coco,
+    seed_n=42,
+    config={"train_fraction": 0.8, "val_over_test_fraction": 0.5},
+)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Read detections
+
+ds_predictions_per_model = {
+    model_key: xr.open_dataset(model_to_path[model_key])
+    for model_key in list_models
+}
+
+# %%
+# Get list of image_ids
+list_image_ids = (
+    ds_predictions_per_model[list_models[0]].coords["image_id"].values.tolist()
+)
+
+# check the same image_ids are present in all models
+assert np.all(
+    [
+        ds_predictions_per_model[model_key].coords["image_id"].values.tolist()
+        == list_image_ids
+        for model_key in list_models[1:]
+    ]
+)
+
+# check image_ids match val dataset
+list_image_ids_val_set = [annot["image_id"] for _, annot in val_dataset]
+assert np.all(set(list_image_ids) == set(list_image_ids_val_set))
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Compute consensus detections
+
+model_weights = [1.0 / len(list_models) for _ in list_models]
+iou_thr = 0.5  # threshold for a match
+skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
+sigma = 0.1
+
+detections_per_image_id = {}
+
+# for image_id in list_image_ids[:1]:
+for image, annots in [val_dataset[i] for i in range(10)]:
+    # Get image_id
+    image_id = annots["image_id"]
+
+    # Get predictions per model for this image_id
+    list_bboxes, list_labels, list_scores = [], [], []
+    for model_key in list_models:
+        # get predictions for this model and image_id
+        position = (
+            ds_predictions_per_model[model_key]
+            .centroids.sel(image_id=image_id)
+            .T
+        )
+        shape = (
+            ds_predictions_per_model[model_key].shape.sel(image_id=image_id).T
+        )
+        labels = (
+            ds_predictions_per_model[model_key].label.sel(image_id=image_id).T
+        )
+        confidence = (
+            ds_predictions_per_model[model_key]
+            .confidence.sel(image_id=image_id)
+            .T
+        )
+
+        # normalize coordinates to [0, 1]
+        img_width_height = np.array([image_width, image_height])[None, :]
+        x1_y1_norm = (position - shape / 2) / img_width_height
+        x2_y2_norm = (position + shape / 2) / img_width_height
+        x1_y1_x2_y2_norm = np.c_[x1_y1_norm, x2_y2_norm]
+
+        # append to list
+        list_bboxes.append(x1_y1_x2_y2_norm)
+        list_labels.append(labels)
+        list_scores.append(confidence)
+
+    # fuse boxes across models with weighted boxes fusion (soft nms unused)
+    # ensemble_x1_y1_x2_y2_norm, ensemble_scores, ensemble_labels = soft_nms(
+    ensemble_x1_y1_x2_y2_norm, ensemble_scores, ensemble_labels = (
+        weighted_boxes_fusion(
+            list_bboxes,
+            list_scores,
+            list_labels,
+            # weights=model_weights,
+            iou_thr=iou_thr,
+            # sigma=sigma,
+            skip_box_thr=skip_box_thr,
+        )
+    )
+
+    # add ensemble results to dict
+    ensemble_x1_y1_x2_y2 = ensemble_x1_y1_x2_y2_norm * np.tile(
+        img_width_height, (1, 2)
+    )
+    detections_per_image_id[image_id] = {
+        "boxes": ensemble_x1_y1_x2_y2,
+        "scores": ensemble_scores,
+        "labels": ensemble_labels,
+    }
+
+    # ------------------------------------------------------------
+    # plot
+    plt.figure(figsize=(10, 10))
+    plt.imshow(image.permute(1, 2, 0).numpy())
+
+    # plot GT annotations as green boxes
+    for gt_box in annots["boxes"]:
+        plt.gca().add_patch(
+            plt.Rectangle(
+                (gt_box[0], gt_box[1]),
+                gt_box[2] - gt_box[0],
+                gt_box[3] - gt_box[1],
+                fill=False,
+                edgecolor=(0, 1, 0),
+                linewidth=1,
+            )
+        )
+
+    # plot ensemble detections as red boxes
+    for pred_box in ensemble_x1_y1_x2_y2:
+        plt.gca().add_patch(
+            plt.Rectangle(
+                (pred_box[0], pred_box[1]),
+                pred_box[2] - pred_box[0],
+                pred_box[3] - pred_box[1],
+                fill=False,
+                edgecolor="r",
+                linewidth=1,
+            )
+        )
+    plt.title(f"Image {image_id}")
+    # plt.show()
+
+# %%
+# ensemble results as a xarray dataset
+
+ensemble_ds = _detections_per_image_id_as_ds(detections_per_image_id)
+
+
+ensemble_ds.attrs["models"] = list_models
+# ensemble_ds.attrs["iou_thr"] = iou_thr
+# ensemble_ds.attrs["skip_box_thr"] = skip_box_thr
+# ensemble_ds.attrs["sigma"] = sigma
+# ensemble_ds.attrs["model_weights"] = model_weights
+
+# save ensemble results
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ensemble_ds.to_netcdf(
+    predictions_dir / f"ensemble_detections_val_set_seed_42_{timestamp}.nc"
+)
+# %%
diff --git a/pyproject.toml b/pyproject.toml
index 7616a0ac..5061bf16 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,8 @@ dependencies = [
   "movement",
   "mlflow-skinny",
   "netCDF4",
+  "torch",
+  "ensemble-boxes",
 ]
 
 [project.urls]

From 0a9b8804cba6ccbd607691a5ea5b199a798e015c Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 17 Jul 2025 21:21:42 +0100
Subject: [PATCH 24/72] Small edits to other notebooks

---
 notebooks/notebook_evaluate_binned_performance.py | 2 +-
 notebooks/notebook_mlflow_plots.py                | 6 +++---
 notebooks/notebook_run_detection_on_dataset.py    | 1 -
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/notebooks/notebook_evaluate_binned_performance.py b/notebooks/notebook_evaluate_binned_performance.py
index 9f914e29..e69d64b6 100644
--- a/notebooks/notebook_evaluate_binned_performance.py
+++ b/notebooks/notebook_evaluate_binned_performance.py
@@ -16,6 +16,7 @@
 import xarray as xr
 from pycocotools.coco import COCO
 from torch.utils.data import random_split
+from torchvision.utils import draw_bounding_boxes
 
 from ethology.datasets.create import create_coco_dataset
 from ethology.detectors.evaluate import evaluate_detections_hungarian
@@ -24,7 +25,6 @@
     read_config_from_mlflow_params,
     read_mlflow_params,
 )
-from torchvision.utils import draw_bounding_boxes
 
 # Set xarray options
 xr.set_options(display_expand_attrs=False)
diff --git a/notebooks/notebook_mlflow_plots.py b/notebooks/notebook_mlflow_plots.py
index 9a2ba9d7..372bac8d 100644
--- a/notebooks/notebook_mlflow_plots.py
+++ b/notebooks/notebook_mlflow_plots.py
@@ -127,10 +127,10 @@
 
 
 # %%
-# Plot P/R on validation set per "last" model
+# Prepare data
 
 csv_file = Path(
-    "/home/sminano/swc/project_ethology/figs_subset_annotations/run_slurm_1098734_0_17_val_set.csv"
+    "/home/sminano/swc/project_ethology/figs_subset_annotations/run_slurm_1103832_0_17_val_set_full.csv"
 )
 
 # read csv
@@ -157,7 +157,7 @@
 print(f"Evaluating on {eval_set} set")
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# plot precision and recall on validation set
+# plot precision and recall 
 fig, ax = plt.subplots(figsize=(10, 6))
 
 # Precision plot
diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_dataset.py
index 8e6adc5f..66f5d96d 100644
--- a/notebooks/notebook_run_detection_on_dataset.py
+++ b/notebooks/notebook_run_detection_on_dataset.py
@@ -175,7 +175,6 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
     detections_ds.attrs["coco_crabs_dataset_split"] = "val"
 
     # ------------------------------------
-    # Save detections dataset and evaluation dataset
     # Save detections dataset
     detections_ds.to_netcdf(
         output_dir

From 81549f54bd4605d5bac1d4dc5dc68407bf8a4cba Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 17 Jul 2025 20:22:01 +0000
Subject: [PATCH 25/72] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../notebook_evaluate_binned_performance.py   | 43 +++++++++----------
 notebooks/notebook_mlflow_plots.py            |  6 ++-
 2 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/notebooks/notebook_evaluate_binned_performance.py b/notebooks/notebook_evaluate_binned_performance.py
index e69d64b6..1632ad01 100644
--- a/notebooks/notebook_evaluate_binned_performance.py
+++ b/notebooks/notebook_evaluate_binned_performance.py
@@ -5,7 +5,6 @@
 """
 
 # %%
-import pickle
 from pathlib import Path
 
 import matplotlib.pyplot as plt
@@ -16,7 +15,6 @@
 import xarray as xr
 from pycocotools.coco import COCO
 from torch.utils.data import random_split
-from torchvision.utils import draw_bounding_boxes
 
 from ethology.datasets.create import create_coco_dataset
 from ethology.detectors.evaluate import evaluate_detections_hungarian
@@ -227,8 +225,10 @@ def discretize_based_on_bbox_diagonal(
     return predictions_df, gt_annotations_df
 
 
-def plot_true_positives_per_bin(predictions_df, predictions_per_diagonal_bin, model_key):
-    "Plot true positives per diagonal bin"
+def plot_true_positives_per_bin(
+    predictions_df, predictions_per_diagonal_bin, model_key
+):
+    """Plot true positives per diagonal bin"""
     true_positives_counts = pd.DataFrame(
         {
             "Predictions": predictions_per_diagonal_bin,
@@ -261,7 +261,6 @@ def plot_true_positives_per_bin(predictions_df, predictions_per_diagonal_bin, mo
     ax.set_ylim(0.0, 425)
     ax.grid(True, alpha=0.3)
 
-
     # add line plot for precision on right y-axis
     ax2 = ax.twinx()
     ax2.plot(
@@ -285,8 +284,9 @@ def plot_true_positives_per_bin(predictions_df, predictions_per_diagonal_bin, mo
     plt.show()
 
 
-def plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model_key):
-
+def plot_missed_detections_per_bin(
+    gt_annotations_df, gt_per_diagonal_bin, model_key
+):
     missed_detections_counts = pd.DataFrame(
         {
             "Ground Truth": gt_per_diagonal_bin,
@@ -305,7 +305,9 @@ def plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model
 
     # Plot as bar chart
     fig, ax = plt.subplots(1, 1, figsize=(10, 6))
-    missed_detections_counts.loc[:, ["Ground Truth", "Matched Ground Truth"]].plot(
+    missed_detections_counts.loc[
+        :, ["Ground Truth", "Matched Ground Truth"]
+    ].plot(
         kind="bar",
         ax=ax,
         figsize=(12, 6),
@@ -319,7 +321,6 @@ def plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model
     ax.set_ylim(0.0, 400)
     ax.grid(True, alpha=0.3)
 
-
     # add line plot for recall on right y-axis
     ax2 = ax.twinx()
     ax2.plot(
@@ -333,7 +334,6 @@ def plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model
     ax2.set_ylabel("Recall", color="blue")
     ax2.set_ylim(0.0, 1.00)  # Recall is between 0 and 1
 
-
     # add reference line at 0.85
     ax2.axhline(y=0.85, color="blue", linestyle="--", linewidth=1)
 
@@ -435,7 +435,6 @@ def plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model
     config = read_config_from_mlflow_params(mlflow_params)
     cli_args = read_cli_args_from_mlflow_params(mlflow_params)
 
-
     # Create COCO dataset
     # Fix for model trained on all annotations
     # (VIA_JSON_combined_coco_gen has different image IDs than the rest)
@@ -445,13 +444,13 @@ def plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model
     ):
         dataset_coco = create_coco_dataset(
             images_dir=Path(dataset_dir) / "frames",
-            annotations_file=annotations_dir / "VIA_JSON_combined_coco_gen.json",
+            annotations_file=annotations_dir
+            / "VIA_JSON_combined_coco_gen.json",
             composed_transform=inference_transforms,
         )
     else:
         dataset_coco = default_dataset_coco
 
-
     # Split dataset like in crabs repo
     train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
         dataset_coco,
@@ -485,8 +484,7 @@ def plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model
         bin_labels,
     )
 
-
-    #---------------------------------------
+    # ---------------------------------------
     # Plot boxes in each diagonal bin in validation set
     predictions_per_diagonal_bin = (
         predictions_df["diagonal_bins"].value_counts().sort_index()
@@ -519,16 +517,17 @@ def plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model
     plt.tight_layout()
     plt.show()
 
-    #---------------------------------------
+    # ---------------------------------------
     # Plot true positives per bin
-    plot_true_positives_per_bin(predictions_df, predictions_per_diagonal_bin, model_key)
-    
+    plot_true_positives_per_bin(
+        predictions_df, predictions_per_diagonal_bin, model_key
+    )
 
-    #---------------------------------------
+    # ---------------------------------------
     # Plot missed detections per bin
-    plot_missed_detections_per_bin(gt_annotations_df, gt_per_diagonal_bin, model_key)
-
-
+    plot_missed_detections_per_bin(
+        gt_annotations_df, gt_per_diagonal_bin, model_key
+    )
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
diff --git a/notebooks/notebook_mlflow_plots.py b/notebooks/notebook_mlflow_plots.py
index 372bac8d..399993ba 100644
--- a/notebooks/notebook_mlflow_plots.py
+++ b/notebooks/notebook_mlflow_plots.py
@@ -49,7 +49,9 @@
 # Compute bins using full GT annotations
 # We bin the size of the bbox diagonal
 
-full_gt_annotations_file = annotations_dir / "VIA_JSON_combined_coco_gen_sorted_imageIDs.json"
+full_gt_annotations_file = (
+    annotations_dir / "VIA_JSON_combined_coco_gen_sorted_imageIDs.json"
+)
 coco_full_gt = COCO(str(full_gt_annotations_file))
 
 # compute diagonals for each gt annotation
@@ -157,7 +159,7 @@
 print(f"Evaluating on {eval_set} set")
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# plot precision and recall 
+# plot precision and recall
 fig, ax = plt.subplots(figsize=(10, 6))
 
 # Precision plot

From 229fa9f13aef1d4ea512e8d381871d3f802d056e Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 18 Jul 2025 18:21:18 +0100
Subject: [PATCH 26/72] Save frames, compute precision and recall per frame

---
 notebooks/notebook_combine_detections.py | 274 +++++++++++++++++------
 1 file changed, 210 insertions(+), 64 deletions(-)

diff --git a/notebooks/notebook_combine_detections.py b/notebooks/notebook_combine_detections.py
index fc21a70c..50db52d9 100644
--- a/notebooks/notebook_combine_detections.py
+++ b/notebooks/notebook_combine_detections.py
@@ -4,11 +4,14 @@
 # - https://discuss.pytorch.org/t/how-to-make-predictions-using-an-ensemble-of-models-in-parallel-on-a-single-gpu/202412/4
 
 # %%
+import json
 from datetime import datetime
 from pathlib import Path
 
+import cv2
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
 import torch
 import torchvision.transforms.v2 as transforms
 import xarray as xr
@@ -16,6 +19,7 @@
 from torch.utils.data import random_split
 
 from ethology.datasets.create import create_coco_dataset
+from ethology.detectors.evaluate import evaluate_detections_hungarian
 from ethology.detectors.inference import _detections_per_image_id_as_ds
 
 xr.set_options(display_expand_attrs=False)
@@ -23,10 +27,94 @@
 # %matplotlib widget
 
 # %%
-# Input data
+# Helper function for plotting and saving ensemble detections
+
+
+def plot_and_save_ensemble_detections(
+    image,
+    gt_boxes_x1_y1_x2_y2,
+    pred_boxes_x1_y1_x2_y2,
+    pred_boxes_scores,
+    image_id,
+    output_dir,
+    precision,
+    recall,
+):
+    """Draw GT and ensemble boxes on the image, save as PNG, return BGR."""
+    # Convert tensor to numpy array and transpose from (C, H, W) to (H, W, C)
+    # Convert from float [0,1] to uint8 [0,255] for OpenCV
+    # Convert from RGB to BGR for OpenCV
+    image_cv = image.permute(1, 2, 0).numpy()
+    image_cv = (image_cv * 255).astype(np.uint8)
+    image_cv = cv2.cvtColor(image_cv, cv2.COLOR_RGB2BGR)
+
+    # draw GT boxes in green (coordinates are truncated to integer pixels)
+    for gt_box in gt_boxes_x1_y1_x2_y2:
+        x1, y1, x2, y2 = gt_box.cpu().numpy().astype(int)
+        cv2.rectangle(
+            image_cv,
+            (x1, y1),
+            (x2, y2),
+            (0, 255, 0),  # Green color in BGR
+            2,  # Line thickness
+        )
 
+    # draw ensemble detections in red; strict zip requires equal lengths
+    for pred_box, confidence in zip(
+        pred_boxes_x1_y1_x2_y2, pred_boxes_scores, strict=True
+    ):
+        x1, y1, x2, y2 = pred_box.astype(int)
+
+        cv2.rectangle(
+            image_cv,
+            (x1, y1),
+            (x2, y2),
+            (0, 0, 255),  # Red color in BGR
+            2,  # Line thickness
+        )
+
+        # label each box with its confidence, just above the (x1, y1) corner
+        text = f"{confidence:.2f}"
+        cv2.putText(
+            image_cv,
+            text,
+            (x1, y1 - 3),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.5,  # Font scale
+            (0, 0, 255),  # Red color in BGR
+            2,  # Line thickness
+        )
+
+    # add text with precision and recall to bottom right corner
+    cv2.putText(
+        image_cv,
+        f"Precision: {precision:.2f}",
+        (image_cv.shape[1] - 400, image_cv.shape[0] - 100),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        1.5,  # Font scale
+        (0, 0, 255),  # Red color in BGR
+        4,  # Line thickness
+    )
+    cv2.putText(
+        image_cv,
+        f"Recall: {recall:.2f}",
+        (image_cv.shape[1] - 400, image_cv.shape[0] - 50),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        1.5,  # Font scale
+        (0, 0, 255),  # Red color in BGR
+        4,  # Line thickness
+    )
 
-# models
+    # Save the annotated image as PNG, named with the zero-padded image_id
+    output_filename = output_dir / f"val_set_{image_id:06d}.png"
+    cv2.imwrite(str(output_filename), image_cv)
+    print(f"Saved: {output_filename}")
+
+    return image_cv
+
+
+# %%
+# Input data
 list_models = [
     # "above_0th", -- skip for now because diff image_ids
     "above_1st",
@@ -55,6 +143,11 @@
 image_width = 4096  # pixels
 image_height = 2160  # pixels
 
+# output directory
+output_dir = Path("/home/sminano/swc/project_ethology/ensemble_detections")
+output_dir.mkdir(parents=True, exist_ok=True)
+
+flag_save_images = True
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Load dataset from full GT data
@@ -98,21 +191,6 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
     ]
 )
 
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Create COCO dataset
-dataset_coco = create_coco_dataset(
-    images_dir=Path(dataset_dir) / "frames",
-    annotations_file=full_gt_annotations_file,
-    composed_transform=inference_transforms,
-)
-
-# Split dataset like in crabs repo
-train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
-    dataset_coco,
-    seed_n=42,
-    config={"train_fraction": 0.8, "val_over_test_fraction": 0.5},
-)
-
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Read detections
@@ -122,7 +200,9 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
     for model_key in list_models
 }
 
-# %%
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Get data from predictions
+
 # Get list of image_ids
 list_image_ids = (
     ds_predictions_per_model[list_models[0]].coords["image_id"].values.tolist()
@@ -137,26 +217,75 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
     ]
 )
 
-# check image_ids match val dataset
+# get common seed
+seed_n = np.unique(
+    [ds.attrs["seed_n"] for ds in ds_predictions_per_model.values()]
+)
+assert seed_n.shape == (1,)
+seed_n = seed_n.item()
+
+# get common config
+config_split = [
+    json.loads(ds.attrs["config"]) for ds in ds_predictions_per_model.values()
+]
+config_split = [
+    {
+        "train_fraction": cfg["train_fraction"],
+        "val_over_test_fraction": cfg["val_over_test_fraction"],
+    }
+    for cfg in config_split
+]
+assert all([cfg == config_split[0] for cfg in config_split[1:]])
+config_split = config_split[0]
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Create COCO dataset
+print(full_gt_annotations_file)
+dataset_coco = create_coco_dataset(
+    images_dir=Path(dataset_dir) / "frames",
+    annotations_file=full_gt_annotations_file,
+    composed_transform=inference_transforms,
+)
+
+# Split dataset like in crabs repo
+train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
+    dataset_coco,
+    seed_n=seed_n,
+    config=config_split,
+)
+
+
+# check image_ids match image_ids in val dataset
 list_image_ids_val_set = [annot["image_id"] for _, annot in val_dataset]
 assert np.all(set(list_image_ids) == set(list_image_ids_val_set))
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Compute consensus detections
 
-model_weights = [1.0 / len(list_models) for _ in list_models]
-iou_thr = 0.5  # threshold for a match
+# model_weights = [1.0 / len(list_models) for _ in list_models]
+iou_thr_ensemble = 0.5  # threshold for a match
 skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
 sigma = 0.1
 
 detections_per_image_id = {}
+precision_recall_per_sample = {}
 
-# for image_id in list_image_ids[:1]:
-for image, annots in [val_dataset[i] for i in range(10)]:
-    # Get image_id
-    image_id = annots["image_id"]
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+timestamped_output_dir = output_dir / timestamp
+timestamped_output_dir.mkdir(parents=True, exist_ok=True)
+
+if flag_save_images:
+    (timestamped_output_dir / "frames").mkdir(parents=True, exist_ok=True)
 
-    # Get predictions per modelfor this image_id
+iou_threshold_precision = 0.1  # threshold for a TP
+
+
+# Loop thru samples in val set
+for k, (image, annots) in enumerate(val_dataset):
+
+    # Get predictions per model for this image_id
+    image_id = annots["image_id"]
     list_bboxes, list_labels, list_scores = [], [], []
     for model_key in list_models:
         # get predictions for this model and image_id
@@ -190,18 +319,27 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
 
     # compute soft nms
     # ensemble_x1_y1_x2_y2_norm, ensemble_scores, ensemble_labels = soft_nms(
+
+    # compute weighted boxes fusion
+    # sometimes returns nan?
     ensemble_x1_y1_x2_y2_norm, ensemble_scores, ensemble_labels = (
         weighted_boxes_fusion(
             list_bboxes,
             list_scores,
             list_labels,
             # weights=model_weights,
-            iou_thr=iou_thr,
+            iou_thr=iou_thr_ensemble,
             # sigma=sigma,
             skip_box_thr=skip_box_thr,
         )
     )
 
+    # remove rows with nan coordinates
+    slc_nan_rows = np.any(np.isnan(ensemble_x1_y1_x2_y2_norm), axis=1)
+    ensemble_x1_y1_x2_y2_norm = ensemble_x1_y1_x2_y2_norm[~slc_nan_rows]
+    ensemble_scores = ensemble_scores[~slc_nan_rows]
+    ensemble_labels = ensemble_labels[~slc_nan_rows]
+
     # add ensemble results to dict
     ensemble_x1_y1_x2_y2 = ensemble_x1_y1_x2_y2_norm * np.tile(
         img_width_height, (1, 2)
@@ -212,54 +350,62 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
         "labels": ensemble_labels,
     }
 
-    # ------------------------------------------------------------
-    # plot
-    plt.figure(figsize=(10, 10))
-    plt.imshow(image.permute(1, 2, 0).numpy())
+    # compute precision per frame
+    tp, fp, md = evaluate_detections_hungarian(
+        ensemble_x1_y1_x2_y2, annots["boxes"], iou_threshold_precision
+    )
 
-    # plot GT annotations as green boxes
-    for gt_box in annots["boxes"]:
-        plt.gca().add_patch(
-            plt.Rectangle(
-                (gt_box[0], gt_box[1]),
-                gt_box[2] - gt_box[0],
-                gt_box[3] - gt_box[1],
-                fill=False,
-                edgecolor=(0, 1, 0),
-                linewidth=1,
-            )
-        )
+    precision_recall_per_sample[k] = {
+        "image_id": image_id,
+        "precision": tp.sum() / (tp.sum() + fp.sum()),  # nan if no preds
+        "recall": tp.sum() / (tp.sum() + md.sum()),  # nan if no GT boxes
+    }
 
-    # plot ensemble detections as red boxes
-    for pred_box in ensemble_x1_y1_x2_y2:
-        plt.gca().add_patch(
-            plt.Rectangle(
-                (pred_box[0], pred_box[1]),
-                pred_box[2] - pred_box[0],
-                pred_box[3] - pred_box[1],
-                fill=False,
-                edgecolor="r",
-                linewidth=1,
-            )
+    # ------------------------------------------------------------
+    # plot and save ensemble detections
+    if flag_save_images:
+        plot_and_save_ensemble_detections(
+            image=image,
+            gt_boxes_x1_y1_x2_y2=annots["boxes"],
+            pred_boxes_x1_y1_x2_y2=ensemble_x1_y1_x2_y2,
+            pred_boxes_scores=ensemble_scores,
+            image_id=image_id,
+            output_dir=timestamped_output_dir / "frames",
+            precision=precision_recall_per_sample[k]["precision"],
+            recall=precision_recall_per_sample[k]["recall"],
         )
-    plt.title(f"Image {image_id}")
-    # plt.show()
 
 # %%
-# ensemble results as a xarray dataset
+# average precision and recall
+df_precision_recall = pd.DataFrame.from_dict(
+    precision_recall_per_sample, orient="index"
+)
 
-ensemble_ds = _detections_per_image_id_as_ds(detections_per_image_id)
+# cast image_id to int
+df_precision_recall["image_id"] = df_precision_recall["image_id"].astype(int)
+
+print(df_precision_recall)
+print(df_precision_recall.shape)
+print(df_precision_recall.loc[:, ["precision", "recall"]].mean())
+
+# add mean to df
+df_precision_recall.loc["mean"] = df_precision_recall.mean()
+
+# save as csv
+df_precision_recall.to_csv(
+    timestamped_output_dir / "precision_recall.csv", index=False
+)
 
 
+# %%
+# ensemble results as a xarray dataset
+
+ensemble_ds = _detections_per_image_id_as_ds(detections_per_image_id)
 ensemble_ds.attrs["models"] = list_models
-# ensemble_ds.attrs["iou_thr"] = iou_thr
-# ensemble_ds.attrs["skip_box_thr"] = skip_box_thr
-# ensemble_ds.attrs["sigma"] = sigma
-# ensemble_ds.attrs["model_weights"] = model_weights
 
 # save ensemble results
-timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 ensemble_ds.to_netcdf(
-    predictions_dir / f"ensemble_detections_val_set_seed_42_{timestamp}.nc"
+    timestamped_output_dir
+    / f"ensemble_detections_val_set_seed_{seed_n}_{timestamp}.nc"
 )
 # %%

From 7566adf8e28ad4f3be90b42964b67ecc2054d9d4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 18 Jul 2025 17:22:33 +0000
Subject: [PATCH 27/72] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 notebooks/notebook_combine_detections.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/notebooks/notebook_combine_detections.py b/notebooks/notebook_combine_detections.py
index 50db52d9..68b8af39 100644
--- a/notebooks/notebook_combine_detections.py
+++ b/notebooks/notebook_combine_detections.py
@@ -9,7 +9,6 @@
 from pathlib import Path
 
 import cv2
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import torch
@@ -283,7 +282,6 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
 
 # Loop thru samples in val set
 for k, (image, annots) in enumerate(val_dataset):
-
     # Get predictions per model for this image_id
     image_id = annots["image_id"]
     list_bboxes, list_labels, list_scores = [], [], []

From 78de2973471ed59bc8ba73daf61c9d43566c44e2 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 18 Jul 2025 18:23:35 +0100
Subject: [PATCH 28/72] cast as pytorch tensor in evaluate

---
 ethology/detectors/evaluate.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/ethology/detectors/evaluate.py b/ethology/detectors/evaluate.py
index d07baba8..4488273c 100644
--- a/ethology/detectors/evaluate.py
+++ b/ethology/detectors/evaluate.py
@@ -8,7 +8,7 @@
 
 def evaluate_detections_hungarian(
     pred_bboxes: np.ndarray, gt_bboxes: np.ndarray, iou_threshold: float
-) -> dict:
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Compute true positives, false positives, and missed detections.
 
     Uses Hungarian algorithm for matching.
@@ -29,7 +29,8 @@ def evaluate_detections_hungarian(
     tuple
         A tuple of three boolean arrays:
         - true_positives: True for each predicted bbox that is a true positive
-        - false_positives: True for each predicted bbox that is a false positive
+        - false_positives: True for each predicted bbox that is a false
+        positive
         - missed_detections: True for each ground truth bbox that is missed
 
     """
@@ -39,16 +40,15 @@ def evaluate_detections_hungarian(
     matched_gts = np.zeros(len(gt_bboxes), dtype=bool)
     missed_detections = np.zeros(len(gt_bboxes), dtype=bool)  # unmatched gts
 
+    # cast as a tensor if not already
+    if not isinstance(pred_bboxes, torch.Tensor):
+        pred_bboxes = torch.tensor(pred_bboxes, dtype=torch.float32)
+    if not isinstance(gt_bboxes, torch.Tensor):
+        gt_bboxes = torch.tensor(gt_bboxes, dtype=torch.float32)
+
     if len(pred_bboxes) > 0 and len(gt_bboxes) > 0:
         # Compute IoU matrix (pred_bboxes x gt_bboxes)
-        iou_matrix = (
-            ops.box_iou(
-                torch.tensor(pred_bboxes[:, :4], dtype=torch.float32),
-                torch.tensor(gt_bboxes, dtype=torch.float32),
-            )
-            .cpu()
-            .numpy()
-        )
+        iou_matrix = ops.box_iou(pred_bboxes[:, :4], gt_bboxes).cpu().numpy()
 
         # Use Hungarian algorithm to find optimal assignment
         pred_indices, gt_indices = linear_sum_assignment(

From 93d2e0881f6be9c4b5f6c4f8b13e224b825fc854 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 23 Jul 2025 11:12:08 +0100
Subject: [PATCH 29/72] Small changes and rename

---
 notebooks/notebook_combine_detections.py                  | 8 ++++----
 notebooks/notebook_mlflow_plots.py                        | 3 ++-
 ...taset.py => notebook_run_detection_on_eval_dataset.py} | 0
 3 files changed, 6 insertions(+), 5 deletions(-)
 rename notebooks/{notebook_run_detection_on_dataset.py => notebook_run_detection_on_eval_dataset.py} (100%)

diff --git a/notebooks/notebook_combine_detections.py b/notebooks/notebook_combine_detections.py
index 68b8af39..61f6648a 100644
--- a/notebooks/notebook_combine_detections.py
+++ b/notebooks/notebook_combine_detections.py
@@ -26,7 +26,7 @@
 # %matplotlib widget
 
 # %%
-# Helper function for plotting and saving ensemble detections
+# Helper function
 
 
 def plot_and_save_ensemble_detections(
@@ -135,10 +135,10 @@ def plot_and_save_ensemble_detections(
 
 # dataset
 dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
-annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
-full_gt_annotations_file = (
-    annotations_dir / "VIA_JSON_combined_coco_gen_sorted_imageIDs.json"
+full_gt_annotations_file = Path(
+    "/home/sminano/swc/project_ethology/large_annotations/VIA_JSON_combined_coco_gen_sorted_imageIDs.json"
 )
+
 image_width = 4096  # pixels
 image_height = 2160  # pixels
 
diff --git a/notebooks/notebook_mlflow_plots.py b/notebooks/notebook_mlflow_plots.py
index 399993ba..b6b58ddf 100644
--- a/notebooks/notebook_mlflow_plots.py
+++ b/notebooks/notebook_mlflow_plots.py
@@ -130,7 +130,8 @@
 
 # %%
 # Prepare data
-
+# run_slurm_1103832_0_17_val_set_full -- evaluated using corrected full GT annotations
+# (with 0-based image ID)
 csv_file = Path(
     "/home/sminano/swc/project_ethology/figs_subset_annotations/run_slurm_1103832_0_17_val_set_full.csv"
 )
diff --git a/notebooks/notebook_run_detection_on_dataset.py b/notebooks/notebook_run_detection_on_eval_dataset.py
similarity index 100%
rename from notebooks/notebook_run_detection_on_dataset.py
rename to notebooks/notebook_run_detection_on_eval_dataset.py

From 7aa9747196f6bb1f452cbc29a7a2f65b321f50bc Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 23 Jul 2025 11:20:37 +0100
Subject: [PATCH 30/72] Add IOU assigned to each true positive as output to
 Hungarian algorithm

---
 ethology/detectors/evaluate.py                    | 5 ++++-
 notebooks/notebook_combine_detections.py          | 2 +-
 notebooks/notebook_evaluate_binned_performance.py | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/ethology/detectors/evaluate.py b/ethology/detectors/evaluate.py
index 4488273c..380449d6 100644
--- a/ethology/detectors/evaluate.py
+++ b/ethology/detectors/evaluate.py
@@ -40,6 +40,8 @@ def evaluate_detections_hungarian(
     matched_gts = np.zeros(len(gt_bboxes), dtype=bool)
     missed_detections = np.zeros(len(gt_bboxes), dtype=bool)  # unmatched gts
 
+    true_positives_iou = np.zeros(len(pred_bboxes), dtype=float)
+
     # cast as a tensor if not already
     if not isinstance(pred_bboxes, torch.Tensor):
         pred_bboxes = torch.tensor(pred_bboxes, dtype=torch.float32)
@@ -60,6 +62,7 @@ def evaluate_detections_hungarian(
             if iou_matrix[pred_idx, gt_idx] > iou_threshold:
                 true_positives[pred_idx] = True
                 matched_gts[gt_idx] = True
+                true_positives_iou[pred_idx] = iou_matrix[pred_idx, gt_idx]
             else:
                 false_positives[pred_idx] = True
 
@@ -76,4 +79,4 @@ def evaluate_detections_hungarian(
         # No ground truth, all predictions are false positives
         false_positives[:] = True
 
-    return true_positives, false_positives, missed_detections
+    return true_positives, false_positives, missed_detections, true_positives_iou
diff --git a/notebooks/notebook_combine_detections.py b/notebooks/notebook_combine_detections.py
index 61f6648a..e5983a8c 100644
--- a/notebooks/notebook_combine_detections.py
+++ b/notebooks/notebook_combine_detections.py
@@ -349,7 +349,7 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
     }
 
     # compute precision per frame
-    tp, fp, md = evaluate_detections_hungarian(
+    tp, fp, md, _ = evaluate_detections_hungarian(
         ensemble_x1_y1_x2_y2, annots["boxes"], iou_threshold_precision
     )
 
diff --git a/notebooks/notebook_evaluate_binned_performance.py b/notebooks/notebook_evaluate_binned_performance.py
index 1632ad01..f96e81c8 100644
--- a/notebooks/notebook_evaluate_binned_performance.py
+++ b/notebooks/notebook_evaluate_binned_performance.py
@@ -117,7 +117,7 @@ def compute_pred_gt_tables(iou_threshold, ds_predictions, val_dataset):
         gt_bboxes = annotations["boxes"].cpu().numpy()
 
         # Evaluate detections
-        tp, fp, md = evaluate_detections_hungarian(
+        tp, fp, md, _ = evaluate_detections_hungarian(
             pred_bboxes, gt_bboxes, iou_threshold
         )
 

From ce04965a394b98b76efb18c489ad63855213abbd Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 23 Jul 2025 10:20:56 +0000
Subject: [PATCH 31/72] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ethology/detectors/evaluate.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ethology/detectors/evaluate.py b/ethology/detectors/evaluate.py
index 380449d6..ec5df701 100644
--- a/ethology/detectors/evaluate.py
+++ b/ethology/detectors/evaluate.py
@@ -79,4 +79,9 @@ def evaluate_detections_hungarian(
         # No ground truth, all predictions are false positives
         false_positives[:] = True
 
-    return true_positives, false_positives, missed_detections, true_positives_iou
+    return (
+        true_positives,
+        false_positives,
+        missed_detections,
+        true_positives_iou,
+    )

From ed428501771143d39829e2e4df2639c176e08dab Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 24 Jul 2025 17:28:23 +0100
Subject: [PATCH 32/72] Add image width and height as attributes to detections
 dataset

---
 ethology/detectors/inference.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/ethology/detectors/inference.py b/ethology/detectors/inference.py
index 8f990942..1a4b028d 100644
--- a/ethology/detectors/inference.py
+++ b/ethology/detectors/inference.py
@@ -20,7 +20,7 @@ def _detections_per_image_id_as_ds(
     detections_per_image_id: dict,
 ) -> xr.Dataset:
     """Reshape detections per sample as xarray dataset."""
-    # Place tensors on cpu if required
+    # Place any tensors on cpu if required
     if any(
         [
             any(
@@ -145,24 +145,30 @@ def run_detector_on_dataset(
     # Ensure model is in evaluation mode
     model.eval()
 
-    # Run detection
+    # Run detection for each sample in the dataset
     detections_per_image_id = {}
     for image, annotations in dataset:
         # Place image tensor on device and add batch dimension
         image = image.to(device)[None]  # [1, C, H, W]
 
-        # Run detection
         with torch.no_grad():
-            detections = model(image)[0]  # select single batch dimension
+            detections = model(image)
 
         # Add to dict with key = image_id
-        detections_per_image_id[annotations["image_id"]] = detections
+        # [0] to select single batch dimension
+        detections_per_image_id[annotations["image_id"]] = detections[0]
 
     # Format as xarray dataset
     detections_dataset = _detections_per_image_id_as_ds(
         detections_per_image_id
     )
 
+    # Add image_width and image_height as attributes
+    # (image is [1, C, H, W]; we assume all images in the dataset have the
+    # same width and height as the last image)
+    detections_dataset.attrs["image_width"] = image.shape[-1]
+    detections_dataset.attrs["image_height"] = image.shape[-2]
+
     return detections_dataset
 
 

From c6e14d61ca4f808d57c510432f70c98f0f3a876b Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 24 Jul 2025 17:29:05 +0100
Subject: [PATCH 33/72] Load models to cpu

---
 ethology/detectors/load.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ethology/detectors/load.py b/ethology/detectors/load.py
index 55a2804d..78ff34dc 100644
--- a/ethology/detectors/load.py
+++ b/ethology/detectors/load.py
@@ -18,8 +18,11 @@ def load_fasterrcnn_resnet50_fpn_v2(
     )
 
     # load state dict
-    checkpoint = torch.load(trained_model_path)
+    # When you call torch.load() on a file which contains GPU tensors,
+    # those tensors will be loaded to GPU by default.
+    checkpoint = torch.load(trained_model_path, map_location="cpu")
 
+    # Load weights into model
     # if model is saved with model. prefix, remove it
     if any([ky.startswith("model.") for ky in checkpoint["state_dict"]]):
         model_weights = {
@@ -29,8 +32,6 @@ def load_fasterrcnn_resnet50_fpn_v2(
         }
     else:
         model_weights = checkpoint["state_dict"]  # ok?
-
-    # Load weights into model
     model.load_state_dict(model_weights)
 
     # Put model on device if provided

From 94f3ab4bfca06328ee4a885a1d123094ca722f91 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 24 Jul 2025 17:30:37 +0100
Subject: [PATCH 34/72] Add utils to transform detection datasets

---
 ethology/detectors/utils.py | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 ethology/detectors/utils.py

diff --git a/ethology/detectors/utils.py b/ethology/detectors/utils.py
new file mode 100644
index 00000000..5abe4b2f
--- /dev/null
+++ b/ethology/detectors/utils.py
@@ -0,0 +1,8 @@
+"""Utility functions for transforming detection datasets."""
+
+
+def add_bboxes_min_max_corners(ds):
+    """Add xy_min and xy_max arrays to ds."""
+    ds["xy_min"] = ds.position - 0.5 * ds.shape
+    ds["xy_max"] = ds.position + 0.5 * ds.shape
+    return ds

From 97f01362f53cc621e01f25ea94f1dad537b7a607 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 24 Jul 2025 17:31:00 +0100
Subject: [PATCH 35/72] Notebook to run an ensemble on a dataset

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 410 ++++++++++++++++++
 1 file changed, 410 insertions(+)
 create mode 100644 notebooks/notebook_run_ensemble_on_eval_dataset.py

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
new file mode 100644
index 00000000..1f6ca9bd
--- /dev/null
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -0,0 +1,410 @@
+# %%
+import json
+from datetime import datetime
+from pathlib import Path
+
+import cv2
+import numpy as np
+import torch
+import torchvision.transforms.v2 as transforms
+import xarray as xr
+from ensemble_boxes import weighted_boxes_fusion
+from torch.utils.data import random_split
+
+from ethology.ethology.detectors.utils import add_bboxes_min_max_corners
+from ethology.datasets.create import create_coco_dataset
+from ethology.detectors.inference import (
+    _detections_per_image_id_as_ds,
+    run_detector_on_dataset,
+)
+from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
+from ethology.mlflow import (
+    read_cli_args_from_mlflow_params,
+    read_config_from_mlflow_params,
+    read_mlflow_params,
+)
+
+# Set xarray options
+xr.set_options(display_expand_attrs=False)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Input data
+
+dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
+annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
+annotations_file_path = (
+    annotations_dir / "VIA_JSON_combined_coco_gen_sorted_imageIDs.json"
+)
+
+experiment_ID = "617393114420881798"
+ml_runs_experiment_dir = (
+    Path("/home/sminano/swc/project_crabs/ml-runs") / experiment_ID
+)
+
+# I pick seed 42 for each set of models
+models_dict = {
+    # "above_0th": ml_runs_experiment_dir / "f348d9d196934073bece1b877cbc4d38",
+    "above_1st": ml_runs_experiment_dir / "879d2f77e2b24adcb06b87d2fede6a04",
+    "above_5th": ml_runs_experiment_dir / "75583ec227e3444ab692b99c64795325",
+    "above_10th": ml_runs_experiment_dir / "4acc37206b1e4f679d535c837bee2c2f",
+    "above_25th": ml_runs_experiment_dir / "fdcf88fcbcc84fbeb94b45ca6b6f8914",
+    "above_50th": ml_runs_experiment_dir / "daa05ded0ea047388c9134bf044061c5",
+}
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Set default device: CUDA if available, otherwise mps, otherwise CPU
+device = torch.device(
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+
+print(f"Using device: {device}")
+
+
+# %%
+# Helper functions
+def split_dataset_crab_repo(dataset_coco, seed_n, config):
+    """Split dataset like in crabs repo."""
+    # Split data into train and test-val sets
+    rng_train_split = torch.Generator().manual_seed(seed_n)
+    rng_val_split = torch.Generator().manual_seed(seed_n)
+
+    # Split train and test-val sets
+    train_dataset, test_val_dataset = random_split(
+        dataset_coco,
+        [config["train_fraction"], 1 - config["train_fraction"]],
+        generator=rng_train_split,
+    )
+
+    # Split test/val sets from the remainder
+    test_dataset, val_dataset = random_split(
+        test_val_dataset,
+        [
+            1 - config["val_over_test_fraction"],
+            config["val_over_test_fraction"],
+        ],
+        generator=rng_val_split,
+    )
+
+    print(f"Seed: {seed_n}")
+    print(f"Number of training samples: {len(train_dataset)}")  # images
+    print(f"Number of validation samples: {len(val_dataset)}")  # images
+    print(f"Number of test samples: {len(test_dataset)}")  # images
+
+    return train_dataset, val_dataset, test_dataset
+
+
+def plot_and_save_ensemble_detections(
+    image,
+    gt_boxes_x1_y1_x2_y2,
+    pred_boxes_x1_y1_x2_y2,
+    pred_boxes_scores,
+    image_id,
+    output_dir,
+    precision,
+    recall,
+):
+    """Plot ground truth and ensemble detections on image and save as PNG."""
+    # Convert tensor to numpy array and transpose from (C, H, W) to (H, W, C)
+    # Convert from float [0,1] to uint8 [0,255] for OpenCV
+    # Convert from RGB to BGR for OpenCV
+    image_cv = image.permute(1, 2, 0).numpy()
+    image_cv = (image_cv * 255).astype(np.uint8)
+    image_cv = cv2.cvtColor(image_cv, cv2.COLOR_RGB2BGR)
+
+    # plot GT annotations as green boxes
+    for gt_box in gt_boxes_x1_y1_x2_y2:
+        x1, y1, x2, y2 = gt_box.cpu().numpy().astype(int)
+        cv2.rectangle(
+            image_cv,
+            (x1, y1),
+            (x2, y2),
+            (0, 255, 0),  # Green color in BGR
+            2,  # Line thickness
+        )
+
+    # remove nan predictions (one shared mask keeps boxes and scores aligned)
+    slc_valid = ~np.any(np.isnan(pred_boxes_x1_y1_x2_y2), axis=1)
+    slc_valid &= ~np.isnan(pred_boxes_scores)
+    pred_boxes_x1_y1_x2_y2 = pred_boxes_x1_y1_x2_y2[slc_valid]
+    pred_boxes_scores = pred_boxes_scores[slc_valid]
+
+    # plot ensemble detections as red boxes
+    for pred_box, confidence in zip(
+        pred_boxes_x1_y1_x2_y2, pred_boxes_scores, strict=True
+    ):
+        x1, y1, x2, y2 = pred_box.astype(int)
+
+        cv2.rectangle(
+            image_cv,
+            (x1, y1),
+            (x2, y2),
+            (0, 0, 255),  # Red color in BGR
+            2,  # Line thickness
+        )
+
+        # add text with confidence score
+        text = f"{confidence:.2f}"
+        cv2.putText(
+            image_cv,
+            text,
+            (x1, y1 - 3),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.5,  # Font scale
+            (0, 0, 255),  # Red color in BGR
+            2,  # Line thickness
+        )
+
+    # add text with precision and recall to bottom right corner
+    cv2.putText(
+        image_cv,
+        f"Precision: {precision:.2f}",
+        (image_cv.shape[1] - 400, image_cv.shape[0] - 100),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        1.5,  # Font scale
+        (0, 0, 255),  # Red color in BGR
+        4,  # Line thickness
+    )
+    cv2.putText(
+        image_cv,
+        f"Recall: {recall:.2f}",
+        (image_cv.shape[1] - 400, image_cv.shape[0] - 50),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        1.5,  # Font scale
+        (0, 0, 255),  # Red color in BGR
+        4,  # Line thickness
+    )
+
+    # Save the image as PNG
+    output_filename = output_dir / f"val_set_{image_id:06d}.png"
+    cv2.imwrite(str(output_filename), image_cv)
+    print(f"Saved: {output_filename}")
+
+    return image_cv
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Define list of models in ensemble
+
+list_models = []
+list_config = []
+list_cli_args = []
+for model_key in models_dict:
+    # Retrieve model config and CLI args from mlflow
+    trained_model_path = str(
+        models_dict[model_key] / "checkpoints" / "last.ckpt"
+    )
+
+    mlflow_params = read_mlflow_params(trained_model_path)
+    config = read_config_from_mlflow_params(mlflow_params)
+    cli_args = read_cli_args_from_mlflow_params(mlflow_params)
+
+    # ------------------------------------
+    # Load model
+    model = load_fasterrcnn_resnet50_fpn_v2(
+        trained_model_path,
+        num_classes=config["num_classes"],
+        device=None,  # device
+    )
+    model.eval()
+    list_models.append(model)
+    list_config.append(config)
+    list_cli_args.append(cli_args)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Check that all models have the same dataset config
+ref_config = list_config[0]
+for key in ["train_fraction", "val_over_test_fraction"]:
+    assert all(config[key] == ref_config[key] for config in list_config)
+
+ref_cli_args = list_cli_args[0]
+assert all(
+    cli_args["seed_n"] == ref_cli_args["seed_n"] for cli_args in list_cli_args
+)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Define common dataset for ensemble
+
+# Define transforms for inference
+inference_transforms = transforms.Compose(
+    [
+        transforms.ToImage(),
+        transforms.ToDtype(torch.float32, scale=True),
+    ]
+)
+
+# Create COCO dataset
+dataset_coco = create_coco_dataset(
+    images_dir=Path(dataset_dir) / "frames",
+    annotations_file=annotations_file_path,
+    composed_transform=inference_transforms,
+)
+
+# Split dataset like in crabs repo
+train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
+    dataset_coco,
+    seed_n=ref_cli_args["seed_n"],
+    config=ref_config,  # only uses train_fraction and val_over_test_fraction
+)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Compute detections per model
+
+list_detections_ds = []
+for model in list_models:
+    model.to(device)
+    detections_ds = run_detector_on_dataset(
+        model=model,
+        dataset=val_dataset,
+        device=device,
+    )
+    detections_ds = add_bboxes_min_max_corners(detections_ds)  
+    list_detections_ds.append(detections_ds)
+
+
+# %%
+# Combine detections
+# can I avoid double loop?
+# should i use dataloader here?
+
+# Define parameters for WBF
+iou_thr_ensemble = 0.5
+skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
+image_height, image_width,  = val_dataset[0][0].shape[-2:]
+
+list_image_ids = [annot["image_id"] for img, annot in val_dataset]
+
+
+detections_per_image_id = {}
+for image_id in list_image_ids:
+
+    # Get detections for current image across all models
+    list_ds_per_model = [ds.sel(image_id=image_id) for ds in list_detections_ds]
+
+    # Prepare inputs for WBF
+    list_nan_confidence = [ds.confidence.isnull() for ds in list_ds_per_model]
+
+    list_bboxes_x1y1_x2y2_norm = [
+        np.hstack(
+            [
+                ds["xy_min"].T / np.array([image_width, image_height]),
+                ds["xy_max"].T / np.array([image_width, image_height]),
+            ]
+        )[~slc_nan, :]  # remove nan annotations
+        for ds, slc_nan in zip(
+            list_ds_per_model, list_nan_confidence, strict=True
+        )
+    ]
+
+    list_scores = [
+        ds.confidence.to_numpy().T[~slc_nan]
+        for ds, slc_nan in zip(
+            list_ds_per_model, list_nan_confidence, strict=True
+        )
+    ]
+
+    list_labels = [
+        ds.label.to_numpy().T[~slc_nan]
+        for ds, slc_nan in zip(
+            list_ds_per_model, list_nan_confidence, strict=True
+        )
+    ]
+
+    # Run WBF
+    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
+        weighted_boxes_fusion(
+            list_bboxes_x1y1_x2y2_norm,  # n_models, n_predictions, 4
+            list_scores,  # n_models, n_predictions
+            list_labels,
+            iou_thr=iou_thr_ensemble,
+            skip_box_thr=skip_box_thr,
+        )
+    )
+
+    # Remove rows with nan coordinates
+    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2_norm), axis=1)
+    ensemble_x1y1_x2y2_norm = ensemble_x1y1_x2y2_norm[~slc_nan_rows]
+    ensemble_scores = ensemble_scores[~slc_nan_rows]
+    ensemble_labels = ensemble_labels[~slc_nan_rows]
+
+    # Undo normalization
+    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
+        np.array([image_width, image_height]), (1, 2)
+    )
+
+    # Add to dict with key = image_id
+    detections_per_image_id[image_id] = {
+        "boxes": ensemble_x1y1_x2y2,
+        "scores": ensemble_scores,
+        "labels": ensemble_labels,
+    }
+
+# %%
+# Format as xarray dataset
+ensemble_detections_ds = _detections_per_image_id_as_ds(
+    detections_per_image_id
+)
+
+# %%
+# Evaluate detections with hungarian
+
+# ensemble_detections_ds = add_bboxes_min_max_corners(ensemble_detections_ds)
+
+# add tp, fp, tp_iou as arrays to dataset?
+# tp, fp, md, _ = evaluate_detections_hungarian(
+#         ensemble_x1_y1_x2_y2, annots["boxes"], iou_threshold_precision
+#     )
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# plot ensemble detections on a sample image
+
+# Get image and ground-truth annotations for the selected sample
+image_index = 25
+image = val_dataset[image_index][0]
+gt_annotations = val_dataset[image_index][1]
+
+
+plot_and_save_ensemble_detections(
+    image=image,
+    gt_boxes_x1_y1_x2_y2=gt_annotations["boxes"],
+    pred_boxes_x1_y1_x2_y2=np.hstack(
+        [
+            ensemble_detections_ds[xy_corner_str].isel(image_id=image_index).values.T
+            for xy_corner_str in ["xy_min", "xy_max"]
+        ]
+    ),
+    pred_boxes_scores=ensemble_detections_ds.isel(
+        image_id=image_index
+    ).confidence.values,
+    image_id=gt_annotations["image_id"],
+    output_dir=Path.cwd(),
+    precision=0.0,
+    recall=0.0,
+)
+
+# %%
+# Combine detections with WBF
+# detections_ds = run_ensemble_of_detectors_on_dataset(
+#     list_models,
+#     dataset,  # could be list too
+#     device,   # ensure models and dataset are placed on this device?
+#     ensemble_boxes_method="wbf",
+#     **ensemble_boxes_kwargs,
+# )
+
+
+# detections_ds = run_ensemble_of_detectors_on_dataloader(
+#     list_models,
+#     dataset,  # could be list too
+#     device,   # ensure models and dataset are placed on this device?
+#     ensemble_boxes_method="wbf",
+#     **ensemble_boxes_kwargs,
+# )
+
+

From 29733f0f86f6e56dcccc497ed115dee788c31b47 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 25 Jul 2025 14:52:56 +0100
Subject: [PATCH 36/72] Fix import

---
 notebooks/notebook_run_ensemble_on_eval_dataset.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 1f6ca9bd..d2f76100 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -1,6 +1,4 @@
 # %%
-import json
-from datetime import datetime
 from pathlib import Path
 
 import cv2
@@ -11,13 +9,13 @@
 from ensemble_boxes import weighted_boxes_fusion
 from torch.utils.data import random_split
 
-from ethology.ethology.detectors.utils import add_bboxes_min_max_corners
 from ethology.datasets.create import create_coco_dataset
 from ethology.detectors.inference import (
     _detections_per_image_id_as_ds,
     run_detector_on_dataset,
 )
 from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
+from ethology.detectors.utils import add_bboxes_min_max_corners
 from ethology.mlflow import (
     read_cli_args_from_mlflow_params,
     read_config_from_mlflow_params,

From fd5014aa83a68174b5d343cb4583be1d41953875 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 28 Jul 2025 20:12:00 +0100
Subject: [PATCH 37/72] Concatenate detection datasets per image

---
 ethology/detectors/inference.py | 198 +++++++-------------------------
 ethology/detectors/utils.py     | 125 +++++++++++++++++++-
 2 files changed, 167 insertions(+), 156 deletions(-)

diff --git a/ethology/detectors/inference.py b/ethology/detectors/inference.py
index 1a4b028d..43a9b521 100644
--- a/ethology/detectors/inference.py
+++ b/ethology/detectors/inference.py
@@ -1,143 +1,25 @@
 """Inference utilities for detectors."""
 
-import numpy as np
+import pandas as pd
 import torch
-import xarray as xr
-
-
-def _pad_sequence_along_detections_dim(
-    array: np.ndarray, max_n_detections_per_image: int
-) -> tuple:
-    """Return sequence for padding input array along detections dimension."""
-    pad_detections_per_image = max_n_detections_per_image - array.shape[0]
-    return tuple(
-        (0, pad_detections_per_image) if i == 0 else (0, 0)
-        for i in range(array.ndim)
-    )
-
-
-def _detections_per_image_id_as_ds(
-    detections_per_image_id: dict,
-) -> xr.Dataset:
-    """Reshape detections per sample as xarray dataset."""
-    # Place any tensors on cpu if required
-    if any(
-        [
-            any(
-                isinstance(detections[key], torch.Tensor) for key in detections
-            )
-            for detections in detections_per_image_id.values()
-        ]
-    ):
-        detections_per_image_id = {
-            image_id: {
-                key: value.cpu().numpy()
-                if isinstance(value, torch.Tensor)
-                else value
-                for key, value in detections.items()
-            }
-            for image_id, detections in detections_per_image_id.items()
-        }
-
-    # Get coordinates
-    list_image_id_coords = list(detections_per_image_id.keys())
-    list_space_coords = ["x", "y"]
-    max_n_detections_per_image = max(
-        [
-            detections["boxes"].shape[0]
-            for detections in detections_per_image_id.values()
-        ]
-    )
-
-    list_id_coords = list(range(max_n_detections_per_image))  # per frame
-    coords_dict = {
-        "image_id": list_image_id_coords,
-        "space": list_space_coords,
-        "id": list_id_coords,  # per frame
-    }
-    coords_dict_no_space = coords_dict.copy()
-    del coords_dict_no_space["space"]
-
-    # Get lists of data arrays
-    list_centroid_arrays = [
-        (detections["boxes"][:, 0:2] + detections["boxes"][:, 2:4]) * 0.5
-        for detections in detections_per_image_id.values()
-    ]
-
-    list_shape_arrays = [
-        detections["boxes"][:, 2:4] - detections["boxes"][:, 0:2]
-        for detections in detections_per_image_id.values()
-    ]
-
-    list_confidence_arrays = [
-        detections["scores"]  # .reshape(-1, 1)
-        for detections in detections_per_image_id.values()
-    ]
-
-    list_label_arrays = [
-        detections["labels"]  # .reshape(-1, 1)
-        for detections in detections_per_image_id.values()
-    ]
-
-    # Define arrays to create
-    arrays_dict = {
-        "position": {  # --> before: centroids
-            "data": list_centroid_arrays,
-            "coords": coords_dict,
-            "pad_value": np.nan,
-        },
-        "shape": {
-            "data": list_shape_arrays,
-            "coords": coords_dict,
-            "pad_value": np.nan,
-        },
-        "confidence": {
-            "data": list_confidence_arrays,
-            "coords": coords_dict_no_space,
-            "pad_value": np.nan,
-        },
-        "label": {
-            "data": list_label_arrays,
-            "coords": coords_dict_no_space,
-            "pad_value": -1,
-        },
-    }
-
-    # Create all DataArrays in a loop
-    data_arrays = {}
-    for name in arrays_dict:
-        data_arrays[name] = xr.DataArray(
-            data=np.stack(
-                [
-                    np.pad(
-                        array,
-                        _pad_sequence_along_detections_dim(
-                            array, max_n_detections_per_image
-                        ),
-                        mode="constant",
-                        constant_values=arrays_dict[name]["pad_value"],
-                    ).T
-                    for array in arrays_dict[name]["data"]
-                ],
-                axis=0,  # need to pad with nans for constant shape
-            ),
-            dims=list(arrays_dict[name]["coords"].keys()),
-            coords=arrays_dict[name]["coords"],
-        )
-
-    return xr.Dataset(data_vars=data_arrays)
+
+from ethology.detectors.utils import (
+    concat_detections_ds,
+    detections_dict_as_ds,
+)
 
 
 def run_detector_on_dataset(
     model: torch.nn.Module,
     dataset: torch.utils.data.Dataset,  # dataloader instead?
     device: torch.device,
+    # store_sparse: bool = False,
 ) -> dict:
     """Run detection on each sample of a dataset.
 
     Note that the dataset transforms are applied to the sampled images.
-    The output is a dictionary with the detections per image_id as a dictionary.
-    The detections dictionary has the following keys:
+    The output is a dictionary with the detections per image_id as a
+    dictionary. The detections dictionary has the following keys:
     - "boxes": tensor of shape [N, 4]
     - "scores": tensor of shape [N]
     - "labels": tensor of shape [N]
@@ -146,7 +28,8 @@ def run_detector_on_dataset(
     model.eval()
 
     # Run detection for each sample in the dataset
-    detections_per_image_id = {}
+    list_detections_ds = []
+    list_image_ids = []
     for image, annotations in dataset:
         # Place image tensor on device and add batch dimension
         image = image.to(device)[None]  # [1, C, H, W]
@@ -154,20 +37,25 @@ def run_detector_on_dataset(
         with torch.no_grad():
             detections = model(image)
 
-        # Add to dict with key = image_id
+        # Format as xarray dataset
         # [0] to select single batch dimension
-        detections_per_image_id[annotations["image_id"]] = detections[0]
+        detections_ds = detections_dict_as_ds(detections[0])
+
+        # Append to list
+        list_detections_ds.append(detections_ds)
+        list_image_ids.append(annotations["image_id"])
 
-    # Format as xarray dataset
-    detections_dataset = _detections_per_image_id_as_ds(
-        detections_per_image_id
-    )
+    # Concatenate all detections datasets along image_id dimension
+    detections_dataset = concat_detections_ds(
+        list_detections_ds,
+        pd.Index(list_image_ids, name="image_id"),
+    )  # [image_id, model, annot_id]
 
     # Add image_width and image_height as attributes
     # (we assume all images in the dataset have the same width and height
     # as the last image)
-    detections_dataset.attrs["image_width"] = image.shape[-2]
-    detections_dataset.attrs["image_height"] = image.shape[-1]
+    detections_dataset.attrs["image_width"] = image.shape[-1]  # columns
+    detections_dataset.attrs["image_height"] = image.shape[-2]  # rows
 
     return detections_dataset
 
@@ -176,7 +64,7 @@ def run_detector_on_dataloader(
     model: torch.nn.Module,
     dataloader: torch.utils.data.DataLoader,
     device: torch.device,
-) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+) -> dict:
     """Run detection on a dataloader.
 
     The output is a dictionary with the detections per batch as a list.
@@ -211,28 +99,28 @@ def run_detector_on_dataloader(
     return detections_per_batch
 
 
-def collate_fn_varying_n_bboxes(batch: tuple) -> tuple:
-    """Collate function for dataloader with varying number of bounding boxes.
+# def collate_fn_varying_n_bboxes(batch: tuple) -> tuple:
+#     """Collate function for dataloader with varying number of bounding boxes.
 
-    A custom function is needed for detection
-    because the number of bounding boxes varies
-    between images of the same batch.
-    See https://pytorch.org/vision/main/auto_examples/transforms/plot_transforms_e2e.html#data-loading-and-training-loop
+#     A custom function is needed for detection
+#     because the number of bounding boxes varies
+#     between images of the same batch.
+#     See https://pytorch.org/vision/main/auto_examples/transforms/plot_transforms_e2e.html#data-loading-and-training-loop
 
-    Parameters
-    ----------
-    batch : tuple
-        a tuple of 2 tuples, the first one holding all images in the batch,
-        and the second one holding the corresponding annotations.
+#     Parameters
+#     ----------
+#     batch : tuple
+#         a tuple of 2 tuples, the first one holding all images in the batch,
+#         and the second one holding the corresponding annotations.
 
-    Returns
-    -------
-    tuple
-        a tuple of length = batch size, made up of (image, annotations)
-        tuples.
+#     Returns
+#     -------
+#     tuple
+#         a tuple of length = batch size, made up of (image, annotations)
+#         tuples.
 
-    """
-    return tuple(zip(*batch, strict=False))
+#     """
+#     return tuple(zip(*batch, strict=False))
 
 
 # def run_detector_on_image(
diff --git a/ethology/detectors/utils.py b/ethology/detectors/utils.py
index 5abe4b2f..7ae0b582 100644
--- a/ethology/detectors/utils.py
+++ b/ethology/detectors/utils.py
@@ -1,8 +1,131 @@
 """Utility functions for transforming detection datasets."""
 
+import numpy as np
+import pandas as pd
+import torch
+import xarray as xr
+
+
+def concat_detections_ds(
+    list_detections_ds: list[xr.Dataset], index: pd.Index
+) -> xr.Dataset:
+    """Concatenate detections datasets along new dimension."""
+    # Check index has name
+    if index.name is None:
+        raise ValueError("Index must have a name")
+
+    # Concatenate along new dimension
+    ds = xr.concat(
+        list_detections_ds,
+        index,
+    )
+
+    # ensure "label" array is padded with -1 rather than nan
+    if "label" in ds.data_vars:
+        ds["label"] = ds.label.fillna(-1).astype(int)
+
+    return ds
+
+
+def detections_dict_as_ds(detections: dict) -> xr.Dataset:
+    """Reshape detections dictionaryas xarray dataset.
+
+    Input is detections dictionary with keys:
+    - "boxes": tensor of shape [N, 4], x1y1x2y2 in pixels
+    - "scores": tensor of shape [N]
+    - "labels": tensor of shape [N]
+
+    Output is xarray dataset with keys:
+    - "position": xarray.DataArray of shape [2, N] (space, annot_id)
+    - "shape": xarray.DataArray of shape [2, N] (space, annot_id)
+    - "confidence": xarray.DataArray of shape [N] (annot_id)
+    - "label": xarray.DataArray of shape [N] (annot_id)
+    """
+    # Place tensors on cpu if required & convert to numpy array
+    detections = {
+        key: value.cpu().numpy() if isinstance(value, torch.Tensor) else value
+        for key, value in detections.items()
+    }
+
+    return detections_x1y1_x2y2_as_ds(
+        detections["boxes"],
+        detections["scores"],
+        detections["labels"],
+    )
+
+
+def detections_x1y1_x2y2_as_ds(
+    x1y1_x2y2_array: np.ndarray,
+    scores_array: np.ndarray,
+    labels_array: np.ndarray,
+) -> xr.Dataset:
+    """Reshape detections array as xarray dataset.
+
+    Input is detections array with shape [N, 4], x1y1x2y2 in pixels
+    """
+    # Remove nan rows
+    slc_nan_rows = np.any(np.isnan(x1y1_x2y2_array), axis=1)
+    x1y1_x2y2_array = x1y1_x2y2_array[~slc_nan_rows]
+    scores_array = scores_array[~slc_nan_rows]
+    labels_array = labels_array[~slc_nan_rows]
+
+    # Create xarray dataset
+    n_detections = x1y1_x2y2_array.shape[0]
+    centroid_da = xr.DataArray(
+        data=0.5
+        * (
+            x1y1_x2y2_array[:, 0:2] + x1y1_x2y2_array[:, 2:4]
+        ).T,  # space, annot ID
+        dims=["space", "id"],
+        coords={
+            "space": ["x", "y"],
+            "id": list(range(n_detections)),
+        },
+    )
+
+    shape_da = xr.DataArray(
+        data=(
+            x1y1_x2y2_array[:, 2:4] - x1y1_x2y2_array[:, 0:2]
+        ).T,  # space, annot ID
+        dims=["space", "id"],
+        coords={
+            "space": ["x", "y"],
+            "id": list(range(n_detections)),
+        },
+    )
+
+    confidence_da = xr.DataArray(
+        data=scores_array,
+        dims=["id"],
+        coords={"id": list(range(n_detections))},
+    )
+
+    label_da = xr.DataArray(
+        data=labels_array,
+        dims=["id"],
+        coords={"id": list(range(n_detections))},
+    )
+
+    return xr.Dataset(
+        data_vars={
+            "position": centroid_da,
+            "shape": shape_da,
+            "confidence": confidence_da,
+            "label": label_da,
+        }
+    )
+
 
 def add_bboxes_min_max_corners(ds):
-    """Add xy_min and xy_max arrays to ds."""
+    """Add xy_min and xy_max arrays to ds.
+
+    # Compare to box_convert in testing?
+    box_convert(
+        torch.from_numpy(np.c_[ds.position.T, ds.shape.T]),
+        in_fmt="cxcywh",
+        out_fmt="xyxy",
+    )
+    """
     ds["xy_min"] = ds.position - 0.5 * ds.shape
     ds["xy_max"] = ds.position + 0.5 * ds.shape
     return ds

From f4f8705bc703ece5a25fbf292e8ae7eeca7a7414 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 28 Jul 2025 20:13:29 +0100
Subject: [PATCH 38/72] Exploring how to vectorise datasets/data arrays WIP

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 264 +++++++++++++++---
 1 file changed, 224 insertions(+), 40 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index d2f76100..e111e8e8 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -3,15 +3,17 @@
 
 import cv2
 import numpy as np
+import pandas as pd
 import torch
 import torchvision.transforms.v2 as transforms
 import xarray as xr
 from ensemble_boxes import weighted_boxes_fusion
 from torch.utils.data import random_split
+from tqdm import tqdm
 
 from ethology.datasets.create import create_coco_dataset
 from ethology.detectors.inference import (
-    _detections_per_image_id_as_ds,
+    concat_detections_ds,
     run_detector_on_dataset,
 )
 from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
@@ -251,69 +253,251 @@ def plot_and_save_ensemble_detections(
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Compute detections per model
-
+# Compute detections per model -- make it faster
+# can I vectorize this?
+# use dataloader instead?
 list_detections_ds = []
-for model in list_models:
+for model in tqdm(list_models):
     model.to(device)
     detections_ds = run_detector_on_dataset(
         model=model,
         dataset=val_dataset,
         device=device,
     )
-    detections_ds = add_bboxes_min_max_corners(detections_ds)  
+    detections_ds = add_bboxes_min_max_corners(detections_ds)
     list_detections_ds.append(detections_ds)
 
 
+# Concatenate detections across models
+all_models_detections_ds = concat_detections_ds(
+    list_detections_ds,
+    pd.Index(range(len(list_detections_ds)), name="model"),
+)
+
+
+# %%
+
+
+def wbf_arrays_one_img(
+    x1y1_x2y2_norm_one_img: xr.DataArray,  # ("model", "id", "space")
+    scores_one_img: xr.DataArray,
+    labels_one_img: xr.DataArray,
+    iou_thr_ensemble,
+    skip_box_thr,
+):
+    print(x1y1_x2y2_norm_one_img.data.shape)  # n_models, annot_id, space
+    print(scores_one_img.data.shape)  # n_models, annot_id
+    print(labels_one_img.data.shape)  # n_models, annot_id
+
+    # Run WBF
+    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
+        weighted_boxes_fusion(
+            x1y1_x2y2_norm_one_img,
+            scores_one_img,
+            labels_one_img,
+            iou_thr=iou_thr_ensemble,
+            skip_box_thr=skip_box_thr,
+        )
+    )
+
+    #
+
+    return xr.DataArray(
+        data=(ensemble_x1y1_x2y2_norm * np.tile(image_width_height, (1, 2))).T,
+        dims=["space", "id"],
+        coords={
+            "space": ["x", "y", "x", "y"],
+            "id": list(range(ensemble_x1y1_x2y2_norm.shape[0])),
+        },
+    )
+
+    # # Format as xarray dataset
+    # # Undo x1y1 x2y2 normalization!
+    # ensemble_detections_ds = detections_x1y1_x2y2_as_ds(
+    #     ensemble_x1y1_x2y2_norm * np.tile(image_width_height, (1, 2)),
+    #     ensemble_scores,
+    #     ensemble_labels,
+    # )
+
+    # return ensemble_detections_ds
+
+
+# %%
+iou_thr_ensemble = 0.5
+skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
+
+sel_id = 193
+
+# compute x1y1_x2y2_norm
+image_width_height = np.array(
+    [
+        all_models_detections_ds.attrs["image_width"],
+        all_models_detections_ds.attrs["image_height"],
+    ]
+)
+x1y1x2y2_norm = (
+    xr.concat(
+        [
+            all_models_detections_ds["xy_min"],
+            all_models_detections_ds["xy_max"],
+        ],
+        dim="space",
+    )
+    / np.tile(image_width_height, (1, 2))[None, :, :, None]
+)  # model, image_id, 4, (annot) ID
+
+x1y1x2y2_ensemble = wbf_arrays_one_img(
+    x1y1x2y2_norm.sel(image_id=sel_id).transpose("model", "id", "space").data,
+    all_models_detections_ds.confidence.sel(image_id=sel_id).data,
+    all_models_detections_ds.label.sel(image_id=sel_id).data,
+    iou_thr_ensemble=iou_thr_ensemble,
+    skip_box_thr=skip_box_thr,
+)
+
+print(x1y1x2y2_ensemble.shape)
+
+
+# %%
+def test(
+    image_id,
+    x1y1x2y2_normalised,
+    confidence,
+    label,
+):
+    print(image_id)
+    print(image_id.shape)
+    print("---")
+    iou_thr_ensemble = 0.5
+    skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
+
+    x1y1x2y2_ensemble = wbf_arrays_one_img(
+        x1y1x2y2_normalised,  # .sel(image_id=image_id).transpose("model", "id", "space"),
+        confidence,
+        label,
+        iou_thr_ensemble=iou_thr_ensemble,
+        skip_box_thr=skip_box_thr,
+    )
+
+    print(x1y1x2y2_ensemble.shape)
+    # n_detections_in = x1y1x2y2_normalised.shape[1]
+    # print(n_detections_in)
+    print("<----->")
+
+    # return x1y1x2y2_ensemble 
+    # ---> could not broadcast input array from shape (4,115) into shape (4,112)
+
+
+    # To have constant output dimension
+    return np.pad(
+        x1y1x2y2_ensemble,
+        ((0, 0), (0, 300 - x1y1x2y2_ensemble.shape[1])),
+        "constant",
+        constant_values=np.nan,
+    )
+
+    # To have same dimensions as the input
+    # n_detections_in = x1y1x2y2_normalised.shape[1]
+    # n_detections_diff = (
+    #     x1y1x2y2_normalised.shape[1] - x1y1x2y2_ensemble.shape[1]
+    # )
+    # if n_detections_diff > 0:
+    #     return np.pad(
+    #         x1y1x2y2_ensemble,
+    #         ((0, 0), (0, n_detections_diff)),
+    #         "constant",
+    #         constant_values=np.nan,
+    #     )
+    # else:   
+    #     return x1y1x2y2_ensemble[:, :n_detections_in]
+    
+
+    
+# %%
+
+x1y1_x2y2_fused = xr.apply_ufunc(
+    test,
+    all_models_detections_ds.image_id,
+    x1y1x2y2_norm.transpose("model", "id", "space", "image_id"),
+    all_models_detections_ds.confidence.transpose("model", "id", "image_id"),
+    all_models_detections_ds.label.transpose("model", "id", "image_id"),
+    input_core_dims=[
+        [],
+        ["model", "id", "space"],
+        ["model", "id"],
+        ["model", "id"],
+    ],
+    output_core_dims=[["space", "id_out"]],
+    vectorize=True,
+    exclude_dims={"id"},  # to allow dimensions that change size btw input and output
+)
+
+
+print(x1y1_x2y2_fused.shape)  # image_id, 4, padded_id
+
+
+
+
+
+
+
+
+# # %%
+# ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
+#     weighted_boxes_fusion(
+#         x1y1x2y2_norm.isel(image_id=0).transpose("model", "id", "space"),
+#         all_models_detections_ds.confidence.isel(image_id=0),  # "model", "id"
+#         all_models_detections_ds.label.isel(image_id=0),  # "model", "id"
+#         iou_thr=iou_thr_ensemble,
+#         skip_box_thr=skip_box_thr,
+#     )
+# )
+
+# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# # Run WBF for all image IDs
+# for image_id in all_models_detections_ds.image_id:
+#     ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
+#         weighted_boxes_fusion(
+#             x1y1x2y2_norm.isel(image_id=image_id).transpose(
+#                 "model", "id", "space"
+#             ),
+#         )
+#     )
+
+
 # %%
 # Combine detections
 # can I avoid double loop?
-# should i use dataloader here?
+# should i use dataloader here too?
 
 # Define parameters for WBF
 iou_thr_ensemble = 0.5
 skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
-image_height, image_width,  = val_dataset[0][0].shape[-2:]
 
-list_image_ids = [annot["image_id"] for img, annot in val_dataset]
+(image_height, image_width) = val_dataset[0][0].shape[-2:]
+image_height_width = np.array([image_width, image_height])
 
+list_image_ids = [annot["image_id"] for img, annot in val_dataset]
 
 detections_per_image_id = {}
 for image_id in list_image_ids:
-
     # Get detections for current image across all models
-    list_ds_per_model = [ds.sel(image_id=image_id) for ds in list_detections_ds]
-
-    # Prepare inputs for WBF
-    list_nan_confidence = [ds.confidence.isnull() for ds in list_ds_per_model]
-
-    list_bboxes_x1y1_x2y2_norm = [
-        np.hstack(
-            [
-                ds["xy_min"].T / np.array([image_width, image_height]),
-                ds["xy_max"].T / np.array([image_width, image_height]),
-            ]
-        )[~slc_nan, :]  # remove nan annotations
-        for ds, slc_nan in zip(
-            list_ds_per_model, list_nan_confidence, strict=True
-        )
-    ]
-
-    list_scores = [
-        ds.confidence.to_numpy().T[~slc_nan]
-        for ds, slc_nan in zip(
-            list_ds_per_model, list_nan_confidence, strict=True
-        )
+    detections_ds_per_model = [
+        ds.sel(image_id=image_id) for ds in list_detections_ds
     ]
 
-    list_labels = [
-        ds.label.to_numpy().T[~slc_nan]
-        for ds, slc_nan in zip(
-            list_ds_per_model, list_nan_confidence, strict=True
+    list_bboxes_x1y1_x2y2_norm = [
+        xr.concat(
+            [ds["xy_min"].T, ds["xy_max"].T],
+            dim="space",
         )
+        / np.tile(image_height_width, (1, 2))
+        for ds in detections_ds_per_model
     ]
+    list_scores = [ds.confidence.T for ds in detections_ds_per_model]
+    list_labels = [ds.label.T for ds in detections_ds_per_model]
 
     # Run WBF
+    # can I vectorize this across image_id?
     ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
         weighted_boxes_fusion(
             list_bboxes_x1y1_x2y2_norm,  # n_models, n_predictions, 4
@@ -330,9 +514,9 @@ def plot_and_save_ensemble_detections(
     ensemble_scores = ensemble_scores[~slc_nan_rows]
     ensemble_labels = ensemble_labels[~slc_nan_rows]
 
-    # Undo normalization
+    # Undo x1y1 x2y2 normalization
     ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
-        np.array([image_width, image_height]), (1, 2)
+        image_height_width, (1, 2)
     )
 
     # Add to dict with key = image_id
@@ -373,7 +557,9 @@ def plot_and_save_ensemble_detections(
     gt_boxes_x1_y1_x2_y2=gt_annotations["boxes"],
     pred_boxes_x1_y1_x2_y2=np.hstack(
         [
-            ensemble_detections_ds[xy_corner_str].isel(image_id=image_index).values.T
+            ensemble_detections_ds[xy_corner_str]
+            .isel(image_id=image_index)
+            .values.T
             for xy_corner_str in ["xy_min", "xy_max"]
         ]
     ),
@@ -404,5 +590,3 @@ def plot_and_save_ensemble_detections(
 #     ensemble_boxes_method="wbf",
 #     **ensemble_boxes_kwargs,
 # )
-
-

From b79d2d0e955bd993e4e59eae7cf93d4d7ea03132 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 29 Jul 2025 12:39:30 +0100
Subject: [PATCH 39/72] Split detections ds formatting utils

---
 ethology/detectors/utils.py | 52 +++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/ethology/detectors/utils.py b/ethology/detectors/utils.py
index 7ae0b582..b30db5a9 100644
--- a/ethology/detectors/utils.py
+++ b/ethology/detectors/utils.py
@@ -27,6 +27,23 @@ def concat_detections_ds(
     return ds
 
 
+def detections_dict_as_ds_batch(
+    list_detections: list[dict],
+) -> list[xr.Dataset]:
+    """Reshape list of detections dictionaries as xarray dataset.
+
+    Input is list of detections dictionaries with keys:
+    - "boxes": tensor of shape [N, 4], x1y1x2y2 in pixels
+    - "scores": tensor of shape [N]
+    - "labels": tensor of shape [N]
+
+    Output is a list of xarray datasets, one for each image in the batch.
+    """
+    return [
+        detections_dict_as_ds(detections) for detections in list_detections
+    ]
+
+
 def detections_dict_as_ds(detections: dict) -> xr.Dataset:
     """Reshape detections dictionaryas xarray dataset.
 
@@ -54,21 +71,15 @@ def detections_dict_as_ds(detections: dict) -> xr.Dataset:
     )
 
 
-def detections_x1y1_x2y2_as_ds(
+def detections_x1y1_x2y2_as_da_tuple(
     x1y1_x2y2_array: np.ndarray,
     scores_array: np.ndarray,
     labels_array: np.ndarray,
-) -> xr.Dataset:
+) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
     """Reshape detections array as xarray dataset.
 
     Input is detections array with shape [N, 4], x1y1x2y2 in pixels
     """
-    # Remove nan rows
-    slc_nan_rows = np.any(np.isnan(x1y1_x2y2_array), axis=1)
-    x1y1_x2y2_array = x1y1_x2y2_array[~slc_nan_rows]
-    scores_array = scores_array[~slc_nan_rows]
-    labels_array = labels_array[~slc_nan_rows]
-
     # Create xarray dataset
     n_detections = x1y1_x2y2_array.shape[0]
     centroid_da = xr.DataArray(
@@ -106,6 +117,31 @@ def detections_x1y1_x2y2_as_ds(
         coords={"id": list(range(n_detections))},
     )
 
+    return centroid_da, shape_da, confidence_da, label_da
+
+
+def detections_x1y1_x2y2_as_ds(
+    x1y1_x2y2_array: np.ndarray,
+    scores_array: np.ndarray,
+    labels_array: np.ndarray,
+) -> xr.Dataset:
+    """Reshape detections array as xarray dataset.
+
+    Input is detections array with shape [N, 4], x1y1x2y2 in pixels
+    """
+    # Remove nan rows
+    slc_nan_rows = np.any(np.isnan(x1y1_x2y2_array), axis=1)
+    x1y1_x2y2_array = x1y1_x2y2_array[~slc_nan_rows]
+    scores_array = scores_array[~slc_nan_rows]
+    labels_array = labels_array[~slc_nan_rows]
+
+    # Create dataarrays for dataset
+    centroid_da, shape_da, confidence_da, label_da = (
+        detections_x1y1_x2y2_as_da_tuple(
+            x1y1_x2y2_array, scores_array, labels_array
+        )
+    )
+
     return xr.Dataset(
         data_vars={
             "position": centroid_da,

From 57a014812d05db1171d8e6e76dee5e8499cb6e83 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 29 Jul 2025 12:40:05 +0100
Subject: [PATCH 40/72] Add run detector on dataloader

---
 ethology/detectors/inference.py | 83 +++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 34 deletions(-)

diff --git a/ethology/detectors/inference.py b/ethology/detectors/inference.py
index 43a9b521..15a00070 100644
--- a/ethology/detectors/inference.py
+++ b/ethology/detectors/inference.py
@@ -6,6 +6,7 @@
 from ethology.detectors.utils import (
     concat_detections_ds,
     detections_dict_as_ds,
+    detections_dict_as_ds_batch,
 )
 
 
@@ -76,51 +77,65 @@ def run_detector_on_dataloader(
     # Ensure model is in evaluation mode
     model.eval()
 
-    # Compute detections per batch
-    detections_per_batch = {}
-    for batch_idx, (image_batch, _annotations_batch) in enumerate(dataloader):
-        # Place batch of images on device
-        image_batch = [img.to(device) for img in image_batch]  # [B, C, H, W]
+    # Run detection for each sample in the dataset
+    list_detections_ds = []
+    list_image_ids = []
+    for image_batch, annotations_batch in dataloader:
+        # Place image batch on device
+        image_batch = tuple(image.to(device) for image in image_batch)
 
-        # Run detection
         with torch.no_grad():
-            detections_batch = model(
-                image_batch
-            )  # list of n-batch dictionaries
+            detections_batch = model(image_batch)
+
+        # Format as xarray dataset
+        # [0] to select single batch dimension
+        detections_ds_batch = detections_dict_as_ds_batch(detections_batch)
 
-        # Add to dict
-        detections_per_batch[batch_idx] = detections_batch
+        # Extend lists
+        list_detections_ds.extend(detections_ds_batch)
+        list_image_ids.extend(
+            [annot["image_id"] for annot in annotations_batch]
+        )
+
+    # Concatenate all detections datasets along image_id dimension
+    detections_dataset = concat_detections_ds(
+        list_detections_ds,
+        pd.Index(list_image_ids, name="image_id"),
+    )  # [image_id, model, annot_id]
 
-    # # Format as xarray dataset
-    # detections_dataset = _detections_per_image_id_as_ds(
-    #     detections_per_image_id
-    # )
+    # Add image_width and image_height as attributes
+    # (we assume all images in the dataset have the same width and height
+    # as the first image in the last batch)
+    detections_dataset.attrs["image_width"] = image_batch[0].shape[
+        -1
+    ]  # columns
+    detections_dataset.attrs["image_height"] = image_batch[0].shape[-2]  # rows
 
-    return detections_per_batch
+    return detections_dataset
 
 
-# def collate_fn_varying_n_bboxes(batch: tuple) -> tuple:
-#     """Collate function for dataloader with varying number of bounding boxes.
+def collate_fn_varying_n_bboxes(batch: tuple) -> tuple:
+    """Collate function for dataloader with varying number of bounding boxes.
 
-#     A custom function is needed for detection
-#     because the number of bounding boxes varies
-#     between images of the same batch.
-#     See https://pytorch.org/vision/main/auto_examples/transforms/plot_transforms_e2e.html#data-loading-and-training-loop
+    A custom function is needed for detection
+    because the number of bounding boxes varies
+    between images of the same batch.
+    See https://pytorch.org/vision/main/auto_examples/transforms/plot_transforms_e2e.html#data-loading-and-training-loop
 
-#     Parameters
-#     ----------
-#     batch : tuple
-#         a tuple of 2 tuples, the first one holding all images in the batch,
-#         and the second one holding the corresponding annotations.
+    Parameters
+    ----------
+    batch : tuple
+        a tuple of 2 tuples, the first one holding all images in the batch,
+        and the second one holding the corresponding annotations.
 
-#     Returns
-#     -------
-#     tuple
-#         a tuple of length = batch size, made up of (image, annotations)
-#         tuples.
+    Returns
+    -------
+    tuple
+        a tuple of length = batch size, made up of (image, annotations)
+        tuples.
 
-#     """
-#     return tuple(zip(*batch, strict=False))
+    """
+    return tuple(zip(*batch, strict=True))
 
 
 # def run_detector_on_image(

From 16ef89866facfa4915b33ad8a69af865f2928c74 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 29 Jul 2025 12:41:06 +0100
Subject: [PATCH 41/72] polish apply_ufunc approach

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 317 +++++++++---------
 1 file changed, 160 insertions(+), 157 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index e111e8e8..474ad820 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -8,16 +8,21 @@
 import torchvision.transforms.v2 as transforms
 import xarray as xr
 from ensemble_boxes import weighted_boxes_fusion
-from torch.utils.data import random_split
+from torch.utils.data import DataLoader, random_split
 from tqdm import tqdm
 
 from ethology.datasets.create import create_coco_dataset
 from ethology.detectors.inference import (
+    collate_fn_varying_n_bboxes,
     concat_detections_ds,
-    run_detector_on_dataset,
+    run_detector_on_dataloader,
+    # run_detector_on_dataset,
 )
 from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
-from ethology.detectors.utils import add_bboxes_min_max_corners
+from ethology.detectors.utils import (
+    add_bboxes_min_max_corners,
+    detections_x1y1_x2y2_as_da_tuple,
+)
 from ethology.mlflow import (
     read_cli_args_from_mlflow_params,
     read_config_from_mlflow_params,
@@ -251,17 +256,32 @@ def plot_and_save_ensemble_detections(
     config=ref_config,  # only uses train_fraction and val_over_test_fraction
 )
 
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Define val dataloader
+# shuffle=False so that we don't shuffle the data after one pass over all batches
+val_dataloader = DataLoader(
+    val_dataset,
+    batch_size=ref_config["batch_size_val"],
+    shuffle=False,
+    num_workers=ref_config["num_workers"],
+    collate_fn=collate_fn_varying_n_bboxes,
+    persistent_workers=bool(ref_config["num_workers"] > 0),
+    # multiprocessing_context="fork"
+    # if ref_config["num_workers"] > 0 and torch.backends.mps.is_available()
+    # else None,  # see https://github.com/pytorch/pytorch/issues/87688
+)
+
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Compute detections per model -- make it faster
-# can I vectorize this?
-# use dataloader instead?
+# can I vectorize this? (pytorch forum question)
 list_detections_ds = []
 for model in tqdm(list_models):
     model.to(device)
-    detections_ds = run_detector_on_dataset(
+
+    detections_ds = run_detector_on_dataloader(
         model=model,
-        dataset=val_dataset,
+        dataloader=val_dataloader,
         device=device,
     )
     detections_ds = add_bboxes_min_max_corners(detections_ds)
@@ -274,194 +294,137 @@ def plot_and_save_ensemble_detections(
     pd.Index(range(len(list_detections_ds)), name="model"),
 )
 
+# Fuse detections across models
+# ....
 
-# %%
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 
-def wbf_arrays_one_img(
-    x1y1_x2y2_norm_one_img: xr.DataArray,  # ("model", "id", "space")
-    scores_one_img: xr.DataArray,
-    labels_one_img: xr.DataArray,
-    iou_thr_ensemble,
-    skip_box_thr,
-):
-    print(x1y1_x2y2_norm_one_img.data.shape)  # n_models, annot_id, space
-    print(scores_one_img.data.shape)  # n_models, annot_id
-    print(labels_one_img.data.shape)  # n_models, annot_id
+def test(
+    x1y1: np.ndarray,  # model, annot, 4
+    x2y2: np.ndarray,  # model, annot, 4
+    confidence: np.ndarray,  # model, annot
+    label: np.ndarray,  # model, annot
+) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
+    # print(x1y1.shape)
+    # print(x2y2.shape)
+    # print(confidence.shape)
+    # print(label.shape)
+    # print("---")
+
+    iou_thr_ensemble = 0.5
+    skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
+    max_n_detections = 300  # set a priori, max after fusing
+    image_width_height = np.array([4096, 2160])
 
+    x1y1x2y2_normalised = (
+        np.concat([x1y1, x2y2], axis=-1) / np.tile(image_width_height, (1, 2))
+    )[:, :, :, None]
+
+    # ------------------------------------
     # Run WBF
     ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
         weighted_boxes_fusion(
-            x1y1_x2y2_norm_one_img,
-            scores_one_img,
-            labels_one_img,
+            x1y1x2y2_normalised,
+            confidence,
+            label,
             iou_thr=iou_thr_ensemble,
             skip_box_thr=skip_box_thr,
         )
     )
 
-    #
-
-    return xr.DataArray(
-        data=(ensemble_x1y1_x2y2_norm * np.tile(image_width_height, (1, 2))).T,
-        dims=["space", "id"],
-        coords={
-            "space": ["x", "y", "x", "y"],
-            "id": list(range(ensemble_x1y1_x2y2_norm.shape[0])),
-        },
+    # ------------------------------------
+    # Undo x1y1 x2y2 normalization
+    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
+        image_width_height, (1, 2)
     )
 
-    # # Format as xarray dataset
-    # # Undo x1y1 x2y2 normalization!
-    # ensemble_detections_ds = detections_x1y1_x2y2_as_ds(
-    #     ensemble_x1y1_x2y2_norm * np.tile(image_width_height, (1, 2)),
-    #     ensemble_scores,
-    #     ensemble_labels,
-    # )
-
-    # return ensemble_detections_ds
-
-
-# %%
-iou_thr_ensemble = 0.5
-skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
-
-sel_id = 193
-
-# compute x1y1_x2y2_norm
-image_width_height = np.array(
-    [
-        all_models_detections_ds.attrs["image_width"],
-        all_models_detections_ds.attrs["image_height"],
+    # Remove rows with nan coordinates
+    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
+    ensemble_x1y2_x2y2_scores_labels = np.c_[
+        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
+    ]
+    ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
+        ~slc_nan_rows
     ]
-)
-x1y1x2y2_norm = (
-    xr.concat(
-        [
-            all_models_detections_ds["xy_min"],
-            all_models_detections_ds["xy_max"],
-        ],
-        dim="space",
-    )
-    / np.tile(image_width_height, (1, 2))[None, :, :, None]
-)  # model, image_id, 4, (annot) ID
-
-x1y1x2y2_ensemble = wbf_arrays_one_img(
-    x1y1x2y2_norm.sel(image_id=sel_id).transpose("model", "id", "space").data,
-    all_models_detections_ds.confidence.sel(image_id=sel_id).data,
-    all_models_detections_ds.label.sel(image_id=sel_id).data,
-    iou_thr_ensemble=iou_thr_ensemble,
-    skip_box_thr=skip_box_thr,
-)
-
-print(x1y1x2y2_ensemble.shape)
-
-
-# %%
-def test(
-    image_id,
-    x1y1x2y2_normalised,
-    confidence,
-    label,
-):
-    print(image_id)
-    print(image_id.shape)
-    print("---")
-    iou_thr_ensemble = 0.5
-    skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
 
-    x1y1x2y2_ensemble = wbf_arrays_one_img(
-        x1y1x2y2_normalised,  # .sel(image_id=image_id).transpose("model", "id", "space"),
-        confidence,
-        label,
-        iou_thr_ensemble=iou_thr_ensemble,
-        skip_box_thr=skip_box_thr,
+    # Pad array to max_n_detections
+    # To have constant output dimension (n_annotations)
+    ensemble_x1y2_x2y2_scores_labels = np.pad(
+        ensemble_x1y2_x2y2_scores_labels,
+        (
+            (0, max_n_detections - ensemble_x1y2_x2y2_scores_labels.shape[0]),
+            (0, 0),
+        ),
+        "constant",
+        constant_values=np.nan,
     )
 
-    print(x1y1x2y2_ensemble.shape)
-    # n_detections_in = x1y1x2y2_normalised.shape[1]
-    # print(n_detections_in)
-    print("<----->")
+    # Format output as xarray dataarrays
+    centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
+        ensemble_x1y2_x2y2_scores_labels[:, 0:4],
+        ensemble_x1y2_x2y2_scores_labels[:, 4],
+        ensemble_x1y2_x2y2_scores_labels[:, 5],
+    )
 
-    # return x1y1x2y2_ensemble 
-    # ---> could not broadcast input array from shape (4,115) into shape (4,112)
+    # print(centroid.shape)  # space, id
+    # print(shape.shape)  # space, id
+    # print("<----->")
 
+    return centroid, shape, confidence, label
 
-    # To have constant output dimension
-    return np.pad(
-        x1y1x2y2_ensemble,
-        ((0, 0), (0, 300 - x1y1x2y2_ensemble.shape[1])),
-        "constant",
-        constant_values=np.nan,
-    )
 
-    # To have same dimensions as the input
-    # n_detections_in = x1y1x2y2_normalised.shape[1]
-    # n_detections_diff = (
-    #     x1y1x2y2_normalised.shape[1] - x1y1x2y2_ensemble.shape[1]
-    # )
-    # if n_detections_diff > 0:
-    #     return np.pad(
-    #         x1y1x2y2_ensemble,
-    #         ((0, 0), (0, n_detections_diff)),
-    #         "constant",
-    #         constant_values=np.nan,
-    #     )
-    # else:   
-    #     return x1y1x2y2_ensemble[:, :n_detections_in]
-    
-
-    
 # %%
 
-x1y1_x2y2_fused = xr.apply_ufunc(
+centroid_fused, shape_fused, confidence_fused, label_fused = xr.apply_ufunc(
     test,
-    all_models_detections_ds.image_id,
-    x1y1x2y2_norm.transpose("model", "id", "space", "image_id"),
-    all_models_detections_ds.confidence.transpose("model", "id", "image_id"),
-    all_models_detections_ds.label.transpose("model", "id", "image_id"),
+    all_models_detections_ds.xy_min,
+    all_models_detections_ds.xy_max,
+    all_models_detections_ds.confidence,
+    all_models_detections_ds.label,
     input_core_dims=[
-        [],
-        ["model", "id", "space"],
+        ["model", "id", "space"],  # do not broadcast across these
+        ["model", "id", "space"],  # do not broadcast across these
         ["model", "id"],
         ["model", "id"],
     ],
-    output_core_dims=[["space", "id_out"]],
+    output_core_dims=[["space", "id"], ["space", "id"], ["id"], ["id"]],
     vectorize=True,
-    exclude_dims={"id"},  # to allow dimensions that change size btw input and output
+    # loop over non-core dims (i.e. image_id);
+    # assume `test` only takes arrays over core dims as input
+    exclude_dims={"id"},
+    # to allow dimensions that change size btw input and output
 )
 
 
-print(x1y1_x2y2_fused.shape)  # image_id, 4, padded_id
-
-
+# Remove excessive pad
+centroid_fused = centroid_fused.dropna(dim="id", how="all")
+shape_fused = shape_fused.dropna(dim="id", how="all")
+confidence_fused = confidence_fused.dropna(dim="id", how="all")
+label_fused = label_fused.dropna(dim="id", how="all")
 
+print(centroid_fused.shape)  # image_id, 2, padded_id
+print(shape_fused.shape)  # image_id, 2, padded_id
+print(confidence_fused.shape)  # image_id, padded_id
+print(label_fused.shape)  # image_id, padded_id
 
+# Pad labels with -1 rather than nan
+label_fused = label_fused.fillna(-1)
 
 
+# Can I return a dataset?
+fused_detections_ds = xr.Dataset(
+    data_vars={
+        "position": centroid_fused,
+        "shape": shape_fused,
+        "confidence": confidence_fused,
+        "label": label_fused,
+    }
+)
 
+print(fused_detections_ds)
 
-# # %%
-# ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
-#     weighted_boxes_fusion(
-#         x1y1x2y2_norm.isel(image_id=0).transpose("model", "id", "space"),
-#         all_models_detections_ds.confidence.isel(image_id=0),  # "model", "id"
-#         all_models_detections_ds.label.isel(image_id=0),  # "model", "id"
-#         iou_thr=iou_thr_ensemble,
-#         skip_box_thr=skip_box_thr,
-#     )
-# )
-
-# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# # Run WBF for all image IDs
-# for image_id in all_models_detections_ds.image_id:
-#     ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
-#         weighted_boxes_fusion(
-#             x1y1x2y2_norm.isel(image_id=image_id).transpose(
-#                 "model", "id", "space"
-#             ),
-#         )
-#     )
+# %%
 
 
 # %%
@@ -519,6 +482,17 @@ def test(
         image_height_width, (1, 2)
     )
 
+    # apply nms?
+    # idcs_to_keep = torchvision.ops.nms(
+    #     ensemble_x1y1_x2y2,
+    #     ensemble_scores,
+    #     iou_threshold=0.9,
+    # )
+
+    # ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2[idcs_to_keep]
+    # ensemble_scores = ensemble_scores[idcs_to_keep]
+    # ensemble_labels = ensemble_labels[idcs_to_keep]
+
     # Add to dict with key = image_id
     detections_per_image_id[image_id] = {
         "boxes": ensemble_x1y1_x2y2,
@@ -573,7 +547,7 @@ def test(
 )
 
 # %%
-# Combine detections with WBF
+# # Combine detections with WBF
 # detections_ds = run_ensemble_of_detectors_on_dataset(
 #     list_models,
 #     dataset,  # could be list too
@@ -590,3 +564,32 @@ def test(
 #     ensemble_boxes_method="wbf",
 #     **ensemble_boxes_kwargs,
 # )
+
+
+# %%
+
+x1y1_x2y2_fused = xr.apply_ufunc(
+    test,
+    all_models_detections_ds.image_id,
+    x1y1x2y2_norm.transpose(
+        "model", "id", "space", "image_id"
+    ),  # place broadcast dims at the end
+    all_models_detections_ds.confidence.transpose("model", "id", "image_id"),
+    all_models_detections_ds.label.transpose("model", "id", "image_id"),
+    input_core_dims=[
+        [],  # do not exclude any dimensions
+        ["model", "id", "space"],  # do not broadcast across these
+        ["model", "id"],
+        ["model", "id"],
+    ],
+    output_core_dims=[["space", "id"]],
+    vectorize=True,  # loop over non-core dims
+    exclude_dims={
+        "id"
+    },  # to allow dimensions that change size btw input and output
+)
+
+
+print(x1y1_x2y2_fused.shape)  # image_id, 4, padded_id
+
+# Can I remove the excessive pad?

From e3fb6b5b161b3e19c9bd54ceebe348cd7577f33a Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 29 Jul 2025 14:17:53 +0100
Subject: [PATCH 42/72] add naive approach and compare

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 118 ++++++++++++++----
 1 file changed, 93 insertions(+), 25 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 474ad820..75be1149 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -22,6 +22,7 @@
 from ethology.detectors.utils import (
     add_bboxes_min_max_corners,
     detections_x1y1_x2y2_as_da_tuple,
+    detections_x1y1_x2y2_as_ds,
 )
 from ethology.mlflow import (
     read_cli_args_from_mlflow_params,
@@ -295,22 +296,90 @@ def plot_and_save_ensemble_detections(
 )
 
 # Fuse detections across models
-# ....
+# fused_detections_ds = combine_detections_across_models(all_models_detections_ds)
+
+# Evaluate
+
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Fuse detections across models -- Approach 1: naive
+# v clear but slow
+
+def wbf_wrapper(
+    ds: xr.Dataset,  
+) -> xr.Dataset:
+    """Wrapper for weighted boxes fusion."""
+
+    # Define parameters for WBF
+    iou_thr_ensemble = 0.5
+    skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
+    image_width_height = np.array([4096, 2160])
+
+    # Check ds has required dimensions
+    if "image_id" in ds.dims:
+        raise ValueError("Input dataset must not have image_id dimension")
+    if not all(s in ds.dims for s in ("model", "space", "id")):
+        raise ValueError(
+            "Input dataset must have model, space and id dimensions"
+        )
 
+    # Compute x1y1x2y2_normalised
+    x1y1x2y2_normalised = xr.concat(
+        [ds.xy_min, ds.xy_max], dim="space"
+    ) / np.tile(image_width_height, (1, 2))[:,:,None]
 
-def test(
+    # Run WBF
+    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
+        weighted_boxes_fusion(
+            x1y1x2y2_normalised.transpose("model", "id", "space"),
+            ds.confidence,
+            ds.label,
+            iou_thr=iou_thr_ensemble,
+            skip_box_thr=skip_box_thr,
+        )
+    )
+
+    # Undo x1y1, x2y2 normalization
+    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
+        image_width_height, (1, 2)
+    )
+
+    # Format output as xarray dataarrays
+    fused_detections_ds = detections_x1y1_x2y2_as_ds(
+        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
+    )
+
+    return fused_detections_ds
+
+
+# %%
+#%timeit --> 9.09 s ± 55.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+# for every image_id slice
+list_fused_detections_ds = []
+for img_id in all_models_detections_ds.image_id:
+    
+    fused_detections_ds = wbf_wrapper(
+        all_models_detections_ds.sel(image_id=img_id)
+    )
+
+    list_fused_detections_ds.append(fused_detections_ds)
+
+# Concatenate fused detections across image_ids
+fused_detections_ds = concat_detections_ds(
+    list_fused_detections_ds,
+    pd.Index(range(len(list_fused_detections_ds)), name="image_id"),
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Fuse detections across models -- Approach 2: vectorized
+# faster but less clear
+
+def wbf_wrapper_arrays(
     x1y1: np.ndarray,  # model, annot, 4
     x2y2: np.ndarray,  # model, annot, 4
     confidence: np.ndarray,  # model, annot
     label: np.ndarray,  # model, annot
 ) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
-    # print(x1y1.shape)
-    # print(x2y2.shape)
-    # print(confidence.shape)
-    # print(label.shape)
-    # print("---")
 
     iou_thr_ensemble = 0.5
     skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
@@ -339,17 +408,19 @@ def test(
         image_width_height, (1, 2)
     )
 
-    # Remove rows with nan coordinates
-    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
+    # Combine x1y1, x2y2, scores and labels in one array
     ensemble_x1y2_x2y2_scores_labels = np.c_[
         ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
     ]
+
+    # Remove rows with nan coordinates
+    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
     ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
         ~slc_nan_rows
     ]
 
-    # Pad array to max_n_detections
-    # To have constant output dimension (n_annotations)
+    # Pad combinedarray to max_n_detections
+    # This is to have a constant output dimension in the `id` dimension
     ensemble_x1y2_x2y2_scores_labels = np.pad(
         ensemble_x1y2_x2y2_scores_labels,
         (
@@ -375,44 +446,41 @@ def test(
 
 
 # %%
-
+# timeit --- 1.37 s ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+# this will become a fn
 centroid_fused, shape_fused, confidence_fused, label_fused = xr.apply_ufunc(
-    test,
-    all_models_detections_ds.xy_min,
+    wbf_wrapper_arrays,
+    all_models_detections_ds.xy_min, # the underlaying .data array is passed
     all_models_detections_ds.xy_max,
     all_models_detections_ds.confidence,
     all_models_detections_ds.label,
-    input_core_dims=[
-        ["model", "id", "space"],  # do not broadcast across these
-        ["model", "id", "space"],  # do not broadcast across these
+    input_core_dims=[ # do not broadcast across these
+        ["model", "id", "space"],  
+        ["model", "id", "space"],  
         ["model", "id"],
         ["model", "id"],
     ],
     output_core_dims=[["space", "id"], ["space", "id"], ["id"], ["id"]],
     vectorize=True,
     # loop over non-core dims (i.e. image_id);
-    # assume `test` only takes arrays over core dims as input
+    # assumes function only takes arrays over core dims as input
     exclude_dims={"id"},
     # to allow dimensions that change size btw input and output
 )
 
 
-# Remove excessive pad
+# Remove pad across annotations
 centroid_fused = centroid_fused.dropna(dim="id", how="all")
 shape_fused = shape_fused.dropna(dim="id", how="all")
 confidence_fused = confidence_fused.dropna(dim="id", how="all")
 label_fused = label_fused.dropna(dim="id", how="all")
 
-print(centroid_fused.shape)  # image_id, 2, padded_id
-print(shape_fused.shape)  # image_id, 2, padded_id
-print(confidence_fused.shape)  # image_id, padded_id
-print(label_fused.shape)  # image_id, padded_id
 
 # Pad labels with -1 rather than nan
 label_fused = label_fused.fillna(-1)
 
 
-# Can I return a dataset?
+# Return a dataset
 fused_detections_ds = xr.Dataset(
     data_vars={
         "position": centroid_fused,
@@ -422,7 +490,7 @@ def test(
     }
 )
 
-print(fused_detections_ds)
+# print(fused_detections_ds)
 
 # %%
 

From 7ee6960d1b57a790103e39d6a7a566c43158a052 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 29 Jul 2025 16:29:49 +0100
Subject: [PATCH 43/72] Exploring vectorising nms (not working)

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 402 +++++++-----------
 1 file changed, 159 insertions(+), 243 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 75be1149..8e80acdb 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -12,6 +12,7 @@
 from tqdm import tqdm
 
 from ethology.datasets.create import create_coco_dataset
+from ethology.detectors.ensembles import combine_detections_across_models_wbf
 from ethology.detectors.inference import (
     collate_fn_varying_n_bboxes,
     concat_detections_ds,
@@ -274,7 +275,7 @@ def plot_and_save_ensemble_detections(
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Compute detections per model -- make it faster
+# Compute detections per model -- can I make it faster?
 # can I vectorize this? (pytorch forum question)
 list_detections_ds = []
 for model in tqdm(list_models):
@@ -295,285 +296,196 @@ def plot_and_save_ensemble_detections(
     pd.Index(range(len(list_detections_ds)), name="model"),
 )
 
-# Fuse detections across models
-# fused_detections_ds = combine_detections_across_models(all_models_detections_ds)
-
-# Evaluate
-
-
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Fuse detections across models -- Approach 1: naive
-# v clear but slow
-
-def wbf_wrapper(
-    ds: xr.Dataset,  
-) -> xr.Dataset:
-    """Wrapper for weighted boxes fusion."""
-
-    # Define parameters for WBF
-    iou_thr_ensemble = 0.5
-    skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
-    image_width_height = np.array([4096, 2160])
-
-    # Check ds has required dimensions
-    if "image_id" in ds.dims:
-        raise ValueError("Input dataset must not have image_id dimension")
-    if not all(s in ds.dims for s in ("model", "space", "id")):
-        raise ValueError(
-            "Input dataset must have model, space and id dimensions"
-        )
-
-    # Compute x1y1x2y2_normalised
-    x1y1x2y2_normalised = xr.concat(
-        [ds.xy_min, ds.xy_max], dim="space"
-    ) / np.tile(image_width_height, (1, 2))[:,:,None]
-
-    # Run WBF
-    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
-        weighted_boxes_fusion(
-            x1y1x2y2_normalised.transpose("model", "id", "space"),
-            ds.confidence,
-            ds.label,
-            iou_thr=iou_thr_ensemble,
-            skip_box_thr=skip_box_thr,
-        )
-    )
-
-    # Undo x1y1, x2y2 normalization
-    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
-        image_width_height, (1, 2)
-    )
-
-    # Format output as xarray dataarrays
-    fused_detections_ds = detections_x1y1_x2y2_as_ds(
-        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
-    )
-
-    return fused_detections_ds
+# Fuse detections across models
+fused_detections_ds = combine_detections_across_models_wbf(
+    all_models_detections_ds,
+    kwargs_wbf={
+        "iou_thr_ensemble": 0.5,
+        "skip_box_thr": 0.0001,
+        "max_n_detections": 300,
+    },
+)
 
 
 # %%
-#%timeit --> 9.09 s ± 55.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-# for every image_id slice
-list_fused_detections_ds = []
-for img_id in all_models_detections_ds.image_id:
-    
-    fused_detections_ds = wbf_wrapper(
-        all_models_detections_ds.sel(image_id=img_id)
+import torchvision
+
+
+def padded_batched_nms(
+    bboxes: torch.Tensor,
+    scores: torch.Tensor,
+    labels: torch.Tensor,
+    iou_threshold: float,
+) -> torch.Tensor:
+    print(bboxes.shape)
+    print(scores.shape)
+    print(labels.shape)
+
+    n_input_detections = bboxes.shape[0]
+    idcs_to_keep = torchvision.ops.batched_nms(
+        bboxes, scores, labels, iou_threshold
     )
 
-    list_fused_detections_ds.append(fused_detections_ds)
-
-# Concatenate fused detections across image_ids
-fused_detections_ds = concat_detections_ds(
-    list_fused_detections_ds,
-    pd.Index(range(len(list_fused_detections_ds)), name="image_id"),
-)
+    print(idcs_to_keep.shape)
 
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Fuse detections across models -- Approach 2: vectorized
-# faster but less clear
+    # # pad with -1
+    # idcs_to_keep = torch.nn.functional.pad(
+    #     idcs_to_keep,
+    #     (0, n_input_detections - idcs_to_keep.shape[0]),
+    #     value=-1,
+    # )
+    # print(idcs_to_keep)
+    return idcs_to_keep
 
-def wbf_wrapper_arrays(
-    x1y1: np.ndarray,  # model, annot, 4
-    x2y2: np.ndarray,  # model, annot, 4
-    confidence: np.ndarray,  # model, annot
-    label: np.ndarray,  # model, annot
-) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
 
-    iou_thr_ensemble = 0.5
-    skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
-    max_n_detections = 300  # set a priori, max after fusing
-    image_width_height = np.array([4096, 2160])
+# %%
 
-    x1y1x2y2_normalised = (
-        np.concat([x1y1, x2y2], axis=-1) / np.tile(image_width_height, (1, 2))
-    )[:, :, :, None]
+# Prepare input for nms
+fused_detections_ds = add_bboxes_min_max_corners(fused_detections_ds)
+ensemble_x1y1_x2y2 = xr.concat(
+    [
+        fused_detections_ds.xy_min,
+        fused_detections_ds.xy_max,
+    ],
+    dim="space",
+).transpose("image_id", "id", "space")
 
-    # ------------------------------------
-    # Run WBF
-    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
-        weighted_boxes_fusion(
-            x1y1x2y2_normalised,
-            confidence,
-            label,
-            iou_thr=iou_thr_ensemble,
-            skip_box_thr=skip_box_thr,
-        )
-    )
 
-    # ------------------------------------
-    # Undo x1y1 x2y2 normalization
-    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
-        image_width_height, (1, 2)
+list_x1y1_x2y2_to_keep = []
+for image_id in range(ensemble_x1y1_x2y2.shape[0]):
+    idcs_to_keep = torchvision.ops.nms(
+        torch.from_numpy(ensemble_x1y1_x2y2.data[image_id]),
+        torch.from_numpy(fused_detections_ds.confidence.data[image_id]),
+        0.1,
     )
 
-    # Combine x1y1, x2y2, scores and labels in one array
-    ensemble_x1y2_x2y2_scores_labels = np.c_[
-        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
-    ]
+    list_x1y1_x2y2_to_keep.append(ensemble_x1y1_x2y2.data[image_id][idcs_to_keep,:])
+    print(idcs_to_keep.shape)
 
-    # Remove rows with nan coordinates
-    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
-    ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
-        ~slc_nan_rows
-    ]
-
-    # Pad combinedarray to max_n_detections
-    # This is to have a constant output dimension in the `id` dimension
-    ensemble_x1y2_x2y2_scores_labels = np.pad(
-        ensemble_x1y2_x2y2_scores_labels,
-        (
-            (0, max_n_detections - ensemble_x1y2_x2y2_scores_labels.shape[0]),
-            (0, 0),
-        ),
-        "constant",
-        constant_values=np.nan,
-    )
 
-    # Format output as xarray dataarrays
-    centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
-        ensemble_x1y2_x2y2_scores_labels[:, 0:4],
-        ensemble_x1y2_x2y2_scores_labels[:, 4],
-        ensemble_x1y2_x2y2_scores_labels[:, 5],
+# %%
+for idx in range(ensemble_x1y1_x2y2.shape[0]):
+    print(torch.from_numpy(ensemble_x1y1_x2y2.data[idx]).shape)
+    print(torch.from_numpy(fused_detections_ds.confidence.data[idx]).shape)
+    print(sum(fused_detections_ds.label.data[idx] != -1))
+
+    out = torchvision.ops.batched_nms(
+        torch.from_numpy(ensemble_x1y1_x2y2.data[idx]),
+        torch.from_numpy(fused_detections_ds.confidence.data[idx]),
+        torch.from_numpy(fused_detections_ds.label.data[idx]),
+        0.1,
     )
-
-    # print(centroid.shape)  # space, id
-    # print(shape.shape)  # space, id
-    # print("<----->")
-
-    return centroid, shape, confidence, label
+    print(out.shape)
+    # out = torch.nn.functional.pad(
+    #     out,
+    #     (0, ensemble_x1y1_x2y2.data[idx].shape[0] - out.shape[0]),
+    #     value=-1,
+    # ) --- not needed with batched_nms?
+    # print(out.shape)
+    print("---")
 
 
 # %%
-# timeit --- 1.37 s ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-# this will become a fn
-centroid_fused, shape_fused, confidence_fused, label_fused = xr.apply_ufunc(
-    wbf_wrapper_arrays,
-    all_models_detections_ds.xy_min, # the underlaying .data array is passed
-    all_models_detections_ds.xy_max,
-    all_models_detections_ds.confidence,
-    all_models_detections_ds.label,
-    input_core_dims=[ # do not broadcast across these
-        ["model", "id", "space"],  
-        ["model", "id", "space"],  
-        ["model", "id"],
-        ["model", "id"],
-    ],
-    output_core_dims=[["space", "id"], ["space", "id"], ["id"], ["id"]],
-    vectorize=True,
-    # loop over non-core dims (i.e. image_id);
-    # assumes function only takes arrays over core dims as input
-    exclude_dims={"id"},
-    # to allow dimensions that change size btw input and output
-)
-
-
-# Remove pad across annotations
-centroid_fused = centroid_fused.dropna(dim="id", how="all")
-shape_fused = shape_fused.dropna(dim="id", how="all")
-confidence_fused = confidence_fused.dropna(dim="id", how="all")
-label_fused = label_fused.dropna(dim="id", how="all")
-
-
-# Pad labels with -1 rather than nan
-label_fused = label_fused.fillna(-1)
-
+# def padded_nms(
+#     bboxes: torch.Tensor,
+#     scores: torch.Tensor,
+#     iou_threshold: float,
+# ) -> torch.Tensor:
+#     print(bboxes.shape)
+#     print(scores.shape)
+
+#     out = torchvision.ops.nms(bboxes, scores, iou_threshold)
+#     out_padded = torch.nn.functional.pad(
+#         out,
+#         (0, bboxes.shape[0] - out.shape[0]),
+#         value=-1,
+#     )
+#     print(out_padded.shape)
+#     return out_padded
 
-# Return a dataset
-fused_detections_ds = xr.Dataset(
-    data_vars={
-        "position": centroid_fused,
-        "shape": shape_fused,
-        "confidence": confidence_fused,
-        "label": label_fused,
-    }
+vectorised_batched_nms = torch.vmap(
+    torchvision.ops.batched_nms,
+    in_dims=(0, 0, 0, None),
+)
+idcs_to_keep = vectorised_batched_nms(
+    torch.from_numpy(ensemble_x1y1_x2y2.data),
+    torch.from_numpy(fused_detections_ds.confidence.data),
+    torch.from_numpy(fused_detections_ds.label.data),
+    0.1,
 )
 
-# print(fused_detections_ds)
 
 # %%
 
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Evaluate
 
-# %%
-# Combine detections
-# can I avoid double loop?
-# should i use dataloader here too?
 
-# Define parameters for WBF
-iou_thr_ensemble = 0.5
-skip_box_thr = 0.0001  # skip boxes with confidence below this threshold
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Fuse detections across models -- Approach 2: vectorized
+# faster but less clear
 
-(image_height, image_width) = val_dataset[0][0].shape[-2:]
-image_height_width = np.array([image_width, image_height])
 
-list_image_ids = [annot["image_id"] for img, annot in val_dataset]
+# %%
+# timeit --- 1.37 s ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+# this will become a fn
+# centroid_fused, shape_fused, confidence_fused, label_fused = xr.apply_ufunc(
+#     wbf_wrapper_arrays,
+#     all_models_detections_ds.xy_min,  # the underlying .data array is passed
+#     all_models_detections_ds.xy_max,
+#     all_models_detections_ds.confidence,
+#     all_models_detections_ds.label,
+#     kwargs={
+#         "image_width_height": np.array(
+#             [
+#                 all_models_detections_ds.attrs[img_size]
+#                 for img_size in ["image_width", "image_height"]
+#             ]
+#         ),
+#         "iou_thr_ensemble": 0.5,
+#         "skip_box_thr": 0.0001,
+#         "max_n_detections": 300,
+#     },
+#     input_core_dims=[  # do not broadcast across these
+#         ["model", "id", "space"],
+#         ["model", "id", "space"],
+#         ["model", "id"],
+#         ["model", "id"],
+#     ],
+#     output_core_dims=[["space", "id"], ["space", "id"], ["id"], ["id"]],
+#     vectorize=True,
+#     # loop over non-core dims (i.e. image_id);
+#     # assumes function only takes arrays over core dims as input
+#     exclude_dims={"id"},
+#     # to allow dimensions that change size btw input and output
+# )
 
-detections_per_image_id = {}
-for image_id in list_image_ids:
-    # Get detections for current image across all models
-    detections_ds_per_model = [
-        ds.sel(image_id=image_id) for ds in list_detections_ds
-    ]
 
-    list_bboxes_x1y1_x2y2_norm = [
-        xr.concat(
-            [ds["xy_min"].T, ds["xy_max"].T],
-            dim="space",
-        )
-        / np.tile(image_height_width, (1, 2))
-        for ds in detections_ds_per_model
-    ]
-    list_scores = [ds.confidence.T for ds in detections_ds_per_model]
-    list_labels = [ds.label.T for ds in detections_ds_per_model]
-
-    # Run WBF
-    # can I vectorize this across image_id?
-    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
-        weighted_boxes_fusion(
-            list_bboxes_x1y1_x2y2_norm,  # n_models, n_predictions, 4
-            list_scores,  # n_models, n_predictions
-            list_labels,
-            iou_thr=iou_thr_ensemble,
-            skip_box_thr=skip_box_thr,
-        )
-    )
+# # Remove pad across annotations
+# centroid_fused = centroid_fused.dropna(dim="id", how="all")
+# shape_fused = shape_fused.dropna(dim="id", how="all")
+# confidence_fused = confidence_fused.dropna(dim="id", how="all")
+# label_fused = label_fused.dropna(dim="id", how="all")
 
-    # Remove rows with nan coordinates
-    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2_norm), axis=1)
-    ensemble_x1y1_x2y2_norm = ensemble_x1y1_x2y2_norm[~slc_nan_rows]
-    ensemble_scores = ensemble_scores[~slc_nan_rows]
-    ensemble_labels = ensemble_labels[~slc_nan_rows]
 
-    # Undo x1y1 x2y2 normalization
-    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
-        image_height_width, (1, 2)
-    )
+# # Pad labels with -1 rather than nan
+# label_fused = label_fused.fillna(-1)
 
-    # apply nms?
-    # idcs_to_keep = torchvision.ops.nms(
-    #     ensemble_x1y1_x2y2,
-    #     ensemble_scores,
-    #     iou_threshold=0.9,
-    # )
 
-    # ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2[idcs_to_keep]
-    # ensemble_scores = ensemble_scores[idcs_to_keep]
-    # ensemble_labels = ensemble_labels[idcs_to_keep]
+# # Return a dataset
+# fused_detections_ds = xr.Dataset(
+#     data_vars={
+#         "position": centroid_fused,
+#         "shape": shape_fused,
+#         "confidence": confidence_fused,
+#         "label": label_fused,
+#     }
+# )
 
-    # Add to dict with key = image_id
-    detections_per_image_id[image_id] = {
-        "boxes": ensemble_x1y1_x2y2,
-        "scores": ensemble_scores,
-        "labels": ensemble_labels,
-    }
+# print(fused_detections_ds)
 
 # %%
-# Format as xarray dataset
-ensemble_detections_ds = _detections_per_image_id_as_ds(
-    detections_per_image_id
-)
-
 # %%
 # Evaluate detections with hungarian
 
@@ -593,19 +505,20 @@ def wbf_wrapper_arrays(
 image = val_dataset[image_index][0]
 gt_annotations = val_dataset[image_index][1]
 
+fused_detections_ds_plot = add_bboxes_min_max_corners(fused_detections_ds)
 
 plot_and_save_ensemble_detections(
     image=image,
     gt_boxes_x1_y1_x2_y2=gt_annotations["boxes"],
     pred_boxes_x1_y1_x2_y2=np.hstack(
         [
-            ensemble_detections_ds[xy_corner_str]
+            fused_detections_ds_plot[xy_corner_str]
             .isel(image_id=image_index)
             .values.T
             for xy_corner_str in ["xy_min", "xy_max"]
         ]
     ),
-    pred_boxes_scores=ensemble_detections_ds.isel(
+    pred_boxes_scores=fused_detections_ds_plot.isel(
         image_id=image_index
     ).confidence.values,
     image_id=gt_annotations["image_id"],
@@ -614,6 +527,9 @@ def wbf_wrapper_arrays(
     recall=0.0,
 )
 
+# %%
+
+
 # %%
 # # Combine detections with WBF
 # detections_ds = run_ensemble_of_detectors_on_dataset(

From 441a69fd98d085b4ae466f9c12298dc429a9a2f8 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 29 Jul 2025 19:23:08 +0100
Subject: [PATCH 44/72] Load annotations as ds

---
 ethology/annotations/io/load_bboxes.py | 173 ++++++++++++++++++++++---
 1 file changed, 157 insertions(+), 16 deletions(-)

diff --git a/ethology/annotations/io/load_bboxes.py b/ethology/annotations/io/load_bboxes.py
index 210dfe05..3c6c62a1 100644
--- a/ethology/annotations/io/load_bboxes.py
+++ b/ethology/annotations/io/load_bboxes.py
@@ -4,7 +4,9 @@
 from pathlib import Path
 from typing import Literal
 
+import numpy as np
 import pandas as pd
+import xarray as xr
 
 from ethology.annotations.validators import ValidCOCO, ValidVIA
 
@@ -30,7 +32,7 @@ def from_files(
     format: Literal["VIA", "COCO"],
     images_dirs: Path | str | list[Path | str] | None = None,
 ) -> pd.DataFrame:
-    """Read input annotation files as a bboxes dataframe.
+    """Read input annotation files as a bboxes xarray dataset.
 
     Parameters
     ----------
@@ -44,17 +46,29 @@ def from_files(
 
     Returns
     -------
-    pd.DataFrame
-        Bounding boxes annotations dataframe. The dataframe is indexed
-        by "annotation_id" and has the following columns: "image_filename",
-        "image_id", "image_width", "image_height", "x_min", "y_min",
-        "width", "height", "supercategory", "category". It also has the
-        following attributes: "annotation_files", "annotation_format",
-        "images_directories". The "image_id" is assigned based
-        on the alphabetically sorted list of unique image filenames across all
-        input files. The "category_id" column is always a 0-based integer,
-        except for VIA files where the values specified in the input file
-        are retained.
+    xr.Dataset
+        Bounding boxes annotations xarray dataset. The dataset has the
+        following dimensions: "image_id", "space", "id".
+        The "image_id" is assigned based on the alphabetically sorted list
+        of unique image filenames across all input files. The "space"
+        dimension holds the "x" or "y" coordinates. The "id" dimension
+        corresponds to the annotation ID per image and it is assigned from
+        0 to the max number of annotations per image in the full dataset.
+
+        The dataset consists of three arrays:
+        - "position": (image_id, space, id)
+        - "shape": (image_id, space, id)
+        - "category": (image_id, id)
+        The "category" array holds 0-based integers, except for VIA
+        files where the values specified in the input file are retained.
+
+        The dataset attributes include:
+        - "map_category_id_to_category": map from category_id to category name
+        - "map_image_id_to_filename": map from image_id to image filename
+        - "images_directories": list of paths to the directories containing
+        the images the annotations refer to (optional)
+        - "annotation_files": list of paths to the input annotation files
+        - "annotation_format": format of the input annotation files
 
     Notes
     -----
@@ -66,7 +80,18 @@ def from_files(
     image IDs to images that have the same name but appear in different input
     annotation files, you can either make the image filenames distinct before
     loading the data, or you can load the data from each file
-    as a separate dataframe, and then concatenate them as desired.
+    as a separate xarray dataset, and then concatenate them as desired.
+
+    Examples
+    --------
+    >>> ds = from_files(
+    ...     file_paths=[
+    ...         "path/to/annotation_file_1.json",
+    ...         "path/to/annotation_file_2.json",
+    ...     ],
+    ...     format="VIA",
+    ...     images_dirs=["path/to/images_dir_1", "path/to/images_dir_2"],
+    ... )
 
     See Also
     --------
@@ -82,14 +107,33 @@ def from_files(
     else:
         df_all = _from_single_file(file_paths, format=format)
 
-    # Add metadata
-    df_all.attrs = {
+    # Get map from image_id to image_filename
+    mapping_df = df_all[["image_filename", "image_id"]].drop_duplicates()
+    map_image_id_to_filename = mapping_df.set_index("image_id").to_dict()[
+        "image_filename"
+    ]
+
+    # Get map from category_id to category
+    map_category_id_to_category = (
+        df_all[["category_id", "category"]]
+        .drop_duplicates()
+        .set_index("category_id")
+        .to_dict()["category"]
+    )
+
+    # Convert to xarray dataset
+    ds = _df_to_xarray_ds(df_all)
+
+    # Add metadata to the dataset
+    ds.attrs = {
         "annotation_files": file_paths,
         "annotation_format": format,
+        "map_category_id_to_category": map_category_id_to_category,
+        "map_image_id_to_filename": map_image_id_to_filename,
         "images_directories": images_dirs,
     }
 
-    return df_all
+    return ds
 
 
 def _from_multiple_files(
@@ -397,3 +441,100 @@ def _VIA_category_id_as_int(df: pd.DataFrame) -> pd.DataFrame:
     except ValueError:
         df["category_id"] = df["category"].factorize(sort=True)[0]
     return df
+
+
+def _df_to_xarray_ds(df: pd.DataFrame) -> xr.Dataset:
+    """Convert bounding boxes annotations dataframe to an xarray dataset.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Bounding boxes annotations dataframe.
+
+    Returns
+    -------
+    xr.Dataset
+        an xarray dataset with the following dimensions:
+        - "image_id": holds the 0-based index of the image in the "images"
+        list of the COCO JSON file;
+        - "space": "x" or "y";
+        - "id": annotation ID per image, assigned from 0 to the max number of
+        annotations per image in the full dataset.
+
+        The dataset is made up of the following arrays:
+        - position: (image_id, space, id)
+        - shape: (image_id, space, id)
+        - category: (image_id, id)
+
+    """
+    # Compute max number of annotations per image
+    max_annotations_per_image = df["image_id"].value_counts().max()
+
+    # Sort the dataframe by image_id
+    # Note: the input annotation ID is unique across the dataframe
+    df = df.sort_values(by=["image_id"])
+
+    # Compute indices of the rows where the image ID switches
+    bool_id_diff_from_prev = df["image_id"].ne(df["image_id"].shift())
+    indices_id_switch = np.argwhere(bool_id_diff_from_prev).squeeze()[1:]
+
+    # Stack position, shape and category arrays along ID axis
+    map_key_to_columns = {
+        "position_array": ["x_min", "y_min"],
+        "shape_array": ["width", "height"],
+        "category_array": ["category_id"],
+    }
+    map_key_to_padding = {
+        "position_array": (np.float64, np.nan),
+        "shape_array": (np.float64, np.nan),
+        "category_array": (int, -1),
+    }
+    array_dict = {}
+    for key in map_key_to_columns:
+        # extract annotations per image
+        list_arrays = np.split(
+            df[map_key_to_columns[key]].to_numpy(
+                dtype=map_key_to_padding[key][0]  # type: ignore
+            ),
+            indices_id_switch,  # indices along axis=0
+        )
+
+        # pad along the annotation ID axis (NaN for floats, -1 for category)
+        list_arrays_padded = [
+            np.pad(
+                arr,
+                ((0, max_annotations_per_image - arr.shape[0]), (0, 0)),
+                constant_values=map_key_to_padding[key][1],  # type: ignore
+            )
+            for arr in list_arrays
+        ]
+
+        # stack along the first axis (image_id)
+        array_dict[key] = np.stack(list_arrays_padded, axis=0).squeeze()
+
+        # reorder axes if required
+        if "category" not in key:
+            array_dict[key] = np.moveaxis(array_dict[key], -1, 1)
+
+    # ----
+    # Modify x_min and y_min to represent the bbox centre
+    array_dict["position_array"] += array_dict["shape_array"] / 2
+
+    # Create xarray dataset
+    return xr.Dataset(
+        data_vars=dict(
+            position=(
+                ["image_id", "space", "id"],
+                array_dict["position_array"],
+            ),
+            shape=(["image_id", "space", "id"], array_dict["shape_array"]),
+            category=(["image_id", "id"], array_dict["category_array"]),
+        ),
+        coords=dict(
+            image_id=df["image_id"].unique(),
+            space=["x", "y"],
+            id=range(max_annotations_per_image),
+            # annotation ID per frame; could be consistent across frames
+            # or not
+        ),
+    )

From 4f62cff7bc497fe454660444abd77dadf215b856 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 29 Jul 2025 19:28:28 +0100
Subject: [PATCH 45/72] Add evaluate functions for ds

---
 ethology/detectors/evaluate.py | 178 ++++++++++++++++++++++++++++++---
 1 file changed, 162 insertions(+), 16 deletions(-)

diff --git a/ethology/detectors/evaluate.py b/ethology/detectors/evaluate.py
index ec5df701..f03f4403 100644
--- a/ethology/detectors/evaluate.py
+++ b/ethology/detectors/evaluate.py
@@ -3,23 +3,100 @@
 import numpy as np
 import torch
 import torchvision.ops as ops
+import xarray as xr
 from scipy.optimize import linear_sum_assignment
 
+from ethology.detectors.utils import add_bboxes_min_max_corners
 
-def evaluate_detections_hungarian(
-    pred_bboxes: np.ndarray, gt_bboxes: np.ndarray, iou_threshold: float
-) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+
+def evaluate_detections_hungarian_ds(
+    pred_bboxes_ds: xr.Dataset,
+    gt_bboxes_ds: xr.Dataset,
+    iou_threshold: float,
+) -> tuple[xr.Dataset, xr.Dataset]:
     """Compute true positives, false positives, and missed detections.
 
     Uses Hungarian algorithm for matching.
+    """
+    # Add xy_min and xy_max if not present
+    if all(
+        [
+            var_str not in pred_bboxes_ds.variables
+            for var_str in ["xy_min", "xy_max"]
+        ]
+    ):
+        pred_bboxes_ds = add_bboxes_min_max_corners(pred_bboxes_ds)
+
+    if all(
+        [
+            var_str not in gt_bboxes_ds.variables
+            for var_str in ["xy_min", "xy_max"]
+        ]
+    ):
+        gt_bboxes_ds = add_bboxes_min_max_corners(gt_bboxes_ds)
+
+    # Prepare input for hungarian
+    pred_bboxes_x1y1_x2y2 = xr.concat(
+        [pred_bboxes_ds.xy_min, pred_bboxes_ds.xy_max], dim="space"
+    ).transpose("image_id", "id", "space")
+
+    gt_bboxes_x1y1_x2y2 = xr.concat(
+        [gt_bboxes_ds.xy_min, gt_bboxes_ds.xy_max], dim="space"
+    ).transpose("image_id", "id", "space")
+
+    # rename id dimension in gt_bboxes_x1y1_x2y2
+    gt_bboxes_x1y1_x2y2 = gt_bboxes_x1y1_x2y2.rename({"id": "id_gt"})
+
+    # Run hungarian vectorized
+    tp_array, fp_array, md_array, iou_tp_array = xr.apply_ufunc(
+        evaluate_detections_hungarian_arrays,
+        pred_bboxes_x1y1_x2y2,
+        gt_bboxes_x1y1_x2y2,
+        kwargs={"iou_threshold": iou_threshold},
+        input_core_dims=[
+            ["id", "space"],
+            ["id_gt", "space"],
+        ],
+        output_core_dims=[
+            ["id"],
+            ["id"],
+            ["id_gt"],
+            ["id"],
+        ],
+        vectorize=True,
+        exclude_dims={"id", "id_gt"},
+    )
+
+    # Add to datasets
+    pred_bboxes_ds["tp"] = xr.DataArray(tp_array, dims=["image_id", "id"])
+    pred_bboxes_ds["fp"] = xr.DataArray(fp_array, dims=["image_id", "id"])
+    pred_bboxes_ds["iou_tp"] = xr.DataArray(
+        iou_tp_array, dims=["image_id", "id"]
+    )
+
+    # rename id dimension in md_array
+    md_array = md_array.rename({"id_gt": "id"})
+    gt_bboxes_ds["md"] = xr.DataArray(md_array, dims=["image_id", "id"])
+
+    return pred_bboxes_ds, gt_bboxes_ds
+
+
+def evaluate_detections_hungarian_arrays(
+    pred_bboxes: np.ndarray, gt_bboxes: np.ndarray, iou_threshold: float
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Compute true positives, false positives, and missed detections.
+
+    Uses Hungarian algorithm for matching and takes arrays of bboxes as input
+    in x1y1x2y2 format.
 
     Parameters
     ----------
-    pred_bboxes : list
-        A list of prediction bounding boxes with the first four columns being
-        [x1, y1, x2, y2]
-    gt_bboxes : list
-        A list of ground truth bounding boxes with the first four columns being
+    pred_bboxes : np.ndarray
+        An array of prediction bounding boxes with the first four columns being
+        the coordinates of the bounding box in the format [x1, y1, x2, y2]
+    gt_bboxes : np.ndarray
+        An array of ground truth bounding boxes with the first four columns
+        being the coordinates of the bounding box in the format
         [x1, y1, x2, y2]
     iou_threshold : float
         IoU threshold for considering a detection as true positive
@@ -27,30 +104,46 @@ def evaluate_detections_hungarian(
     Returns
     -------
     tuple
-        A tuple of three boolean arrays:
+        A tuple of four arrays (the first three boolean, the last float):
         - true_positives: True for each predicted bbox that is a true positive
         - false_positives: True for each predicted bbox that is a false
         positive
         - missed_detections: True for each ground truth bbox that is missed
+        - true_positives_iou: IoU of each true positive
+
+    Notes
+    -----
+    The output arrays are padded with False to the length of the original
+    arrays. This means that for example where the true_positives array is
+    False, that does not necessarily mean that the prediction is a false
+    positive. The same applies for the true_positives_iou array, which is
+    padded with nan.
 
     """
+    # Record the padded lengths, then drop rows containing nan
+    n_pred_bboxes_padded = pred_bboxes.shape[0]
+    n_gt_bboxes_padded = gt_bboxes.shape[0]
+    pred_bboxes = pred_bboxes[~np.isnan(pred_bboxes).any(axis=1), :]
+    gt_bboxes = gt_bboxes[~np.isnan(gt_bboxes).any(axis=1), :]
+
     # Initialize output arrays
     true_positives = np.zeros(len(pred_bboxes), dtype=bool)
     false_positives = np.zeros(len(pred_bboxes), dtype=bool)
     matched_gts = np.zeros(len(gt_bboxes), dtype=bool)
     missed_detections = np.zeros(len(gt_bboxes), dtype=bool)  # unmatched gts
 
-    true_positives_iou = np.zeros(len(pred_bboxes), dtype=bool)
+    true_positives_iou = np.zeros(len(pred_bboxes), dtype=float)
 
     # cast as a tensor if not already
     if not isinstance(pred_bboxes, torch.Tensor):
-        pred_bboxes = torch.tensor(pred_bboxes, dtype=torch.float32)
+        pred_bboxes = torch.from_numpy(pred_bboxes).float()
     if not isinstance(gt_bboxes, torch.Tensor):
-        gt_bboxes = torch.tensor(gt_bboxes, dtype=torch.float32)
+        gt_bboxes = torch.from_numpy(gt_bboxes).float()
 
     if len(pred_bboxes) > 0 and len(gt_bboxes) > 0:
         # Compute IoU matrix (pred_bboxes x gt_bboxes)
         iou_matrix = ops.box_iou(pred_bboxes[:, :4], gt_bboxes).cpu().numpy()
+        # iou_matrix[np.isnan(iou_matrix)] = -np.inf
 
         # Use Hungarian algorithm to find optimal assignment
         pred_indices, gt_indices = linear_sum_assignment(
@@ -79,9 +172,62 @@ def evaluate_detections_hungarian(
         # No ground truth, all predictions are false positives
         false_positives[:] = True
 
-    return (
-        true_positives,
-        false_positives,
-        missed_detections,
+    # Pad tp, fp for pred_bboxes with False
+    tp_fp_pred_bboxes_padded: tuple[np.ndarray, ...] = ()
+    for output in [true_positives, false_positives]:
+        output_padded = np.pad(
+            output,
+            (0, n_pred_bboxes_padded - len(output)),
+            mode="constant",
+            constant_values=False,
+        )
+        tp_fp_pred_bboxes_padded += (output_padded,)
+
+    # Pad true_positives_iou for pred_bboxes with nan
+    true_positives_iou_padded = np.pad(
         true_positives_iou,
+        (0, n_pred_bboxes_padded - len(true_positives_iou)),
+        mode="constant",
+        constant_values=np.nan,
+    )
+
+    # Pad results for gt_bboxes with False
+    missed_detections_padded = np.pad(
+        missed_detections,
+        (0, n_gt_bboxes_padded - len(missed_detections)),
+        mode="constant",
+        constant_values=False,
     )
+    return tp_fp_pred_bboxes_padded + (
+        missed_detections_padded,
+        true_positives_iou_padded,
+    )
+
+
+def compute_precision_recall_ds(
+    fused_detections_ds: xr.Dataset,
+    gt_bboxes_ds: xr.Dataset,
+    iou_threshold: float,
+) -> tuple[xr.Dataset, xr.Dataset]:
+    """Compute precision and recall per image."""
+    # Compute true positives, false positives, and missed detections
+    fused_detections_ds, gt_bboxes_ds = evaluate_detections_hungarian_ds(
+        pred_bboxes_ds=fused_detections_ds,
+        gt_bboxes_ds=gt_bboxes_ds,
+        iou_threshold=iou_threshold,
+    )
+
+    # Compute precision and recall per image
+    precision_per_img = fused_detections_ds.tp.sum(dim="id") / (
+        fused_detections_ds.tp.sum(dim="id")
+        + fused_detections_ds.fp.sum(dim="id")
+    )
+    recall_per_img = fused_detections_ds.tp.sum(dim="id") / (
+        fused_detections_ds.tp.sum(dim="id") + gt_bboxes_ds.md.sum(dim="id")
+    )
+
+    # Add to datasets
+    fused_detections_ds["precision"] = precision_per_img
+    fused_detections_ds["recall"] = recall_per_img
+
+    return fused_detections_ds, gt_bboxes_ds

From 71a07a9fa51d7122c885736cd0bf918570754f6d Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 29 Jul 2025 19:30:21 +0100
Subject: [PATCH 46/72] Add evaluation to notebook

---
 ethology/detectors/ensembles.py               | 248 ++++++++++++++++
 .../notebook_run_ensemble_on_eval_dataset.py  | 276 ++++++++----------
 2 files changed, 365 insertions(+), 159 deletions(-)
 create mode 100644 ethology/detectors/ensembles.py

diff --git a/ethology/detectors/ensembles.py b/ethology/detectors/ensembles.py
new file mode 100644
index 00000000..3d9d4a2d
--- /dev/null
+++ b/ethology/detectors/ensembles.py
@@ -0,0 +1,248 @@
+"""Utils for ensembles of detectors."""
+
+import numpy as np
+import torch
+import torchvision
+import xarray as xr
+from ensemble_boxes import weighted_boxes_fusion
+
+from ethology.detectors.utils import (
+    add_bboxes_min_max_corners,
+    detections_x1y1_x2y2_as_da_tuple,
+)
+
+
+def wbf_wrapper_arrays(
+    bboxes_x1y1: np.ndarray,
+    bboxes_x2y2: np.ndarray,  # model, annot, 2
+    confidence: np.ndarray,  # model, annot
+    label: np.ndarray,  # model, annot
+    image_width_height: np.ndarray,  # = np.array([4096, 2160]),
+    iou_thr_ensemble: float = 0.5,
+    skip_box_thr: float = 0.0001,
+    max_n_detections: int = 300,  # should be larger than the max number of detections fused per image
+) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
+    """Wrap weighted boxes fusion to receive arrays as input.
+
+    Parameters
+    ----------
+    bboxes_x1y1: np.ndarray
+        Detected bounding boxes in a single image in x1y1 format, with shape
+        n_models, n_annotations, 2.
+    bboxes_x2y2: np.ndarray
+        Detected bounding boxes in a single image in x2y2 format, with shape
+        n_models, n_annotations, 2.
+    confidence: np.ndarray
+        Confidence scores for each bounding box, with shape
+        n_models, n_annotations.
+    label: np.ndarray
+        Labels for each bounding box, with shape n_models, n_annotations.
+    image_width_height: np.ndarray
+        Width and height of the image, with shape 2.
+    iou_thr_ensemble: float
+        IoU threshold for detections to be considered for fusion.
+    skip_box_thr: float
+        Threshold for skipping boxes with confidence below this value.
+    max_n_detections: int
+        Fused bounding boxes arrays are padded to this total number of boxes.
+        Its value should be larger than the expected maximum number of detections
+        per image after fusing across models.
+
+    Returns
+    -------
+    tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]
+        Tuple of xr.DataArrays containing the fused detections. The arrays
+        are padded to max_n_detections and contain the data for the centroid,
+        shape, confidence and label of the fused detections.
+
+    """
+    # Prepare bboxes for WBF
+    bboxes_x1y1_x2y2_normalised = (
+        np.concat([bboxes_x1y1, bboxes_x2y2], axis=-1)
+        / np.tile(image_width_height, (1, 2))
+    )[:, :, :, None]
+
+    # ------------------------------------
+    # Run WBF
+    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
+        weighted_boxes_fusion(
+            bboxes_x1y1_x2y2_normalised,
+            confidence,
+            label,
+            iou_thr=iou_thr_ensemble,
+            skip_box_thr=skip_box_thr,
+        )
+    )
+
+    # ------------------------------------
+    # Undo x1y1 x2y2 normalization
+    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
+        image_width_height, (1, 2)
+    )
+
+    # Combine x1y1, x2y2, scores and labels in one array
+    ensemble_x1y2_x2y2_scores_labels = np.c_[
+        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
+    ]
+
+    # Remove rows with nan coordinates
+    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
+    ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
+        ~slc_nan_rows
+    ]
+
+    # Pad combined array to max_n_detections
+    # (this is required to concatenate across image_ids)
+    ensemble_x1y2_x2y2_scores_labels = np.pad(
+        ensemble_x1y2_x2y2_scores_labels,
+        (
+            (0, max_n_detections - ensemble_x1y2_x2y2_scores_labels.shape[0]),
+            (0, 0),
+        ),
+        "constant",
+        constant_values=np.nan,
+    )
+
+    # Format output as xarray dataarrays
+    centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
+        ensemble_x1y2_x2y2_scores_labels[:, 0:4],
+        ensemble_x1y2_x2y2_scores_labels[:, 4],
+        ensemble_x1y2_x2y2_scores_labels[:, 5],
+    )
+
+    return centroid, shape, confidence, label
+
+
+def combine_detections_across_models_wbf(
+    all_models_detections_ds: xr.Dataset,
+    kwargs_wbf: dict,
+) -> xr.Dataset:
+    """Combine detections across models using weighted boxes fusion.
+
+    Parameters
+    ----------
+    all_models_detections_ds: xr.Dataset
+        Dataset containing the detections from all models. It should contain
+        the following variables: xy_min, xy_max, confidence, label.
+    kwargs_wbf: dict
+        Keyword arguments for the weighted boxes fusion approach. It should
+        contain the following keys:
+        - iou_thr_ensemble: IoU threshold for detections to be considered for fusion.
+        - skip_box_thr: Threshold for skipping boxes with confidence below this value.
+        - max_n_detections: Fused bounding boxes arrays are padded to this total number of boxes.
+        Its value should be larger than the expected maximum number of detections
+        per image after fusing across models.
+
+    Returns
+    -------
+    xr.Dataset
+        Detections dataset containing the fused detections.
+    """
+    # Prepare kwargs
+    kwargs_wbf["image_width_height"] = np.array(
+        [
+            all_models_detections_ds.attrs[img_size]
+            for img_size in ["image_width", "image_height"]
+        ]
+    )
+
+    # Run WBF vectorized
+    centroid_fused, shape_fused, confidence_fused, label_fused = (
+        xr.apply_ufunc(
+            wbf_wrapper_arrays,
+            all_models_detections_ds.xy_min,  # the underlying .data array is passed
+            all_models_detections_ds.xy_max,
+            all_models_detections_ds.confidence,
+            all_models_detections_ds.label,
+            kwargs=kwargs_wbf,
+            input_core_dims=[  # do not broadcast across these
+                ["model", "id", "space"],
+                ["model", "id", "space"],
+                ["model", "id"],
+                ["model", "id"],
+            ],
+            output_core_dims=[
+                ["space", "id"],
+                ["space", "id"],
+                ["id"],
+                ["id"],
+            ],
+            vectorize=True,
+            # loop over non-core dims (i.e. image_id);
+            # assumes function only takes arrays over core dims as input
+            exclude_dims={"id"},
+            # to allow dimensions that change size btw input and output
+        )
+    )
+
+    # Remove pad across annotations
+    centroid_fused = centroid_fused.dropna(dim="id", how="all")
+    shape_fused = shape_fused.dropna(dim="id", how="all")
+    confidence_fused = confidence_fused.dropna(dim="id", how="all")
+    label_fused = label_fused.dropna(dim="id", how="all")
+
+    # Pad labels with -1 rather than nan
+    label_fused = label_fused.fillna(-1).astype(int)
+
+    # Return a dataset
+    return xr.Dataset(
+        data_vars={
+            "position": centroid_fused,
+            "shape": shape_fused,
+            "confidence": confidence_fused,
+            "label": label_fused,
+        }
+    )
+
+
+# def apply_nms_to_detections_ds(
+#     detections_ds: xr.Dataset,
+#     nms_iou_threshold: float = 0.5,
+# ) -> xr.Dataset:
+#     """Apply non-maximum suppression to detections dataset."""
+
+#     def padded_batched_nms(
+#         bboxes: torch.Tensor,
+#         scores: torch.Tensor,
+#         labels: torch.Tensor,
+#         iou_threshold: float,
+#     ) -> torch.Tensor:
+#         n_input_detections = bboxes.shape[0]
+#         idcs_to_keep = torchvision.ops.batched_nms(
+#             bboxes, scores, labels, iou_threshold
+#         )
+#         # pad with -1
+#         idcs_to_keep = torch.nn.functional.pad(
+#             idcs_to_keep,
+#             (0, n_input_detections - idcs_to_keep.shape[0]),
+#             value=-1,
+#         )
+#         return idcs_to_keep
+
+#     # Add xy_min and xy_max if not present
+#     if all(
+#         [
+#             var_str not in detections_ds.variables
+#             for var_str in ["xy_min", "xy_max"]
+#         ]
+#     ):
+#         detections_ds = add_bboxes_min_max_corners(detections_ds)
+
+#     # Prepare input for nms
+#     ensemble_x1y1_x2y2 = xr.concat(
+#         [detections_ds.xy_min, detections_ds.xy_max], dim="space"
+#     ).transpose("image_id", "id", "space")
+
+#     # Apply nms
+#     nms_vectorized = torch.vmap(
+#         padded_batched_nms, in_dims=(0, 0, 0, None)
+#     )
+#     idcs_to_keep = nms_vectorized(
+#         torch.from_numpy(ensemble_x1y1_x2y2.data),
+#         torch.from_numpy(detections_ds.confidence.data),
+#         torch.from_numpy(detections_ds.label.data),
+#         nms_iou_threshold,
+#     )  # idcs per image, sorted by confidence
+
+#     # Return detections dataset with only the detections that are kept
+#     return detections_ds.sel(id=idcs_to_keep)
diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 8e80acdb..2685fb7e 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -7,12 +7,12 @@
 import torch
 import torchvision.transforms.v2 as transforms
 import xarray as xr
-from ensemble_boxes import weighted_boxes_fusion
 from torch.utils.data import DataLoader, random_split
 from tqdm import tqdm
 
 from ethology.datasets.create import create_coco_dataset
 from ethology.detectors.ensembles import combine_detections_across_models_wbf
+from ethology.detectors.evaluate import compute_precision_recall_ds
 from ethology.detectors.inference import (
     collate_fn_varying_n_bboxes,
     concat_detections_ds,
@@ -22,8 +22,6 @@
 from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
 from ethology.detectors.utils import (
     add_bboxes_min_max_corners,
-    detections_x1y1_x2y2_as_da_tuple,
-    detections_x1y1_x2y2_as_ds,
 )
 from ethology.mlflow import (
     read_cli_args_from_mlflow_params,
@@ -307,194 +305,154 @@ def plot_and_save_ensemble_detections(
     },
 )
 
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Define ground truth dataset
 
-# %%
-import torchvision
-
-
-def padded_batched_nms(
-    bboxes: torch.Tensor,
-    scores: torch.Tensor,
-    labels: torch.Tensor,
-    iou_threshold: float,
-) -> torch.Tensor:
-    print(bboxes.shape)
-    print(scores.shape)
-    print(labels.shape)
-
-    n_input_detections = bboxes.shape[0]
-    idcs_to_keep = torchvision.ops.batched_nms(
-        bboxes, scores, labels, iou_threshold
-    )
+from ethology.annotations.io import load_bboxes
 
-    print(idcs_to_keep.shape)
+print(annotations_file_path.name)
+# VIA_JSON_combined_coco_gen_sorted_imageIDs.json -->
+# image_id assigned by sorted filename from 0 to n-1
 
-    # # pad with -1
-    # idcs_to_keep = torch.nn.functional.pad(
-    #     idcs_to_keep,
-    #     (0, n_input_detections - idcs_to_keep.shape[0]),
-    #     value=-1,
-    # )
-    # print(idcs_to_keep)
-    return idcs_to_keep
+# read annotations as a dataset
+gt_bboxes_ds = load_bboxes.from_files(annotations_file_path, format="COCO")
 
+# fix category ID
+gt_bboxes_ds["category"] = gt_bboxes_ds["category"].where(
+    gt_bboxes_ds["category"] != 0, 1
+)
 
-# %%
+# select only image_id in val_dataset
+list_image_ids_val = [annot["image_id"] for img, annot in val_dataset]
+gt_bboxes_val_ds = gt_bboxes_ds.sel(image_id=list_image_ids_val)
 
-# Prepare input for nms
-fused_detections_ds = add_bboxes_min_max_corners(fused_detections_ds)
-ensemble_x1y1_x2y2 = xr.concat(
-    [
-        fused_detections_ds.xy_min,
-        fused_detections_ds.xy_max,
-    ],
-    dim="space",
-).transpose("image_id", "id", "space")
 
+# Alternatively: torch dataset into xarray dataset
+# .....
 
-list_x1y1_x2y2_to_keep = []
-for image_id in range(ensemble_x1y1_x2y2.shape[0]):
-    idcs_to_keep = torchvision.ops.nms(
-        torch.from_numpy(ensemble_x1y1_x2y2.data[image_id]),
-        torch.from_numpy(fused_detections_ds.confidence.data[image_id]),
-        0.1,
-    )
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Evaluate
 
-    list_x1y1_x2y2_to_keep.append(ensemble_x1y1_x2y2.data[image_id][idcs_to_keep,:])
-    print(idcs_to_keep.shape)
+fused_detections_ds, gt_bboxes_val_ds = compute_precision_recall_ds(
+    pred_bboxes_ds=fused_detections_ds,
+    gt_bboxes_ds=gt_bboxes_val_ds,
+    iou_threshold=0.5,
+)
 
 
-# %%
-for idx in range(ensemble_x1y1_x2y2.shape[0]):
-    print(torch.from_numpy(ensemble_x1y1_x2y2.data[idx]).shape)
-    print(torch.from_numpy(fused_detections_ds.confidence.data[idx]).shape)
-    print(sum(fused_detections_ds.label.data[idx] != -1))
-
-    out = torchvision.ops.batched_nms(
-        torch.from_numpy(ensemble_x1y1_x2y2.data[idx]),
-        torch.from_numpy(fused_detections_ds.confidence.data[idx]),
-        torch.from_numpy(fused_detections_ds.label.data[idx]),
-        0.1,
-    )
-    print(out.shape)
-    # out = torch.nn.functional.pad(
-    #     out,
-    #     (0, ensemble_x1y1_x2y2.data[idx].shape[0] - out.shape[0]),
-    #     value=-1,
-    # ) --- not neede with batched_nms?
-    # print(out.shape)
-    print("---")
 
 
-# %%
-# def padded_nms(
-#     bboxes: torch.Tensor,
-#     scores: torch.Tensor,
-#     iou_threshold: float,
-# ) -> torch.Tensor:
-#     print(bboxes.shape)
-#     print(scores.shape)
-
-#     out = torchvision.ops.nms(bboxes, scores, iou_threshold)
-#     out_padded = torch.nn.functional.pad(
-#         out,
-#         (0, bboxes.shape[0] - out.shape[0]),
-#         value=-1,
-#     )
-#     print(out_padded.shape)
-#     return out_padded
-
-vectorised_batched_nms = torch.vmap(
-    torchvision.ops.batched_nms,
-    in_dims=(0, 0, 0, None),
-)
-idcs_to_keep = vectorised_batched_nms(
-    torch.from_numpy(ensemble_x1y1_x2y2.data),
-    torch.from_numpy(fused_detections_ds.confidence.data),
-    torch.from_numpy(fused_detections_ds.label.data),
-    0.1,
-)
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Evaluate
+
+# fused_detections_ds, val_annotations_ds = evaluate_detections_hungarian_ds(
+#     pred_bboxes=fused_detections_ds,
+#     gt_bboxes=gt_bboxes_val_ds,
+#     iou_threshold=0.5,
+# )
+
+# # Add xy_min and xy_max if not present
+# if all(
+#     [
+#         var_str not in fused_detections_ds.variables
+#         for var_str in ["xy_min", "xy_max"]
+#     ]
+# ):
+#     fused_detections_ds = add_bboxes_min_max_corners(fused_detections_ds)
+
+# if all(
+#     [
+#         var_str not in gt_bboxes_val_ds.variables
+#         for var_str in ["xy_min", "xy_max"]
+#     ]
+# ):
+#     gt_bboxes_val_ds = add_bboxes_min_max_corners(gt_bboxes_val_ds)
 
 
 # %%
+# Prepare input for hungarian
+# pred_bboxes_x1y1_x2y2 = xr.concat(
+#     [fused_detections_ds.xy_min, fused_detections_ds.xy_max], dim="space"
+# ).transpose("image_id", "id", "space")
 
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Evaluate
+# # Prepare input for hungarian
+# gt_bboxes_x1y1_x2y2 = xr.concat(
+#     [gt_bboxes_val_ds.xy_min, gt_bboxes_val_ds.xy_max], dim="space"
+# ).transpose("image_id", "id", "space")
 
 
+# # rename id dimension in gt_bboxes_x1y1_x2y2
+# gt_bboxes_x1y1_x2y2 = gt_bboxes_x1y1_x2y2.rename({"id": "id_gt"})
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Fuse detections across models -- Approach 2: vectorized
-# faster but less clear
+# Run hungarian one image
+# OJO False values in arrays are "unreliable"; always use True values
+# print(pred_bboxes_x1y1_x2y2.data[0].shape)
+# print(gt_bboxes_x1y1_x2y2.data[0].shape)
+# tp, fp, md, iou_tp = evaluate_detections_hungarian_arrays(
+#     pred_bboxes_x1y1_x2y2.data[0],
+#     gt_bboxes_x1y1_x2y2.data[0],
+#     iou_threshold=0.5,
+# )
 
+# print("---")
+# print(tp.shape)
+# print(fp.shape)
+# print(iou_tp.shape)
+# print(md.shape)
 
-# %%
-# timeit --- 1.37 s ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-# this will become a fn
-# centroid_fused, shape_fused, confidence_fused, label_fused = xr.apply_ufunc(
-#     wbf_wrapper_arrays,
-#     all_models_detections_ds.xy_min,  # the underlaying .data array is passed
-#     all_models_detections_ds.xy_max,
-#     all_models_detections_ds.confidence,
-#     all_models_detections_ds.label,
-#     kwargs={
-#         "image_width_height": np.array(
-#             [
-#                 all_models_detections_ds.attrs[img_size]
-#                 for img_size in ["image_width", "image_height"]
-#             ]
-#         ),
-#         "iou_thr_ensemble": 0.5,
-#         "skip_box_thr": 0.0001,
-#         "max_n_detections": 300,
-#     },
-#     input_core_dims=[  # do not broadcast across these
-#         ["model", "id", "space"],
-#         ["model", "id", "space"],
-#         ["model", "id"],
-#         ["model", "id"],
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# # Run hungarian vectorized
+
+
+# # def test(pred_bboxes_x1y1_x2y2, gt_bboxes_x1y1_x2y2, iou_threshold):
+# #     print(pred_bboxes_x1y1_x2y2.shape)
+# #     print(gt_bboxes_x1y1_x2y2.shape)
+# #     print(iou_threshold)
+# #     print('----')
+# #     return evaluate_detections_hungarian_arrays(
+# #         pred_bboxes_x1y1_x2y2,
+# #         gt_bboxes_x1y1_x2y2,
+# #         iou_threshold,
+# #     )
+
+
+# tp_array, fp_array, md_array, iou_tp_array = xr.apply_ufunc(
+#     evaluate_detections_hungarian_arrays,
+#     pred_bboxes_x1y1_x2y2,
+#     gt_bboxes_x1y1_x2y2,
+#     kwargs={"iou_threshold": 0.5},
+#     input_core_dims=[
+#         ["id", "space"],
+#         ["id_gt", "space"],
+#     ],
+#     output_core_dims=[
+#         ["id"],
+#         ["id"],
+#         ["id_gt"],
+#         ["id"],
 #     ],
-#     output_core_dims=[["space", "id"], ["space", "id"], ["id"], ["id"]],
 #     vectorize=True,
-#     # loop over non-core dims (i.e. image_id);
-#     # assumes function only takes arrays over core dims as input
-#     exclude_dims={"id"},
-#     # to allow dimensions that change size btw input and output
+#     exclude_dims={"id", "id_gt"},
 # )
 
 
-# # Remove pad across annotations
-# centroid_fused = centroid_fused.dropna(dim="id", how="all")
-# shape_fused = shape_fused.dropna(dim="id", how="all")
-# confidence_fused = confidence_fused.dropna(dim="id", how="all")
-# label_fused = label_fused.dropna(dim="id", how="all")
-
-
-# # Pad labels with -1 rather than nan
-# label_fused = label_fused.fillna(-1)
-
-
-# # Return a dataset
-# fused_detections_ds = xr.Dataset(
-#     data_vars={
-#         "position": centroid_fused,
-#         "shape": shape_fused,
-#         "confidence": confidence_fused,
-#         "label": label_fused,
-#     }
+# # %%
+# # Add to dataset
+# fused_detections_ds["tp"] = xr.DataArray(tp_array, dims=["image_id", "id"])
+# fused_detections_ds["fp"] = xr.DataArray(fp_array, dims=["image_id", "id"])
+# fused_detections_ds["iou_tp"] = xr.DataArray(
+#     iou_tp_array, dims=["image_id", "id"]
 # )
 
-# print(fused_detections_ds)
 
-# %%
-# %%
-# Evaluate detections with hungarian
+# # rename id dimension in md_array
+# md_array = md_array.rename({"id_gt": "id"})
+# gt_bboxes_val_ds["md"] = xr.DataArray(md_array, dims=["image_id", "id"])
 
-# ensemble_detections_ds = add_bboxes_min_max_corners(ensemble_detections_ds)
 
-# add tp, fp, tp_iou as arrays to dataset?
-# tp, fp, md, _ = evaluate_detections_hungarian(
-#         ensemble_x1_y1_x2_y2, annots["boxes"], iou_threshold_precision
-#     )
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

From f525f34cd44c5333643abd5e56f74a780985fc2a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 29 Jul 2025 18:30:51 +0000
Subject: [PATCH 47/72] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ethology/detectors/ensembles.py                    | 4 +---
 notebooks/notebook_run_ensemble_on_eval_dataset.py | 3 ---
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/ethology/detectors/ensembles.py b/ethology/detectors/ensembles.py
index 3d9d4a2d..5dc1ee8c 100644
--- a/ethology/detectors/ensembles.py
+++ b/ethology/detectors/ensembles.py
@@ -1,13 +1,10 @@
 """Utils for ensembles of detectors."""
 
 import numpy as np
-import torch
-import torchvision
 import xarray as xr
 from ensemble_boxes import weighted_boxes_fusion
 
 from ethology.detectors.utils import (
-    add_bboxes_min_max_corners,
     detections_x1y1_x2y2_as_da_tuple,
 )
 
@@ -137,6 +134,7 @@ def combine_detections_across_models_wbf(
     -------
     xr.Dataset
         Detections dataset containing the fused detections.
+
     """
     # Prepare kwargs
     kwargs_wbf["image_width_height"] = np.array(
diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 2685fb7e..8a1e5480 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -340,8 +340,6 @@ def plot_and_save_ensemble_detections(
 )
 
 
-
-
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Evaluate
 
@@ -454,7 +452,6 @@ def plot_and_save_ensemble_detections(
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 
-
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # plot ensemble detections on first image
 

From 3fd024e7232ae2659be2c75619c40f53471d4b39 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 29 Jul 2025 20:08:29 +0100
Subject: [PATCH 48/72] Clean up ensemble notebook

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 183 +-----------------
 1 file changed, 9 insertions(+), 174 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 2685fb7e..9f76d021 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -10,6 +10,7 @@
 from torch.utils.data import DataLoader, random_split
 from tqdm import tqdm
 
+from ethology.annotations.io import load_bboxes
 from ethology.datasets.create import create_coco_dataset
 from ethology.detectors.ensembles import combine_detections_across_models_wbf
 from ethology.detectors.evaluate import compute_precision_recall_ds
@@ -308,7 +309,6 @@ def plot_and_save_ensemble_detections(
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Define ground truth dataset
 
-from ethology.annotations.io import load_bboxes
 
 print(annotations_file_path.name)
 # VIA_JSON_combined_coco_gen_sorted_imageIDs.json -->
@@ -336,202 +336,37 @@ def plot_and_save_ensemble_detections(
 fused_detections_ds, gt_bboxes_val_ds = compute_precision_recall_ds(
     pred_bboxes_ds=fused_detections_ds,
     gt_bboxes_ds=gt_bboxes_val_ds,
-    iou_threshold=0.5,
+    iou_threshold=0.1, # change to 0.5?
 )
 
 
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Evaluate
-
-# fused_detections_ds, val_annotations_ds = evaluate_detections_hungarian_ds(
-#     pred_bboxes=fused_detections_ds,
-#     gt_bboxes=gt_bboxes_val_ds,
-#     iou_threshold=0.5,
-# )
-
-# # Add xy_min and xy_max if not present
-# if all(
-#     [
-#         var_str not in fused_detections_ds.variables
-#         for var_str in ["xy_min", "xy_max"]
-#     ]
-# ):
-#     fused_detections_ds = add_bboxes_min_max_corners(fused_detections_ds)
-
-# if all(
-#     [
-#         var_str not in gt_bboxes_val_ds.variables
-#         for var_str in ["xy_min", "xy_max"]
-#     ]
-# ):
-#     gt_bboxes_val_ds = add_bboxes_min_max_corners(gt_bboxes_val_ds)
-
-
-# %%
-# Prepare input for hungarian
-# pred_bboxes_x1y1_x2y2 = xr.concat(
-#     [fused_detections_ds.xy_min, fused_detections_ds.xy_max], dim="space"
-# ).transpose("image_id", "id", "space")
-
-# # Prepare input for hungarian
-# gt_bboxes_x1y1_x2y2 = xr.concat(
-#     [gt_bboxes_val_ds.xy_min, gt_bboxes_val_ds.xy_max], dim="space"
-# ).transpose("image_id", "id", "space")
-
-
-# # rename id dimension in gt_bboxes_x1y1_x2y2
-# gt_bboxes_x1y1_x2y2 = gt_bboxes_x1y1_x2y2.rename({"id": "id_gt"})
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Run hungarian one image
-# OJO False values in arrays are "unreliable"; always use True values
-# print(pred_bboxes_x1y1_x2y2.data[0].shape)
-# print(gt_bboxes_x1y1_x2y2.data[0].shape)
-# tp, fp, md, iou_tp = evaluate_detections_hungarian_arrays(
-#     pred_bboxes_x1y1_x2y2.data[0],
-#     gt_bboxes_x1y1_x2y2.data[0],
-#     iou_threshold=0.5,
-# )
-
-# print("---")
-# print(tp.shape)
-# print(fp.shape)
-# print(iou_tp.shape)
-# print(md.shape)
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# # Run hungarian vectorized
-
-
-# # def test(pred_bboxes_x1y1_x2y2, gt_bboxes_x1y1_x2y2, iou_threshold):
-# #     print(pred_bboxes_x1y1_x2y2.shape)
-# #     print(gt_bboxes_x1y1_x2y2.shape)
-# #     print(iou_threshold)
-# #     print('----')
-# #     return evaluate_detections_hungarian_arrays(
-# #         pred_bboxes_x1y1_x2y2,
-# #         gt_bboxes_x1y1_x2y2,
-# #         iou_threshold,
-# #     )
-
-
-# tp_array, fp_array, md_array, iou_tp_array = xr.apply_ufunc(
-#     evaluate_detections_hungarian_arrays,
-#     pred_bboxes_x1y1_x2y2,
-#     gt_bboxes_x1y1_x2y2,
-#     kwargs={"iou_threshold": 0.5},
-#     input_core_dims=[
-#         ["id", "space"],
-#         ["id_gt", "space"],
-#     ],
-#     output_core_dims=[
-#         ["id"],
-#         ["id"],
-#         ["id_gt"],
-#         ["id"],
-#     ],
-#     vectorize=True,
-#     exclude_dims={"id", "id_gt"},
-# )
-
-
-# # %%
-# # Add to dataset
-# fused_detections_ds["tp"] = xr.DataArray(tp_array, dims=["image_id", "id"])
-# fused_detections_ds["fp"] = xr.DataArray(fp_array, dims=["image_id", "id"])
-# fused_detections_ds["iou_tp"] = xr.DataArray(
-#     iou_tp_array, dims=["image_id", "id"]
-# )
-
-
-# # rename id dimension in md_array
-# md_array = md_array.rename({"id_gt": "id"})
-# gt_bboxes_val_ds["md"] = xr.DataArray(md_array, dims=["image_id", "id"])
-
-
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# plot ensemble detections on first image
+# plot ensemble detections on a selected image
 
 # Get first image
-image_index = 25
+image_index = 0
 image = val_dataset[image_index][0]
 gt_annotations = val_dataset[image_index][1]
 
-fused_detections_ds_plot = add_bboxes_min_max_corners(fused_detections_ds)
+# fused_detections_ds_plot = add_bboxes_min_max_corners(fused_detections_ds)
 
 plot_and_save_ensemble_detections(
     image=image,
     gt_boxes_x1_y1_x2_y2=gt_annotations["boxes"],
     pred_boxes_x1_y1_x2_y2=np.hstack(
         [
-            fused_detections_ds_plot[xy_corner_str]
+            fused_detections_ds[xy_corner_str]
             .isel(image_id=image_index)
             .values.T
             for xy_corner_str in ["xy_min", "xy_max"]
         ]
     ),
-    pred_boxes_scores=fused_detections_ds_plot.isel(
+    pred_boxes_scores=fused_detections_ds.isel(
         image_id=image_index
     ).confidence.values,
     image_id=gt_annotations["image_id"],
     output_dir=Path.cwd(),
-    precision=0.0,
-    recall=0.0,
-)
-
-# %%
-
-
-# %%
-# # Combine detections with WBF
-# detections_ds = run_ensemble_of_detectors_on_dataset(
-#     list_models,
-#     dataset,  # could be list too
-#     device,   # ensure models and dataset are placed on this device?
-#     ensemble_boxes_method="wbf",
-#     **ensemble_boxes_kwargs,
-# )
-
-
-# detections_ds = run_ensemble_of_detectors_on_dataloader(
-#     list_models,
-#     dataset,  # could be list too
-#     device,   # ensure models and dataset are placed on this device?
-#     ensemble_boxes_method="wbf",
-#     **ensemble_boxes_kwargs,
-# )
-
-
-# %%
-
-x1y1_x2y2_fused = xr.apply_ufunc(
-    test,
-    all_models_detections_ds.image_id,
-    x1y1x2y2_norm.transpose(
-        "model", "id", "space", "image_id"
-    ),  # place broadcast dims at the end
-    all_models_detections_ds.confidence.transpose("model", "id", "image_id"),
-    all_models_detections_ds.label.transpose("model", "id", "image_id"),
-    input_core_dims=[
-        [],  # do not exclude any dimensions
-        ["model", "id", "space"],  # do not broadcast across these
-        ["model", "id"],
-        ["model", "id"],
-    ],
-    output_core_dims=[["space", "id"]],
-    vectorize=True,  # loop over non-core dims
-    exclude_dims={
-        "id"
-    },  # to allow dimensions that change size btw input and output
+    precision=fused_detections_ds.isel(image_id=image_index).precision.values,
+    recall=fused_detections_ds.isel(image_id=image_index).recall.values,
 )
 
-
-print(x1y1_x2y2_fused.shape)  # image_id, 4, padded_id
-
-# Can I remove the excessive pad?

From 6405a0665a95697af98bc8f03d25b78d974154ad Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Tue, 29 Jul 2025 20:09:18 +0100
Subject: [PATCH 49/72] Update binned notebook

---
 notebooks/notebook_evaluate_binned_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/notebooks/notebook_evaluate_binned_performance.py b/notebooks/notebook_evaluate_binned_performance.py
index f96e81c8..caf1fd43 100644
--- a/notebooks/notebook_evaluate_binned_performance.py
+++ b/notebooks/notebook_evaluate_binned_performance.py
@@ -17,7 +17,7 @@
 from torch.utils.data import random_split
 
 from ethology.datasets.create import create_coco_dataset
-from ethology.detectors.evaluate import evaluate_detections_hungarian
+from ethology.detectors.evaluate import evaluate_detections_hungarian_arrays
 from ethology.mlflow import (
     read_cli_args_from_mlflow_params,
     read_config_from_mlflow_params,
@@ -117,7 +117,7 @@ def compute_pred_gt_tables(iou_threshold, ds_predictions, val_dataset):
         gt_bboxes = annotations["boxes"].cpu().numpy()
 
         # Evaluate detections
-        tp, fp, md, _ = evaluate_detections_hungarian(
+        tp, fp, md, _ = evaluate_detections_hungarian_arrays(
             pred_bboxes, gt_bboxes, iou_threshold
         )
 

From 95e553bc4f57ceb351b20e4323d16f5e750d3708 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 29 Jul 2025 19:13:40 +0000
Subject: [PATCH 50/72] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 notebooks/notebook_run_ensemble_on_eval_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 07652bc1..fa547d0f 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -259,7 +259,7 @@ def plot_and_save_ensemble_detections(
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Define val dataloader
-# shuffle=False so that we dont shuffle the data 
+# shuffle=False so that we dont shuffle the data
 # after one pass over all batches
 val_dataloader = DataLoader(
     val_dataset,

From 9aecac8b16f925d80d8763fc2220c0354dff9556 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 30 Jul 2025 12:21:15 +0100
Subject: [PATCH 51/72] Convert torch dataset to detections dataset

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 62 ++++++++++++++++++-
 1 file changed, 60 insertions(+), 2 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 07652bc1..8ee18bb0 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -11,6 +11,7 @@
 from tqdm import tqdm
 
 from ethology.annotations.io import load_bboxes
+from ethology.datasets.convert import torch_dataset_to_xr_dataset
 from ethology.datasets.create import create_coco_dataset
 from ethology.detectors.ensembles import combine_detections_across_models_wbf
 from ethology.detectors.evaluate import compute_precision_recall_ds
@@ -259,7 +260,7 @@ def plot_and_save_ensemble_detections(
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Define val dataloader
-# shuffle=False so that we dont shuffle the data 
+# shuffle=False so that we dont shuffle the data
 # after one pass over all batches
 val_dataloader = DataLoader(
     val_dataset,
@@ -274,6 +275,60 @@ def plot_and_save_ensemble_detections(
 )
 
 
+# %%
+list_annot = [annot for img, annot in val_dataset]
+
+
+df_annot = pd.DataFrame(list_annot)
+df_annot["centroid"] = df_annot["boxes"].apply(
+    lambda x: (0.5 * (x[:, 0:2] + x[:, 2:4])).numpy().astype(float)
+)
+df_annot["shape"] = df_annot["boxes"].apply(
+    lambda x: (x[:, 2:4] - x[:, 0:2]).numpy().astype(float)
+)
+df_annot["labels"] = df_annot["labels"].apply(
+    lambda x: x.numpy().reshape(-1, 1).astype(int)
+)
+
+df_annot["n_annotations"] = df_annot["boxes"].apply(lambda x: x.shape[0])
+n_max_annotations_per_image = df_annot["n_annotations"].max()
+
+# %%
+array_dict = {}
+map_name_to_padding = {
+    "centroid": np.nan,
+    "shape": np.nan,
+    "labels": -1,
+}
+for array_name in map_name_to_padding:
+    array_dict[array_name] = np.stack(
+        [
+            np.pad(
+                arr,
+                ((0, n_max_annotations_per_image - arr.shape[0]), (0, 0)),
+                mode="constant",
+                constant_values=map_name_to_padding[array_name],
+            ).T
+            for arr in df_annot[array_name].to_list()
+        ]
+    )
+
+
+# %%
+val_ds = xr.Dataset(
+    data_vars={
+        "position": (["image_id", "space", "id"], array_dict["centroid"]),
+        "shape": (["image_id", "space", "id"], array_dict["shape"]),
+        "label": (["image_id", "id"], array_dict["labels"].squeeze()),
+    },
+    coords={
+        "image_id": df_annot["image_id"].values,
+        "space": ["x", "y"],
+        "id": range(n_max_annotations_per_image),
+    },
+)
+
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Compute detections per model -- can I make it faster?
 # can I vectorize this? (pytorch forum question)
@@ -327,10 +382,13 @@ def plot_and_save_ensemble_detections(
 list_image_ids_val = [annot["image_id"] for img, annot in val_dataset]
 gt_bboxes_val_ds = gt_bboxes_ds.sel(image_id=list_image_ids_val)
 
-
+# %%
 # Alternatively: torch dataset into xarray dataset
 # .....
 
+
+assert gt_bboxes_val_ds.equals(val_ds)
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Evaluate
 

From b9e4a3cc1a883e4d75d800fce8a598ee7c954570 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 30 Jul 2025 16:09:49 +0100
Subject: [PATCH 52/72] Clean up and add adhoc test

---
 ethology/datasets/convert.py                  | 103 +++++++++++++++++
 .../notebook_run_ensemble_on_eval_dataset.py  | 109 ++++++++----------
 2 files changed, 148 insertions(+), 64 deletions(-)
 create mode 100644 ethology/datasets/convert.py

diff --git a/ethology/datasets/convert.py b/ethology/datasets/convert.py
new file mode 100644
index 00000000..7c83ca4d
--- /dev/null
+++ b/ethology/datasets/convert.py
@@ -0,0 +1,103 @@
+"""Convert between dataset formats."""
+
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import torch
+import xarray as xr
+
+
+def torch_dataset_to_xr_dataset(
+    torch_dataset: torch.utils.data.Dataset,
+) -> xr.Dataset:
+    """Convert a torch dataset to an xarray detections dataset."""
+    # Read list of annotations as a dataframe
+    list_annot = [annot for _img, annot in torch_dataset]
+    df_annot = pd.DataFrame(list_annot)
+
+    # Compute centroid, shape and labels
+    df_annot["centroid"] = df_annot["boxes"].apply(
+        lambda x: (0.5 * (x[:, 0:2] + x[:, 2:4])).numpy().astype(float)
+    )
+    df_annot["shape"] = df_annot["boxes"].apply(
+        lambda x: (x[:, 2:4] - x[:, 0:2]).numpy().astype(float)
+    )
+    df_annot["labels"] = df_annot["labels"].apply(
+        lambda x: x.numpy().reshape(-1, 1).astype(int)
+    )
+
+    # Compute maximum number of annotations per image
+    df_annot["n_annotations"] = df_annot["boxes"].apply(lambda x: x.shape[0])
+    n_max_annotations = df_annot["n_annotations"].max()
+
+    # Pad arrays to n_max_annotations
+    array_dict = {}
+    map_name_to_padding = {
+        "centroid": np.nan,
+        "shape": np.nan,
+        "labels": -1,
+    }
+    for array_name in map_name_to_padding:
+        array_dict[array_name] = np.stack(
+            [
+                np.pad(
+                    arr,
+                    ((0, n_max_annotations - arr.shape[0]), (0, 0)),
+                    mode="constant",
+                    constant_values=map_name_to_padding[array_name],
+                ).T
+                for arr in df_annot[array_name].to_list()
+            ]
+        )
+
+    # Build dataset; labels: index axis 1, as squeeze() drops all size-1 dims
+    xr_dataset = xr.Dataset(
+        data_vars={
+            "position": (["image_id", "space", "id"], array_dict["centroid"]),
+            "shape": (["image_id", "space", "id"], array_dict["shape"]),
+            "category": (["image_id", "id"], array_dict["labels"][:, 0]),
+        },
+        coords={
+            "image_id": df_annot["image_id"].values,
+            "space": ["x", "y"],
+            "id": range(n_max_annotations),
+        },
+    )
+
+    # Add metadata
+    root = find_nested_root(torch_dataset)
+    if root:
+        xr_dataset.attrs["images_directories"] = root
+
+    return xr_dataset
+
+
+def find_nested_root(dataset: torch.utils.data.Dataset) -> str | Path | None:
+    """Find root of a possibly nested dataset.
+
+    Parameters
+    ----------
+    dataset : torch.utils.data.Dataset
+        The dataset to check. It may be the result of multiple
+        splits, and therefore be nested.
+
+    Returns
+    -------
+    str or Path or None
+        The nested root value for the dataset, or None if not found
+
+    """
+    current = dataset
+
+    # Check current level
+    if hasattr(current, "root"):
+        return current.root
+
+    # Check through dataset levels
+    while hasattr(current, "dataset"):
+        current = current.dataset
+        if hasattr(current, "root"):
+            return current.root
+
+    return None
diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 8ee18bb0..86812c31 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -274,61 +274,6 @@ def plot_and_save_ensemble_detections(
     # else None,  # see https://github.com/pytorch/pytorch/issues/87688
 )
 
-
-# %%
-list_annot = [annot for img, annot in val_dataset]
-
-
-df_annot = pd.DataFrame(list_annot)
-df_annot["centroid"] = df_annot["boxes"].apply(
-    lambda x: (0.5 * (x[:, 0:2] + x[:, 2:4])).numpy().astype(float)
-)
-df_annot["shape"] = df_annot["boxes"].apply(
-    lambda x: (x[:, 2:4] - x[:, 0:2]).numpy().astype(float)
-)
-df_annot["labels"] = df_annot["labels"].apply(
-    lambda x: x.numpy().reshape(-1, 1).astype(int)
-)
-
-df_annot["n_annotations"] = df_annot["boxes"].apply(lambda x: x.shape[0])
-n_max_annotations_per_image = df_annot["n_annotations"].max()
-
-# %%
-array_dict = {}
-map_name_to_padding = {
-    "centroid": np.nan,
-    "shape": np.nan,
-    "labels": -1,
-}
-for array_name in map_name_to_padding:
-    array_dict[array_name] = np.stack(
-        [
-            np.pad(
-                arr,
-                ((0, n_max_annotations_per_image - arr.shape[0]), (0, 0)),
-                mode="constant",
-                constant_values=map_name_to_padding[array_name],
-            ).T
-            for arr in df_annot[array_name].to_list()
-        ]
-    )
-
-
-# %%
-val_ds = xr.Dataset(
-    data_vars={
-        "position": (["image_id", "space", "id"], array_dict["centroid"]),
-        "shape": (["image_id", "space", "id"], array_dict["shape"]),
-        "label": (["image_id", "id"], array_dict["labels"].squeeze()),
-    },
-    coords={
-        "image_id": df_annot["image_id"].values,
-        "space": ["x", "y"],
-        "id": range(n_max_annotations_per_image),
-    },
-)
-
-
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Compute detections per model -- can I make it faster?
 # can I vectorize this? (pytorch forum question)
@@ -358,19 +303,15 @@ def plot_and_save_ensemble_detections(
     kwargs_wbf={
         "iou_thr_ensemble": 0.5,
         "skip_box_thr": 0.0001,
-        "max_n_detections": 300,
+        "max_n_detections": 300,  # set default?
     },
 )
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Define ground truth dataset
 
-
-print(annotations_file_path.name)
-# VIA_JSON_combined_coco_gen_sorted_imageIDs.json -->
-# image_id assigned by sorted filename from 0 to n-1
-
 # read annotations as a dataset
+print(annotations_file_path.name)
 gt_bboxes_ds = load_bboxes.from_files(annotations_file_path, format="COCO")
 
 # fix category ID
@@ -379,15 +320,55 @@ def plot_and_save_ensemble_detections(
 )
 
 # select only image_id in val_dataset
-list_image_ids_val = [annot["image_id"] for img, annot in val_dataset]
+# Note that the max number of annotations per image in the val_dataset
+# will stay as in the original dataset (also, category = -1 is not considered
+# an empty value for xarrays .dropna())
+list_image_ids_val = [annot["image_id"] for img, annot in val_dataset] 
 gt_bboxes_val_ds = gt_bboxes_ds.sel(image_id=list_image_ids_val)
 
+
+# # %%
+# list_image_ids_test = [annot["image_id"] for img, annot in test_dataset]
+# list_image_ids_train = [annot["image_id"] for img, annot in train_dataset]
+
+# gt_bboxes_test_ds = gt_bboxes_ds.sel(image_id=list_image_ids_test)
+# gt_bboxes_train_ds = gt_bboxes_ds.sel(image_id=list_image_ids_train)
+
 # %%
-# Alternatively: torch dataset into xarray dataset
+# Alternatively: convert torch dataset into xarray detections dataset
 # .....
 
+val_ds = torch_dataset_to_xr_dataset(val_dataset)  # max -- annotations per image
+test_ds = torch_dataset_to_xr_dataset(test_dataset)  # max 129 annotations per image
+train_ds = torch_dataset_to_xr_dataset(train_dataset)  # max 136 annotations per image
+
+
+# %%
+# check data arrays are the same but with annotations in different order
+# There is no guarantee that annotation with id=15 is the same in the
+# xr dataset computed from the annotations file and the one computed from
+# the torch dataset.
+for idx in range(len(val_ds.image_id.values)):
+    idcs_sorted_x1 = np.lexsort(
+        (
+            gt_bboxes_val_ds.position.values[idx, 1, :],
+            gt_bboxes_val_ds.position.values[idx, 0, :],
+        )
+    )  # sort by x, then y
+    idcs_sorted_x2 = np.lexsort(
+        (
+            val_ds.position.values[idx, 1, :],
+            val_ds.position.values[idx, 0, :],
+        )
+    )  # sort by x, then y
+    assert np.allclose(
+        gt_bboxes_val_ds.position.values[idx, :, idcs_sorted_x1],
+        val_ds.position.values[idx, :, idcs_sorted_x2],
+        equal_nan=True,
+        rtol=1e-5,
+        atol=1e-8,
+    )
 
-assert gt_bboxes_val_ds.equals(val_ds)
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Evaluate

From f21b2a56bec345d0cda84923ac357285de23400d Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 30 Jul 2025 16:12:01 +0100
Subject: [PATCH 53/72] Change variable names

---
 ethology/detectors/evaluate.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/ethology/detectors/evaluate.py b/ethology/detectors/evaluate.py
index f03f4403..e59ffbbe 100644
--- a/ethology/detectors/evaluate.py
+++ b/ethology/detectors/evaluate.py
@@ -205,29 +205,29 @@ def evaluate_detections_hungarian_arrays(
 
 
 def compute_precision_recall_ds(
-    fused_detections_ds: xr.Dataset,
+    pred_bboxes_ds: xr.Dataset,
     gt_bboxes_ds: xr.Dataset,
     iou_threshold: float,
 ) -> tuple[xr.Dataset, xr.Dataset]:
     """Compute precision and recall per image."""
     # Compute true positives, false positives, and missed detections
-    fused_detections_ds, gt_bboxes_ds = evaluate_detections_hungarian_ds(
-        pred_bboxes_ds=fused_detections_ds,
+    pred_bboxes_ds, gt_bboxes_ds = evaluate_detections_hungarian_ds(
+        pred_bboxes_ds=pred_bboxes_ds,
         gt_bboxes_ds=gt_bboxes_ds,
         iou_threshold=iou_threshold,
     )
 
     # Compute precision and recall per image
-    precision_per_img = fused_detections_ds.tp.sum(dim="id") / (
-        fused_detections_ds.tp.sum(dim="id")
-        + fused_detections_ds.fp.sum(dim="id")
+    precision_per_img = pred_bboxes_ds.tp.sum(dim="id") / (
+        pred_bboxes_ds.tp.sum(dim="id")
+        + pred_bboxes_ds.fp.sum(dim="id")
     )
-    recall_per_img = fused_detections_ds.tp.sum(dim="id") / (
-        fused_detections_ds.tp.sum(dim="id") + gt_bboxes_ds.md.sum(dim="id")
+    recall_per_img = pred_bboxes_ds.tp.sum(dim="id") / (
+        pred_bboxes_ds.tp.sum(dim="id") + gt_bboxes_ds.md.sum(dim="id")
     )
 
     # Add to datasets
-    fused_detections_ds["precision"] = precision_per_img
-    fused_detections_ds["recall"] = recall_per_img
+    pred_bboxes_ds["precision"] = precision_per_img
+    pred_bboxes_ds["recall"] = recall_per_img
 
-    return fused_detections_ds, gt_bboxes_ds
+    return pred_bboxes_ds, gt_bboxes_ds

From 18ce9a76208a73146b4fd916e67da616c48984c9 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 30 Jul 2025 17:34:17 +0100
Subject: [PATCH 54/72] Run on OOD data

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 168 ++++++++++--------
 1 file changed, 98 insertions(+), 70 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 86812c31..cd4482be 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -30,6 +30,7 @@
     read_config_from_mlflow_params,
     read_mlflow_params,
 )
+import matplotlib.pyplot as plt
 
 # Set xarray options
 xr.set_options(display_expand_attrs=False)
@@ -37,11 +38,13 @@
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Input data
 
-dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
-annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
-annotations_file_path = (
-    annotations_dir / "VIA_JSON_combined_coco_gen_sorted_imageIDs.json"
+dataset_dir = Path("/home/sminano/swc/project_crabs/data/aug2023-full")
+# Path("/home/sminano/swc/project_crabs/data/sep2023-full")
+annotations_dir = Path(
+    "/home/sminano/swc/project_crabs/data/aug2023-full/annotations"
 )
+# Path("/home/sminano/swc/project_ethology/large_annotations")
+annotations_file_path = annotations_dir / "VIA_JSON_combined_coco_gen.json"
 
 experiment_ID = "617393114420881798"
 ml_runs_experiment_dir = (
@@ -113,6 +116,7 @@ def plot_and_save_ensemble_detections(
     output_dir,
     precision,
     recall,
+    extra_str="",
 ):
     """Plot ground truth and ensemble detections on image and save as PNG."""
     # Convert tensor to numpy array and transpose from (C, H, W) to (H, W, C)
@@ -186,7 +190,8 @@ def plot_and_save_ensemble_detections(
     )
 
     # Save the image as PNG
-    output_filename = output_dir / f"val_set_{image_id:06d}.png"
+    extra_str = f"{extra_str}_" if extra_str else ""
+    output_filename = output_dir / f"val_set_{extra_str}{image_id:06d}.png"
     cv2.imwrite(str(output_filename), image_cv)
     print(f"Saved: {output_filename}")
 
@@ -255,7 +260,10 @@ def plot_and_save_ensemble_detections(
 train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
     dataset_coco,
     seed_n=ref_cli_args["seed_n"],
-    config=ref_config,  # only uses train_fraction and val_over_test_fraction
+    config={
+        "train_fraction": 0.0,
+        "val_over_test_fraction": 1.0,
+    },  # only uses train_fraction and val_over_test_fraction
 )
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -314,7 +322,7 @@ def plot_and_save_ensemble_detections(
 print(annotations_file_path.name)
 gt_bboxes_ds = load_bboxes.from_files(annotations_file_path, format="COCO")
 
-# fix category ID
+# fix category ID (to be fixed in loader)
 gt_bboxes_ds["category"] = gt_bboxes_ds["category"].where(
     gt_bboxes_ds["category"] != 0, 1
 )
@@ -323,51 +331,40 @@ def plot_and_save_ensemble_detections(
 # Note that the max number of annotations per image in the val_dataset
 # will stay as in the original dataset (also, category = -1 is not considered
 # an empty value for xarrays .dropna())
-list_image_ids_val = [annot["image_id"] for img, annot in val_dataset] 
+list_image_ids_val = [annot["image_id"] for img, annot in val_dataset]
 gt_bboxes_val_ds = gt_bboxes_ds.sel(image_id=list_image_ids_val)
 
 
-# # %%
-# list_image_ids_test = [annot["image_id"] for img, annot in test_dataset]
-# list_image_ids_train = [annot["image_id"] for img, annot in train_dataset]
-
-# gt_bboxes_test_ds = gt_bboxes_ds.sel(image_id=list_image_ids_test)
-# gt_bboxes_train_ds = gt_bboxes_ds.sel(image_id=list_image_ids_train)
-
 # %%
 # Alternatively: convert torch dataset into xarray detections dataset
 # .....
+val_ds = torch_dataset_to_xr_dataset(val_dataset)
 
-val_ds = torch_dataset_to_xr_dataset(val_dataset)  # max -- annotations per image
-test_ds = torch_dataset_to_xr_dataset(test_dataset)  # max 129 annotations per image
-train_ds = torch_dataset_to_xr_dataset(train_dataset)  # max 136 annotations per image
-
-
-# %%
-# check data arrays are the same but with annotations in different order
-# There is no guarantee that annotation with id=15 is the same in the
-# xr dataset computed from the annotations file and the one computed from
-# the torch dataset.
-for idx in range(len(val_ds.image_id.values)):
-    idcs_sorted_x1 = np.lexsort(
-        (
-            gt_bboxes_val_ds.position.values[idx, 1, :],
-            gt_bboxes_val_ds.position.values[idx, 0, :],
-        )
-    )  # sort by x, then y
-    idcs_sorted_x2 = np.lexsort(
-        (
-            val_ds.position.values[idx, 1, :],
-            val_ds.position.values[idx, 0, :],
-        )
-    )  # sort by x, then y
-    assert np.allclose(
-        gt_bboxes_val_ds.position.values[idx, :, idcs_sorted_x1],
-        val_ds.position.values[idx, :, idcs_sorted_x2],
-        equal_nan=True,
-        rtol=1e-5,
-        atol=1e-8,
-    )
+# # %%
+# # check data arrays are the same but with annotations in different order
+# # There is no guarantee that annotation with id=15 is the same in the
+# # xr dataset computed from the annotations file and the one computed from
+# # the torch dataset.
+# for idx in range(len(val_ds.image_id.values)):
+#     idcs_sorted_x1 = np.lexsort(
+#         (
+#             gt_bboxes_val_ds.position.values[idx, 1, :],
+#             gt_bboxes_val_ds.position.values[idx, 0, :],
+#         )
+#     )  # sort by x, then y
+#     idcs_sorted_x2 = np.lexsort(
+#         (
+#             val_ds.position.values[idx, 1, :],
+#             val_ds.position.values[idx, 0, :],
+#         )
+#     )  # sort by x, then y
+#     assert np.allclose(
+#         gt_bboxes_val_ds.position.values[idx, :, idcs_sorted_x1],
+#         val_ds.position.values[idx, :, idcs_sorted_x2],
+#         equal_nan=True,
+#         rtol=1e-5,
+#         atol=1e-8,
+#     )
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -380,32 +377,63 @@ def plot_and_save_ensemble_detections(
 )
 
 
+# %%
+print(f"Precision: {fused_detections_ds.precision.mean().values:.4f}")
+print(f"Recall: {fused_detections_ds.recall.mean().values:.4f}")
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # plot ensemble detections on a selected image
 
-# Get first image
-image_index = 0
-image = val_dataset[image_index][0]
-gt_annotations = val_dataset[image_index][1]
-
-# fused_detections_ds_plot = add_bboxes_min_max_corners(fused_detections_ds)
+# idcs_low_precision = np.argwhere(fused_detections_ds.precision.data < 0.5)
+# idcs_high_precision = np.argwhere(fused_detections_ds.precision.data > 0.9)
+idcs_imgs_increasing_precision = np.argsort(fused_detections_ds.precision.data)
 
-plot_and_save_ensemble_detections(
-    image=image,
-    gt_boxes_x1_y1_x2_y2=gt_annotations["boxes"],
-    pred_boxes_x1_y1_x2_y2=np.hstack(
-        [
-            fused_detections_ds[xy_corner_str]
-            .isel(image_id=image_index)
-            .values.T
-            for xy_corner_str in ["xy_min", "xy_max"]
-        ]
-    ),
-    pred_boxes_scores=fused_detections_ds.isel(
-        image_id=image_index
-    ).confidence.values,
-    image_id=gt_annotations["image_id"],
-    output_dir=Path.cwd(),
-    precision=fused_detections_ds.isel(image_id=image_index).precision.values,
-    recall=fused_detections_ds.isel(image_id=image_index).recall.values,
-)
+# Get first image
+for i in list(range(0, len(idcs_imgs_increasing_precision), 50)) + [
+    len(idcs_imgs_increasing_precision) - 1
+]:
+    image_index = idcs_imgs_increasing_precision[i].item()
+    image, gt_annotations = val_dataset[image_index]
+
+    plot_and_save_ensemble_detections(
+        image=image,
+        gt_boxes_x1_y1_x2_y2=gt_annotations["boxes"],
+        pred_boxes_x1_y1_x2_y2=np.hstack(
+            [
+                fused_detections_ds[xy_corner_str]
+                .isel(image_id=image_index)
+                .values.T
+                for xy_corner_str in ["xy_min", "xy_max"]
+            ]
+        ),
+        pred_boxes_scores=fused_detections_ds.isel(
+            image_id=image_index
+        ).confidence.values,
+        image_id=gt_annotations["image_id"],
+        output_dir=Path(
+            "/home/sminano/swc/project_ethology/aug2023-ood-ensemble"
+        ),
+        extra_str=f"{i:03d}",
+        precision=fused_detections_ds.isel(
+            image_id=image_index
+        ).precision.values,
+        recall=fused_detections_ds.isel(image_id=image_index).recall.values,
+    )
+    print(f"image id: {gt_annotations['image_id']}")
+# %%
+%matplotlib widget
+# %%
+fig, ax = plt.subplots()
+ax.hist(fused_detections_ds.precision.values)
+ax.set_xlabel("Precision per frame")
+ax.set_ylabel("count (frames)")
+ax.set_title(f"Precision OOD (n={fused_detections_ds.sizes['image_id']})")
+
+fig, ax = plt.subplots()
+ax.hist(fused_detections_ds.recall.values)
+ax.set_xlabel("Recall per frame")
+ax.set_ylabel("count (frames)")
+ax.set_title(f"Recall OOD (n={fused_detections_ds.sizes['image_id']})")
+
+# plt.show()
+# %%

From 2161a5983a44596ba7c3451cdf0c296c5b76f0aa Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 31 Jul 2025 12:43:02 +0100
Subject: [PATCH 55/72] Attempt to generalise ensemble WIP

---
 ethology/detectors/ensembles.py               | 140 ++++++-
 .../notebook_run_ensemble_on_eval_dataset.py  | 346 +++++++++++++++++-
 2 files changed, 469 insertions(+), 17 deletions(-)

diff --git a/ethology/detectors/ensembles.py b/ethology/detectors/ensembles.py
index 5dc1ee8c..0eb5d3d9 100644
--- a/ethology/detectors/ensembles.py
+++ b/ethology/detectors/ensembles.py
@@ -2,13 +2,114 @@
 
 import numpy as np
 import xarray as xr
-from ensemble_boxes import weighted_boxes_fusion
+from ensemble_boxes import soft_nms, weighted_boxes_fusion
 
 from ethology.detectors.utils import (
     detections_x1y1_x2y2_as_da_tuple,
 )
 
 
+def soft_nms_wrapper_arrays(
+    bboxes_x1y1: np.ndarray,
+    bboxes_x2y2: np.ndarray,  # model, annot, 4
+    confidence: np.ndarray,  # model, annot
+    label: np.ndarray,  # model, annot
+    image_width_height: np.ndarray,  # = np.array([4096, 2160]),
+    iou_thr_ensemble: float = 0.5,
+    skip_box_thr: float = 0.0001,
+    max_n_detections: int = 300,  # should be larger than the max number of detections fused per image
+) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
+    """Wrap soft-NMS to receive arrays as input.
+
+    Parameters
+    ----------
+    bboxes_x1y1: np.ndarray
+        Detected bounding boxes in a single image in x1y1 format, with shape
+        n_models, n_annotations, 2.
+    bboxes_x2y2: np.ndarray
+        Detected bounding boxes in a single image in x2y2 format, with shape
+        n_models, n_annotations, 2.
+    confidence: np.ndarray
+        Confidence scores for each bounding box, with shape
+        n_models, n_annotations.
+    label: np.ndarray
+        Labels for each bounding box, with shape n_models, n_annotations.
+    image_width_height: np.ndarray
+        Width and height of the image, with shape 2.
+    iou_thr_ensemble: float
+        IoU threshold for detections to be considered for fusion.
+    skip_box_thr: float
+        Threshold for skipping boxes with confidence below this value.
+    max_n_detections: int
+        Fused bounding boxes arrays are padded to this total number of boxes.
+        Its value should be larger than the expected maximum number of detections
+        per image after fusing across models.
+
+    Returns
+    -------
+    tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]
+        Tuple of xr.DataArrays containing the fused detections. The arrays
+        are padded to max_n_detections and contain the data for the centroid,
+        shape, confidence and label of the fused detections.
+
+    """
+    # Prepare bboxes for soft-NMS
+    bboxes_x1y1_x2y2_normalised = (
+        np.concat([bboxes_x1y1, bboxes_x2y2], axis=-1)
+        / np.tile(image_width_height, (1, 2))
+    ) #[:, :, :, None]
+
+    # ------------------------------------
+    # Run soft-NMS
+    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = soft_nms(
+        bboxes_x1y1_x2y2_normalised,
+        confidence,
+        label,
+        iou_thr=iou_thr_ensemble,
+        thresh=skip_box_thr,  # threshold for boxes to keep
+        method=2,  # 1 - linear soft-NMS, 2 - gaussian soft-NMS, 3 - standard NMS
+        sigma=0.5,  # sigma for gaussian soft-NMS
+    )
+
+    # ------------------------------------
+    # Undo x1y1 x2y2 normalization
+    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
+        image_width_height, (1, 2)
+    )
+
+    # Combine x1y1, x2y2, scores and labels in one array
+    ensemble_x1y2_x2y2_scores_labels = np.c_[
+        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
+    ]
+
+    # Remove rows with nan coordinates
+    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
+    ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
+        ~slc_nan_rows
+    ]
+
+    # Pad combined array to max_n_detections
+    # (this is required to concatenate across image_ids)
+    ensemble_x1y2_x2y2_scores_labels = np.pad(
+        ensemble_x1y2_x2y2_scores_labels,
+        (
+            (0, max_n_detections - ensemble_x1y2_x2y2_scores_labels.shape[0]),
+            (0, 0),
+        ),
+        "constant",
+        constant_values=np.nan,
+    )
+
+    # Format output as xarray dataarrays
+    centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
+        ensemble_x1y2_x2y2_scores_labels[:, 0:4],
+        ensemble_x1y2_x2y2_scores_labels[:, 4],
+        ensemble_x1y2_x2y2_scores_labels[:, 5],
+    )
+
+    return centroid, shape, confidence, label
+
+
 def wbf_wrapper_arrays(
     bboxes_x1y1: np.ndarray,
     bboxes_x2y2: np.ndarray,  # model, annot, 4
@@ -57,15 +158,42 @@ def wbf_wrapper_arrays(
     bboxes_x1y1_x2y2_normalised = (
         np.concat([bboxes_x1y1, bboxes_x2y2], axis=-1)
         / np.tile(image_width_height, (1, 2))
-    )[:, :, :, None]
+    ) #[:, :, :, None]
 
+    # Remove rows with nan coordinates
+    n_models = bboxes_x1y1_x2y2_normalised.shape[0]
+    list_bboxes_per_model = [
+        arr.squeeze() for arr in np.split(
+            bboxes_x1y1_x2y2_normalised, n_models, axis=0
+        )
+    ]
+    list_bboxes_per_model = [
+        arr[~np.any(np.isnan(arr), axis=1), :]
+        for arr in list_bboxes_per_model
+    ]
+    list_confidence_per_model = [
+        conf_arr.squeeze()[:bbox_arr.shape[0]]
+        for bbox_arr, conf_arr in zip(
+            list_bboxes_per_model,
+            np.split(confidence, n_models, axis=0),
+            strict=True,
+        )
+    ]
+    list_label_per_model = [
+        label_arr.squeeze()[:bbox_arr.shape[0]]
+        for bbox_arr, label_arr in zip(
+            list_bboxes_per_model,
+            np.split(label, n_models, axis=0),
+            strict=True,
+        )
+    ]
     # ------------------------------------
     # Run WBF
     ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
         weighted_boxes_fusion(
-            bboxes_x1y1_x2y2_normalised,
-            confidence,
-            label,
+            list_bboxes_per_model,
+            list_confidence_per_model,
+            list_label_per_model,
             iou_thr=iou_thr_ensemble,
             skip_box_thr=skip_box_thr,
         )
@@ -147,7 +275,7 @@ def combine_detections_across_models_wbf(
     # Run WBF vectorized
     centroid_fused, shape_fused, confidence_fused, label_fused = (
         xr.apply_ufunc(
-            wbf_wrapper_arrays,
+            wbf_wrapper_arrays,  # ------------#wbf_wrapper_arrays,
             all_models_detections_ds.xy_min,  # the underlaying .data array is passed
             all_models_detections_ds.xy_max,
             all_models_detections_ds.confidence,
diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index cd4482be..80130fb5 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 
 import cv2
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import torch
@@ -13,7 +14,10 @@
 from ethology.annotations.io import load_bboxes
 from ethology.datasets.convert import torch_dataset_to_xr_dataset
 from ethology.datasets.create import create_coco_dataset
-from ethology.detectors.ensembles import combine_detections_across_models_wbf
+from ethology.detectors.ensembles import (
+    combine_detections_across_models_wbf,
+    wbf_wrapper_arrays,
+)
 from ethology.detectors.evaluate import compute_precision_recall_ds
 from ethology.detectors.inference import (
     collate_fn_varying_n_bboxes,
@@ -24,13 +28,13 @@
 from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
 from ethology.detectors.utils import (
     add_bboxes_min_max_corners,
+    detections_x1y1_x2y2_as_da_tuple,
 )
 from ethology.mlflow import (
     read_cli_args_from_mlflow_params,
     read_config_from_mlflow_params,
     read_mlflow_params,
 )
-import matplotlib.pyplot as plt
 
 # Set xarray options
 xr.set_options(display_expand_attrs=False)
@@ -43,7 +47,6 @@
 annotations_dir = Path(
     "/home/sminano/swc/project_crabs/data/aug2023-full/annotations"
 )
-# Path("/home/sminano/swc/project_ethology/large_annotations")
 annotations_file_path = annotations_dir / "VIA_JSON_combined_coco_gen.json"
 
 experiment_ID = "617393114420881798"
@@ -53,7 +56,7 @@
 
 # I pick seed 42 for each set of models
 models_dict = {
-    # "above_0th": ml_runs_experiment_dir / "f348d9d196934073bece1b877cbc4d38",
+    "above_0th": ml_runs_experiment_dir / "f348d9d196934073bece1b877cbc4d38",
     "above_1st": ml_runs_experiment_dir / "879d2f77e2b24adcb06b87d2fede6a04",
     "above_5th": ml_runs_experiment_dir / "75583ec227e3444ab692b99c64795325",
     "above_10th": ml_runs_experiment_dir / "4acc37206b1e4f679d535c837bee2c2f",
@@ -214,6 +217,11 @@ def plot_and_save_ensemble_detections(
     config = read_config_from_mlflow_params(mlflow_params)
     cli_args = read_cli_args_from_mlflow_params(mlflow_params)
 
+    print(
+        f"Run name: {mlflow_params['run_name']}, trained on "
+        f"{Path(cli_args['dataset_dirs'][0]).name}, "
+        f"{Path(cli_args['annotation_files'][0]).name}"
+    )
     # ------------------------------------
     # Load model
     model = load_fasterrcnn_resnet50_fpn_v2(
@@ -260,12 +268,15 @@ def plot_and_save_ensemble_detections(
 train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
     dataset_coco,
     seed_n=ref_cli_args["seed_n"],
+    # config=ref_config,
     config={
         "train_fraction": 0.0,
         "val_over_test_fraction": 1.0,
     },  # only uses train_fraction and val_over_test_fraction
 )
 
+print(annotations_file_path)
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Define val dataloader
 # shuffle=False so that we dont shuffle the data
@@ -306,7 +317,7 @@ def plot_and_save_ensemble_detections(
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Fuse detections across models
-fused_detections_ds = combine_detections_across_models_wbf(
+fused_detections_ds = combine_detections_across_models_wbf(  # ------soft_nms?
     all_models_detections_ds,
     kwargs_wbf={
         "iou_thr_ensemble": 0.5,
@@ -315,11 +326,296 @@ def plot_and_save_ensemble_detections(
     },
 )
 
+# %%
+from ensemble_boxes import weighted_boxes_fusion
+
+
+def wbf_wrapper_arrays_2(
+    bboxes_x1y1: np.ndarray,
+    bboxes_x2y2: np.ndarray,  # model, annot, 4
+    confidence: np.ndarray,  # model, annot
+    label: np.ndarray,  # model, annot
+    image_width_height: np.ndarray,  # = np.array([4096, 2160]),
+    iou_thr_ensemble: float = 0.5,
+    skip_box_thr: float = 0.0001,
+    max_n_detections: int = 300,  # should be larger than the max number of detections fused per image
+) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
+    """Wrap weighted boxes fusion to receive arrays as input.
+
+    Parameters
+    ----------
+    bboxes_x1y1: np.ndarray
+        Detected bounding boxes in a single image in x1y1 format, with shape
+        n_models, n_annotations, 2.
+    bboxes_x2y2: np.ndarray
+        Detected bounding boxes in a single image in x2y2 format, with shape
+        n_models, n_annotations, 2.
+    confidence: np.ndarray
+        Confidence scores for each bounding box, with shape
+        n_models, n_annotations.
+    label: np.ndarray
+        Labels for each bounding box, with shape n_models, n_annotations.
+    image_width_height: np.ndarray
+        Width and height of the image, with shape 2.
+    iou_thr_ensemble: float
+        IoU threshold for detections to be considered for fusion.
+    skip_box_thr: float
+        Threshold for skipping boxes with confidence below this value.
+    max_n_detections: int
+        Fused bounding boxes arrays are padded to this total number of boxes.
+        Its value should be larger than the expected maximum number of detections
+        per image after fusing across models.
+
+    Returns
+    -------
+    tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]
+        Tuple of xr.DataArrays containing the fused detections. The arrays
+        are padded to max_n_detections and contain the data for the centroid,
+        shape, confidence and label of the fused detections.
+
+    """
+    # Prepare bboxes for WBF
+    bboxes_x1y1_x2y2_normalised = np.concat(
+        [bboxes_x1y1, bboxes_x2y2], axis=-1
+    ) / np.tile(image_width_height, (1, 2))  # [:, :, :, None]
+
+    # ------------------------------------
+    # Run WBF
+    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
+        weighted_boxes_fusion(
+            bboxes_x1y1_x2y2_normalised,
+            confidence,
+            label,
+            iou_thr=iou_thr_ensemble,
+            skip_box_thr=skip_box_thr,
+        )
+    )
+
+    # ------------------------------------
+    # Undo x1y1 x2y2 normalization
+    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
+        image_width_height, (1, 2)
+    )
+
+    # Combine x1y1, x2y2, scores and labels in one array
+    ensemble_x1y2_x2y2_scores_labels = np.c_[
+        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
+    ]
+
+    # Remove rows with nan coordinates
+    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
+    ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
+        ~slc_nan_rows
+    ]
+
+    # Pad combined array to max_n_detections
+    # (this is required to concatenate across image_ids)
+    ensemble_x1y2_x2y2_scores_labels = np.pad(
+        ensemble_x1y2_x2y2_scores_labels,
+        (
+            (0, max_n_detections - ensemble_x1y2_x2y2_scores_labels.shape[0]),
+            (0, 0),
+        ),
+        "constant",
+        constant_values=np.nan,
+    )
+
+    # Format output as xarray dataarrays
+    centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
+        ensemble_x1y2_x2y2_scores_labels[:, 0:4],
+        ensemble_x1y2_x2y2_scores_labels[:, 4],
+        ensemble_x1y2_x2y2_scores_labels[:, 5],
+    )
+
+    return centroid, shape, confidence, label
+
+
+# %%
+from ensemble_boxes import soft_nms
+
+
+def soft_nms_wrapper_arrays_2(
+    bboxes_x1y1: np.ndarray,
+    bboxes_x2y2: np.ndarray,  # model, annot, 4
+    confidence: np.ndarray,  # model, annot
+    label: np.ndarray,  # model, annot
+    image_width_height: np.ndarray,  # = np.array([4096, 2160]),
+    iou_thr_ensemble: float = 0.5,
+    skip_box_thr: float = 0.0001,
+    max_n_detections: int = 300,  # should be larger than the max number of detections fused per image
+) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
+    """Wrap soft-NMS to receive arrays as input.
+
+    Parameters
+    ----------
+    bboxes_x1y1: np.ndarray
+        Detected bounding boxes in a single image in x1y1 format, with shape
+        n_models, n_annotations, 2.
+    bboxes_x2y2: np.ndarray
+        Detected bounding boxes in a single image in x2y2 format, with shape
+        n_models, n_annotations, 2.
+    confidence: np.ndarray
+        Confidence scores for each bounding box, with shape
+        n_models, n_annotations.
+    label: np.ndarray
+        Labels for each bounding box, with shape n_models, n_annotations.
+    image_width_height: np.ndarray
+        Width and height of the image, with shape 2.
+    iou_thr_ensemble: float
+        IoU threshold for detections to be considered for fusion.
+    skip_box_thr: float
+        Threshold for skipping boxes with confidence below this value.
+    max_n_detections: int
+        Fused bounding boxes arrays are padded to this total number of boxes.
+        Its value should be larger than the expected maximum number of detections
+        per image after fusing across models.
+
+    Returns
+    -------
+    tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]
+        Tuple of xr.DataArrays containing the fused detections. The arrays
+        are padded to max_n_detections and contain the data for the centroid,
+        shape, confidence and label of the fused detections.
+
+    """
+    # Prepare bboxes for soft-NMS
+    bboxes_x1y1_x2y2_normalised = np.concat(
+        [bboxes_x1y1, bboxes_x2y2], axis=-1
+    ) / np.tile(image_width_height, (1, 2))  # [:, :, :, None]
+
+    # Remove rows with nan coordinates
+    n_models = bboxes_x1y1_x2y2_normalised.shape[0]
+    list_bboxes_per_model = [
+        arr.squeeze() for arr in np.split(
+            bboxes_x1y1_x2y2_normalised, n_models, axis=0
+        )
+    ]
+    list_bboxes_per_model = [
+        arr[~np.any(np.isnan(arr), axis=1), :]
+        for arr in list_bboxes_per_model
+    ]
+    list_confidence_per_model = [
+        conf_arr.squeeze()[:bbox_arr.shape[0]]
+        for bbox_arr, conf_arr in zip(
+            list_bboxes_per_model,
+            np.split(confidence, n_models, axis=0),
+            strict=True,
+        )
+    ]
+    list_label_per_model = [
+        label_arr.squeeze()[:bbox_arr.shape[0]]
+        for bbox_arr, label_arr in zip(
+            list_bboxes_per_model,
+            np.split(label, n_models, axis=0),
+            strict=True,
+        )
+    ]
+    # list_label_per_model = [
+    #     label_arr[bbox_arr.shape[0], :]
+    #     for bbox_arr, label_arr in zip(
+    #         list_bboxes_per_model, np.split(label, n_models, axis=0)
+    #     )
+    # ]
+
+    # ------------------------------------
+    # Run soft_nms (method=3 selects standard NMS)
+    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = soft_nms(
+        #bboxes_x1y1_x2y2_normalised,
+        list_bboxes_per_model,
+        list_confidence_per_model,
+        list_label_per_model,
+        iou_thr=iou_thr_ensemble,
+        thresh=skip_box_thr,  # threshold for boxes to keep
+        method=3,  # 1 - linear soft-NMS, 2 - gaussian soft-NMS, 3 - standard NMS
+        sigma=0.5,  # sigma for gaussian soft-NMS
+    )
+
+    # ------------------------------------
+    # Undo x1y1 x2y2 normalization
+    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
+        image_width_height, (1, 2)
+    )
+
+    # Combine x1y1, x2y2, scores and labels in one array
+    ensemble_x1y2_x2y2_scores_labels = np.c_[
+        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
+    ]
+
+    # Remove rows with nan coordinates
+    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
+    ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
+        ~slc_nan_rows
+    ]
+
+    # Pad combined array to max_n_detections
+    # (this is required to concatenate across image_ids)
+    ensemble_x1y2_x2y2_scores_labels = np.pad(
+        ensemble_x1y2_x2y2_scores_labels,
+        (
+            (0, max_n_detections - ensemble_x1y2_x2y2_scores_labels.shape[0]),
+            (0, 0),
+        ),
+        "constant",
+        constant_values=np.nan,
+    )
+
+    # Format output as xarray dataarrays
+    centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
+        ensemble_x1y2_x2y2_scores_labels[:, 0:4],
+        ensemble_x1y2_x2y2_scores_labels[:, 4],
+        ensemble_x1y2_x2y2_scores_labels[:, 5],
+    )
+
+    return centroid, shape, confidence, label
+
+
+# %%
+# Prepare kwargs
+kwargs_wbf = {
+    "iou_thr_ensemble": 0.5,
+    "skip_box_thr": 0.0001,
+    "max_n_detections": 500,  # set default?
+}
+kwargs_wbf["image_width_height"] = np.array(
+    [
+        all_models_detections_ds.attrs[img_size]
+        for img_size in ["image_width", "image_height"]
+    ]
+)
+
+# Run soft-NMS vectorized
+centroid_fused, shape_fused, confidence_fused, label_fused = xr.apply_ufunc(
+    soft_nms_wrapper_arrays_2,  # ------------#wbf_wrapper_arrays,
+    all_models_detections_ds.xy_min,  # the underlaying .data array is passed
+    all_models_detections_ds.xy_max,
+    all_models_detections_ds.confidence,
+    all_models_detections_ds.label,
+    kwargs=kwargs_wbf,
+    input_core_dims=[  # do not broadcast across these
+        ["model", "id", "space"],
+        ["model", "id", "space"],
+        ["model", "id"],
+        ["model", "id"],
+    ],
+    output_core_dims=[
+        ["space", "id"],
+        ["space", "id"],
+        ["id"],
+        ["id"],
+    ],
+    vectorize=True,
+    # loop over non-core dims (i.e. image_id);
+    # assumes function only takes arrays over core dims as input
+    exclude_dims={"id"},
+    # to allow dimensions that change size btw input and output
+)
+
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Define ground truth dataset
 
 # read annotations as a dataset
-print(annotations_file_path.name)
+print(annotations_file_path)
 gt_bboxes_ds = load_bboxes.from_files(annotations_file_path, format="COCO")
 
 # fix category ID (to be fixed in loader)
@@ -338,6 +634,7 @@ def plot_and_save_ensemble_detections(
 # %%
 # Alternatively: convert torch dataset into xarray detections dataset
 # .....
+# is it faster?
 val_ds = torch_dataset_to_xr_dataset(val_dataset)
 
 # # %%
@@ -370,26 +667,45 @@ def plot_and_save_ensemble_detections(
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Evaluate
 
+# ensemble model
 fused_detections_ds, gt_bboxes_val_ds = compute_precision_recall_ds(
     pred_bboxes_ds=fused_detections_ds,
     gt_bboxes_ds=gt_bboxes_val_ds,
     iou_threshold=0.1,  # change to 0.5?
 )
 
-
-# %%
 print(f"Precision: {fused_detections_ds.precision.mean().values:.4f}")
 print(f"Recall: {fused_detections_ds.recall.mean().values:.4f}")
 
+# %%
+# single models
+list_detections_ds_eval = []
+for k, ds in enumerate(list_detections_ds):
+    detections_ds, _ = compute_precision_recall_ds(
+        pred_bboxes_ds=ds,
+        gt_bboxes_ds=gt_bboxes_val_ds,
+        iou_threshold=0.1,  # change to 0.5?
+    )
+    list_detections_ds_eval.append(detections_ds)
+
+    print(f"Model: {k}")
+    print(f"Precision: {detections_ds.precision.mean().values:.4f}")
+    print(f"Recall: {detections_ds.recall.mean().values:.4f}")
+    print("--------------------------------")
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # plot ensemble detections on a selected image
 
 # idcs_low_precision = np.argwhere(fused_detections_ds.precision.data < 0.5)
 # idcs_high_precision = np.argwhere(fused_detections_ds.precision.data > 0.9)
+
+fused_detections_ds = detections_ds
 idcs_imgs_increasing_precision = np.argsort(fused_detections_ds.precision.data)
+step = 5  # 50
+
 
 # Get first image
-for i in list(range(0, len(idcs_imgs_increasing_precision), 50)) + [
+for i in list(range(0, len(idcs_imgs_increasing_precision), step)) + [
     len(idcs_imgs_increasing_precision) - 1
 ]:
     image_index = idcs_imgs_increasing_precision[i].item()
@@ -411,7 +727,7 @@ def plot_and_save_ensemble_detections(
         ).confidence.values,
         image_id=gt_annotations["image_id"],
         output_dir=Path(
-            "/home/sminano/swc/project_ethology/aug2023-ood-ensemble"
+            "/home/sminano/swc/project_ethology/0th-percentile-ood-aug2023"
         ),
         extra_str=f"{i:03d}",
         precision=fused_detections_ds.isel(
@@ -421,16 +737,24 @@ def plot_and_save_ensemble_detections(
     )
     print(f"image id: {gt_annotations['image_id']}")
 # %%
-%matplotlib widget
+# %matplotlib widget
 # %%
 fig, ax = plt.subplots()
 ax.hist(fused_detections_ds.precision.values)
+ax.axvline(
+    fused_detections_ds.precision.values.mean(), color="red", linestyle="--"
+)
+ax.set_xlim(0, 1)
 ax.set_xlabel("Precision per frame")
 ax.set_ylabel("count (frames)")
 ax.set_title(f"Precision OOD (n={fused_detections_ds.sizes['image_id']})")
 
 fig, ax = plt.subplots()
 ax.hist(fused_detections_ds.recall.values)
+ax.axvline(
+    fused_detections_ds.recall.values.mean(), color="red", linestyle="--"
+)
+ax.set_xlim(0, 1)
 ax.set_xlabel("Recall per frame")
 ax.set_ylabel("count (frames)")
 ax.set_title(f"Recall OOD (n={fused_detections_ds.sizes['image_id']})")

From 31978a3ba7f901017b39b06bd4e4a6e9884da035 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 31 Jul 2025 17:30:07 +0100
Subject: [PATCH 56/72] Remove fused detections with confidence below threshold

---
 ethology/detectors/ensembles.py | 246 +++++++++++++++++---------------
 1 file changed, 127 insertions(+), 119 deletions(-)

diff --git a/ethology/detectors/ensembles.py b/ethology/detectors/ensembles.py
index 0eb5d3d9..4434fa35 100644
--- a/ethology/detectors/ensembles.py
+++ b/ethology/detectors/ensembles.py
@@ -2,112 +2,111 @@
 
 import numpy as np
 import xarray as xr
-from ensemble_boxes import soft_nms, weighted_boxes_fusion
+from ensemble_boxes import weighted_boxes_fusion
 
 from ethology.detectors.utils import (
     detections_x1y1_x2y2_as_da_tuple,
 )
 
+# def soft_nms_wrapper_arrays(
+#     bboxes_x1y1: np.ndarray,
+#     bboxes_x2y2: np.ndarray,  # model, annot, 4
+#     confidence: np.ndarray,  # model, annot
+#     label: np.ndarray,  # model, annot
+#     image_width_height: np.ndarray,  # = np.array([4096, 2160]),
+#     iou_thr_ensemble: float = 0.5,
+#     skip_box_thr: float = 0.0001,
+#     max_n_detections: int = 300,
+# ) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
+#     """Wrap weighted boxes fusion to receive arrays as input.
+
+#     Parameters
+#     ----------
+#     bboxes_x1y1: np.ndarray
+#         Detected bounding boxes in a single imagein x1y1 format, with shape
+#         n_models, n_annotations, 2.
+#     bboxes_x2y2: np.ndarray
+#         Detected bounding boxes in a single image in x2y2 format, with shape
+#         n_models, n_annotations, 2.
+#     confidence: np.ndarray
+#         Confidence scores for each bounding box, with shape
+#         n_models, n_annotations.
+#     label: np.ndarray
+#         Labels for each bounding box, with shape n_models, n_annotations.
+#     image_width_height: np.ndarray
+#         Width and height of the image, with shape 2.
+#     iou_thr_ensemble: float
+#         IoU threshold for detections to be considered for fusion.
+#     skip_box_thr: float
+#         Threshold for skipping boxes with confidence below this value.
+#     max_n_detections: int
+#         Fused bounding boxes arrays are padded to this total number of boxes.
+#         Its value should be larger than the expected maximum number of
+#         detections per image after fusing across models.
+
+#     Returns
+#     -------
+#     tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]
+#         Tuple of xr.DataArrays containing the fused detections. The arrays
+#         are padded to max_n_detections and contain the data for the centroid,
+#         shape, confidence and label of the fused detections.
+
+#     """
+#     # Prepare bboxes for WBF
+#     bboxes_x1y1_x2y2_normalised = np.concat(
+#         [bboxes_x1y1, bboxes_x2y2], axis=-1
+#     ) / np.tile(image_width_height, (1, 2))  # [:, :, :, None]
+
+#     # ------------------------------------
+#     # Run WBF
+#     ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = soft_nms(
+#         bboxes_x1y1_x2y2_normalised,
+#         confidence,
+#         label,
+#         iou_thr=iou_thr_ensemble,
+#         thresh=skip_box_thr,  # threshold for boxes to keep
+#         method=2,  # 1-linear soft-NMS, 2-gaussian soft-NMS, 3-standard NMS
+#         sigma=0.5,  # sigma for gaussian soft-NMS
+#     )
 
-def soft_nms_wrapper_arrays(
-    bboxes_x1y1: np.ndarray,
-    bboxes_x2y2: np.ndarray,  # model, annot, 4
-    confidence: np.ndarray,  # model, annot
-    label: np.ndarray,  # model, annot
-    image_width_height: np.ndarray,  # = np.array([4096, 2160]),
-    iou_thr_ensemble: float = 0.5,
-    skip_box_thr: float = 0.0001,
-    max_n_detections: int = 300,  # should be larger than the max number of detections fused per image
-) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
-    """Wrap weighted boxes fusion to receive arrays as input.
-
-    Parameters
-    ----------
-    bboxes_x1y1: np.ndarray
-        Detected bounding boxes in a single imagein x1y1 format, with shape
-        n_models, n_annotations, 2.
-    bboxes_x2y2: np.ndarray
-        Detected bounding boxes in a single image in x2y2 format, with shape
-        n_models, n_annotations, 2.
-    confidence: np.ndarray
-        Confidence scores for each bounding box, with shape
-        n_models, n_annotations.
-    label: np.ndarray
-        Labels for each bounding box, with shape n_models, n_annotations.
-    image_width_height: np.ndarray
-        Width and height of the image, with shape 2.
-    iou_thr_ensemble: float
-        IoU threshold for detections to be considered for fusion.
-    skip_box_thr: float
-        Threshold for skipping boxes with confidence below this value.
-    max_n_detections: int
-        Fused bounding boxes arrays are padded to this total number of boxes.
-        Its value should be larger than the expected maximum number of detections
-        per image after fusing across models.
-
-    Returns
-    -------
-    tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]
-        Tuple of xr.DataArrays containing the fused detections. The arrays
-        are padded to max_n_detections and contain the data for the centroid,
-        shape, confidence and label of the fused detections.
-
-    """
-    # Prepare bboxes for WBF
-    bboxes_x1y1_x2y2_normalised = (
-        np.concat([bboxes_x1y1, bboxes_x2y2], axis=-1)
-        / np.tile(image_width_height, (1, 2))
-    ) #[:, :, :, None]
-
-    # ------------------------------------
-    # Run WBF
-    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = soft_nms(
-        bboxes_x1y1_x2y2_normalised,
-        confidence,
-        label,
-        iou_thr=iou_thr_ensemble,
-        thresh=skip_box_thr,  # threshold for boxes to keep
-        method=2,  # 1 - linear soft-NMS, 2 - gaussian soft-NMS, 3 - standard NMS
-        sigma=0.5,  # sigma for gaussian soft-NMS
-    )
-
-    # ------------------------------------
-    # Undo x1y1 x2y2 normalization
-    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
-        image_width_height, (1, 2)
-    )
-
-    # Combine x1y1, x2y2, scores and labels in one array
-    ensemble_x1y2_x2y2_scores_labels = np.c_[
-        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
-    ]
-
-    # Remove rows with nan coordinates
-    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
-    ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
-        ~slc_nan_rows
-    ]
+#     # ------------------------------------
+#     # Undo x1y1 x2y2 normalization
+#     ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
+#         image_width_height, (1, 2)
+#     )
 
-    # Pad combined array to max_n_detections
-    # (this is required to concatenate across image_ids
-    ensemble_x1y2_x2y2_scores_labels = np.pad(
-        ensemble_x1y2_x2y2_scores_labels,
-        (
-            (0, max_n_detections - ensemble_x1y2_x2y2_scores_labels.shape[0]),
-            (0, 0),
-        ),
-        "constant",
-        constant_values=np.nan,
-    )
+#     # Combine x1y1, x2y2, scores and labels in one array
+#     ensemble_x1y2_x2y2_scores_labels = np.c_[
+#         ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
+#     ]
+
+#     # Remove rows with nan coordinates
+#     slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
+#     ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
+#         ~slc_nan_rows
+#     ]
+
+#     # Pad combined array to max_n_detections
+#     # (this is required to concatenate across image_ids
+#     ensemble_x1y2_x2y2_scores_labels = np.pad(
+#         ensemble_x1y2_x2y2_scores_labels,
+#         (
+#             (0,
+# max_n_detections - ensemble_x1y2_x2y2_scores_labels.shape[0]),
+#             (0, 0),
+#         ),
+#         "constant",
+#         constant_values=np.nan,
+#     )
 
-    # Format output as xarray dataarrays
-    centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
-        ensemble_x1y2_x2y2_scores_labels[:, 0:4],
-        ensemble_x1y2_x2y2_scores_labels[:, 4],
-        ensemble_x1y2_x2y2_scores_labels[:, 5],
-    )
+#     # Format output as xarray dataarrays
+#     centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
+#         ensemble_x1y2_x2y2_scores_labels[:, 0:4],
+#         ensemble_x1y2_x2y2_scores_labels[:, 4],
+#         ensemble_x1y2_x2y2_scores_labels[:, 5],
+#     )
 
-    return centroid, shape, confidence, label
+#     return centroid, shape, confidence, label
 
 
 def wbf_wrapper_arrays(
@@ -118,7 +117,8 @@ def wbf_wrapper_arrays(
     image_width_height: np.ndarray,  # = np.array([4096, 2160]),
     iou_thr_ensemble: float = 0.5,
     skip_box_thr: float = 0.0001,
-    max_n_detections: int = 300,  # should be larger than the max number of detections fused per image
+    max_n_detections: int = 300,
+    confidence_th_post_fusion: float = 0.7,
 ) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
     """Wrap weighted boxes fusion to receive arrays as input.
 
@@ -143,8 +143,11 @@ def wbf_wrapper_arrays(
         Threshold for skipping boxes with confidence below this value.
     max_n_detections: int
         Fused bounding boxes arrays are padded to this total number of boxes.
-        Its value should be larger than the expected maximum number of detections
-        per image after fusing across models.
+        Its value should be larger than the expected maximum number of
+        detections per image after fusing across models.
+    confidence_th_post_fusion: float
+        Confidence threshold applied after fusion: only fused detections
+        whose confidence is strictly above this value are kept.
 
     Returns
     -------
@@ -155,24 +158,21 @@ def wbf_wrapper_arrays(
 
     """
     # Prepare bboxes for WBF
-    bboxes_x1y1_x2y2_normalised = (
-        np.concat([bboxes_x1y1, bboxes_x2y2], axis=-1)
-        / np.tile(image_width_height, (1, 2))
-    ) #[:, :, :, None]
+    bboxes_x1y1_x2y2_normalised = np.concat(
+        [bboxes_x1y1, bboxes_x2y2], axis=-1
+    ) / np.tile(image_width_height, (1, 2))  # [:, :, :, None]
 
     # Remove rows with nan coordinates
     n_models = bboxes_x1y1_x2y2_normalised.shape[0]
     list_bboxes_per_model = [
-        arr.squeeze() for arr in np.split(
-            bboxes_x1y1_x2y2_normalised, n_models, axis=0
-        )
+        arr.squeeze()
+        for arr in np.split(bboxes_x1y1_x2y2_normalised, n_models, axis=0)
     ]
     list_bboxes_per_model = [
-        arr[~np.any(np.isnan(arr), axis=1), :]
-        for arr in list_bboxes_per_model
+        arr[~np.any(np.isnan(arr), axis=1), :] for arr in list_bboxes_per_model
     ]
     list_confidence_per_model = [
-        conf_arr.squeeze()[:bbox_arr.shape[0]]
+        conf_arr.squeeze()[: bbox_arr.shape[0]]
         for bbox_arr, conf_arr in zip(
             list_bboxes_per_model,
             np.split(confidence, n_models, axis=0),
@@ -180,7 +180,7 @@ def wbf_wrapper_arrays(
         )
     ]
     list_label_per_model = [
-        label_arr.squeeze()[:bbox_arr.shape[0]]
+        label_arr.squeeze()[: bbox_arr.shape[0]]
         for bbox_arr, label_arr in zip(
             list_bboxes_per_model,
             np.split(label, n_models, axis=0),
@@ -211,9 +211,13 @@ def wbf_wrapper_arrays(
     ]
 
     # Remove rows with nan coordinates
-    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
     ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
-        ~slc_nan_rows
+        ~np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
+    ]
+
+    # Remove rows with confidence below threshold
+    ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
+        ensemble_x1y2_x2y2_scores_labels[:, 4] > confidence_th_post_fusion
     ]
 
     # Pad combined array to max_n_detections
@@ -252,11 +256,15 @@ def combine_detections_across_models_wbf(
     kwargs_wbf: dict
         Keyword arguments for the weighted boxes fusion approach. It should
         contain the following keys:
-        - iou_thr_ensemble: IoU threshold for detections to be considered for fusion.
-        - skip_box_thr: Threshold for skipping boxes with confidence below this value.
-        - max_n_detections: Fused bounding boxes arrays are padded to this total number of boxes.
-        Its value should be larger than the expected maximum number of detections
-        per image after fusing across models.
+        - iou_thr_ensemble: IoU threshold for detections to be considered
+        for fusion.
+        - skip_box_thr: Threshold for skipping boxes with confidence below
+        this value.
+        - max_n_detections: Fused bounding boxes arrays are padded to this
+        total number of boxes. Its value should be larger than the expected
+        maximum number of detections per image after fusing across models.
+        - confidence_th_post_fusion: Only fused detections whose confidence
+        is strictly above this value are kept.
 
     Returns
     -------
@@ -276,7 +284,7 @@ def combine_detections_across_models_wbf(
     centroid_fused, shape_fused, confidence_fused, label_fused = (
         xr.apply_ufunc(
             wbf_wrapper_arrays,  # ------------#wbf_wrapper_arrays,
-            all_models_detections_ds.xy_min,  # the underlaying .data array is passed
+            all_models_detections_ds.xy_min,  # .data array is passed
             all_models_detections_ds.xy_max,
             all_models_detections_ds.confidence,
             all_models_detections_ds.label,

From 3eb7a47957eb0566ac435158c429284951d16ee2 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 31 Jul 2025 17:32:03 +0100
Subject: [PATCH 57/72] Return image_ids when splitting. Apply confidence
 threshold after ensembling. Add plots

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 486 +++++++-----------
 1 file changed, 176 insertions(+), 310 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 80130fb5..5cd3e53b 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -14,10 +14,7 @@
 from ethology.annotations.io import load_bboxes
 from ethology.datasets.convert import torch_dataset_to_xr_dataset
 from ethology.datasets.create import create_coco_dataset
-from ethology.detectors.ensembles import (
-    combine_detections_across_models_wbf,
-    wbf_wrapper_arrays,
-)
+from ethology.detectors.ensembles import combine_detections_across_models_wbf
 from ethology.detectors.evaluate import compute_precision_recall_ds
 from ethology.detectors.inference import (
     collate_fn_varying_n_bboxes,
@@ -26,10 +23,7 @@
     # run_detector_on_dataset,
 )
 from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
-from ethology.detectors.utils import (
-    add_bboxes_min_max_corners,
-    detections_x1y1_x2y2_as_da_tuple,
-)
+from ethology.detectors.utils import add_bboxes_min_max_corners
 from ethology.mlflow import (
     read_cli_args_from_mlflow_params,
     read_config_from_mlflow_params,
@@ -92,6 +86,11 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
         generator=rng_train_split,
     )
 
+    # TODO: check! Assumes getImgIds() order matches dataset_coco indexing
+    img_ids_coco = dataset_coco.coco.getImgIds()
+    img_ids_train = [img_ids_coco[x] for x in train_dataset.indices]
+    img_ids_test_val = [img_ids_coco[x] for x in test_val_dataset.indices]
+
     # Split test/val sets from the remainder
     test_dataset, val_dataset = random_split(
         test_val_dataset,
@@ -102,12 +101,23 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
         generator=rng_val_split,
     )
 
+    # TODO: return image ids per split -- check!
+    img_ids_test = [img_ids_test_val[x] for x in test_dataset.indices]
+    img_ids_val = [img_ids_test_val[x] for x in val_dataset.indices]
+
     print(f"Seed: {seed_n}")
     print(f"Number of training samples: {len(train_dataset)}")  # images
     print(f"Number of validation samples: {len(val_dataset)}")  # images
     print(f"Number of test samples: {len(test_dataset)}")  # images
 
-    return train_dataset, val_dataset, test_dataset
+    return (
+        train_dataset,
+        val_dataset,
+        test_dataset,
+        img_ids_train,
+        img_ids_val,
+        img_ids_test,
+    )
 
 
 def plot_and_save_ensemble_detections(
@@ -129,6 +139,9 @@ def plot_and_save_ensemble_detections(
     image_cv = (image_cv * 255).astype(np.uint8)
     image_cv = cv2.cvtColor(image_cv, cv2.COLOR_RGB2BGR)
 
+    # create output directory if it does not exist
+    output_dir.mkdir(parents=True, exist_ok=True)
+
     # plot GT annotations as green boxes
     for gt_box in gt_boxes_x1_y1_x2_y2:
         x1, y1, x2, y2 = gt_box.cpu().numpy().astype(int)
@@ -265,14 +278,16 @@ def plot_and_save_ensemble_detections(
 )
 
 # Split dataset like in crabs repo
-train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
-    dataset_coco,
-    seed_n=ref_cli_args["seed_n"],
-    # config=ref_config,
-    config={
-        "train_fraction": 0.0,
-        "val_over_test_fraction": 1.0,
-    },  # only uses train_fraction and val_over_test_fraction
+train_dataset, val_dataset, test_dataset, _, img_ids_val, _ = (
+    split_dataset_crab_repo(
+        dataset_coco,
+        seed_n=ref_cli_args["seed_n"],
+        # config=ref_config,
+        config={
+            "train_fraction": 0.0,
+            "val_over_test_fraction": 1.0,
+        },  # only uses train_fraction and val_over_test_fraction
+    )
 )
 
 print(annotations_file_path)
@@ -317,305 +332,39 @@ def plot_and_save_ensemble_detections(
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Fuse detections across models
-fused_detections_ds = combine_detections_across_models_wbf(  # ------soft_nms?
+confidence_th_post_fusion = 0.7
+fused_detections_ds = combine_detections_across_models_wbf(
     all_models_detections_ds,
     kwargs_wbf={
         "iou_thr_ensemble": 0.5,
         "skip_box_thr": 0.0001,
         "max_n_detections": 300,  # set default?
+        "confidence_th_post_fusion": confidence_th_post_fusion,
     },
 )
 
-# %%
-from ensemble_boxes import weighted_boxes_fusion
-
-
-def wbf_wrapper_arrays_2(
-    bboxes_x1y1: np.ndarray,
-    bboxes_x2y2: np.ndarray,  # model, annot, 4
-    confidence: np.ndarray,  # model, annot
-    label: np.ndarray,  # model, annot
-    image_width_height: np.ndarray,  # = np.array([4096, 2160]),
-    iou_thr_ensemble: float = 0.5,
-    skip_box_thr: float = 0.0001,
-    max_n_detections: int = 300,  # should be larger than the max number of detections fused per image
-) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
-    """Wrap weighted boxes fusion to receive arrays as input.
-
-    Parameters
-    ----------
-    bboxes_x1y1: np.ndarray
-        Detected bounding boxes in a single imagein x1y1 format, with shape
-        n_models, n_annotations, 2.
-    bboxes_x2y2: np.ndarray
-        Detected bounding boxes in a single image in x2y2 format, with shape
-        n_models, n_annotations, 2.
-    confidence: np.ndarray
-        Confidence scores for each bounding box, with shape
-        n_models, n_annotations.
-    label: np.ndarray
-        Labels for each bounding box, with shape n_models, n_annotations.
-    image_width_height: np.ndarray
-        Width and height of the image, with shape 2.
-    iou_thr_ensemble: float
-        IoU threshold for detections to be considered for fusion.
-    skip_box_thr: float
-        Threshold for skipping boxes with confidence below this value.
-    max_n_detections: int
-        Fused bounding boxes arrays are padded to this total number of boxes.
-        Its value should be larger than the expected maximum number of detections
-        per image after fusing across models.
-
-    Returns
-    -------
-    tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]
-        Tuple of xr.DataArrays containing the fused detections. The arrays
-        are padded to max_n_detections and contain the data for the centroid,
-        shape, confidence and label of the fused detections.
-
-    """
-    # Prepare bboxes for WBF
-    bboxes_x1y1_x2y2_normalised = np.concat(
-        [bboxes_x1y1, bboxes_x2y2], axis=-1
-    ) / np.tile(image_width_height, (1, 2))  # [:, :, :, None]
-
-    # ------------------------------------
-    # Run WBF
-    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = (
-        weighted_boxes_fusion(
-            bboxes_x1y1_x2y2_normalised,
-            confidence,
-            label,
-            iou_thr=iou_thr_ensemble,
-            skip_box_thr=skip_box_thr,
-        )
-    )
-
-    # ------------------------------------
-    # Undo x1y1 x2y2 normalization
-    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
-        image_width_height, (1, 2)
-    )
-
-    # Combine x1y1, x2y2, scores and labels in one array
-    ensemble_x1y2_x2y2_scores_labels = np.c_[
-        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
-    ]
-
-    # Remove rows with nan coordinates
-    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
-    ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
-        ~slc_nan_rows
-    ]
-
-    # Pad combined array to max_n_detections
-    # (this is required to concatenate across image_ids
-    ensemble_x1y2_x2y2_scores_labels = np.pad(
-        ensemble_x1y2_x2y2_scores_labels,
-        (
-            (0, max_n_detections - ensemble_x1y2_x2y2_scores_labels.shape[0]),
-            (0, 0),
-        ),
-        "constant",
-        constant_values=np.nan,
-    )
-
-    # Format output as xarray dataarrays
-    centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
-        ensemble_x1y2_x2y2_scores_labels[:, 0:4],
-        ensemble_x1y2_x2y2_scores_labels[:, 4],
-        ensemble_x1y2_x2y2_scores_labels[:, 5],
-    )
-
-    return centroid, shape, confidence, label
-
-
-# %%
-from ensemble_boxes import soft_nms
-
-
-def soft_nms_wrapper_arrays_2(
-    bboxes_x1y1: np.ndarray,
-    bboxes_x2y2: np.ndarray,  # model, annot, 4
-    confidence: np.ndarray,  # model, annot
-    label: np.ndarray,  # model, annot
-    image_width_height: np.ndarray,  # = np.array([4096, 2160]),
-    iou_thr_ensemble: float = 0.5,
-    skip_box_thr: float = 0.0001,
-    max_n_detections: int = 300,  # should be larger than the max number of detections fused per image
-) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
-    """Wrap weighted boxes fusion to receive arrays as input.
-
-    Parameters
-    ----------
-    bboxes_x1y1: np.ndarray
-        Detected bounding boxes in a single imagein x1y1 format, with shape
-        n_models, n_annotations, 2.
-    bboxes_x2y2: np.ndarray
-        Detected bounding boxes in a single image in x2y2 format, with shape
-        n_models, n_annotations, 2.
-    confidence: np.ndarray
-        Confidence scores for each bounding box, with shape
-        n_models, n_annotations.
-    label: np.ndarray
-        Labels for each bounding box, with shape n_models, n_annotations.
-    image_width_height: np.ndarray
-        Width and height of the image, with shape 2.
-    iou_thr_ensemble: float
-        IoU threshold for detections to be considered for fusion.
-    skip_box_thr: float
-        Threshold for skipping boxes with confidence below this value.
-    max_n_detections: int
-        Fused bounding boxes arrays are padded to this total number of boxes.
-        Its value should be larger than the expected maximum number of detections
-        per image after fusing across models.
-
-    Returns
-    -------
-    tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]
-        Tuple of xr.DataArrays containing the fused detections. The arrays
-        are padded to max_n_detections and contain the data for the centroid,
-        shape, confidence and label of the fused detections.
-
-    """
-    # Prepare bboxes for WBF
-    bboxes_x1y1_x2y2_normalised = np.concat(
-        [bboxes_x1y1, bboxes_x2y2], axis=-1
-    ) / np.tile(image_width_height, (1, 2))  # [:, :, :, None]
-
-    # Remove rows with nan coordinates
-    n_models = bboxes_x1y1_x2y2_normalised.shape[0]
-    list_bboxes_per_model = [
-        arr.squeeze() for arr in np.split(
-            bboxes_x1y1_x2y2_normalised, n_models, axis=0
-        )
-    ]
-    list_bboxes_per_model = [
-        arr[~np.any(np.isnan(arr), axis=1), :]
-        for arr in list_bboxes_per_model
-    ]
-    list_confidence_per_model = [
-        conf_arr.squeeze()[:bbox_arr.shape[0]]
-        for bbox_arr, conf_arr in zip(
-            list_bboxes_per_model,
-            np.split(confidence, n_models, axis=0),
-            strict=True,
-        )
-    ]
-    list_label_per_model = [
-        label_arr.squeeze()[:bbox_arr.shape[0]]
-        for bbox_arr, label_arr in zip(
-            list_bboxes_per_model,
-            np.split(label, n_models, axis=0),
-            strict=True,
-        )
-    ]
-    # list_label_per_model = [
-    #     label_arr[bbox_arr.shape[0], :]
-    #     for bbox_arr, label_arr in zip(
-    #         list_bboxes_per_model, np.split(label, n_models, axis=0)
-    #     )
-    # ]
-
-    # ------------------------------------
-    # Run WBF
-    ensemble_x1y1_x2y2_norm, ensemble_scores, ensemble_labels = soft_nms(
-        #bboxes_x1y1_x2y2_normalised,
-        list_bboxes_per_model,
-        list_confidence_per_model,
-        list_label_per_model,
-        iou_thr=iou_thr_ensemble,
-        thresh=skip_box_thr,  # threshold for boxes to keep
-        method=3,  # 1 - linear soft-NMS, 2 - gaussian soft-NMS, 3 - standard NMS
-        sigma=0.5,  # sigma for gaussian soft-NMS
-    )
-
-    # ------------------------------------
-    # Undo x1y1 x2y2 normalization
-    ensemble_x1y1_x2y2 = ensemble_x1y1_x2y2_norm * np.tile(
-        image_width_height, (1, 2)
-    )
-
-    # Combine x1y1, x2y2, scores and labels in one array
-    ensemble_x1y2_x2y2_scores_labels = np.c_[
-        ensemble_x1y1_x2y2, ensemble_scores, ensemble_labels
-    ]
-
-    # Remove rows with nan coordinates
-    slc_nan_rows = np.any(np.isnan(ensemble_x1y1_x2y2), axis=1)
-    ensemble_x1y2_x2y2_scores_labels = ensemble_x1y2_x2y2_scores_labels[
-        ~slc_nan_rows
-    ]
-
-    # Pad combined array to max_n_detections
-    # (this is required to concatenate across image_ids
-    ensemble_x1y2_x2y2_scores_labels = np.pad(
-        ensemble_x1y2_x2y2_scores_labels,
-        (
-            (0, max_n_detections - ensemble_x1y2_x2y2_scores_labels.shape[0]),
-            (0, 0),
-        ),
-        "constant",
-        constant_values=np.nan,
-    )
-
-    # Format output as xarray dataarrays
-    centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
-        ensemble_x1y2_x2y2_scores_labels[:, 0:4],
-        ensemble_x1y2_x2y2_scores_labels[:, 4],
-        ensemble_x1y2_x2y2_scores_labels[:, 5],
-    )
+# print(
+#     f"N of total detections: {np.sum(~np.isnan(fused_detections_ds.confidence.values)).item()}"
+# )
 
-    return centroid, shape, confidence, label
-
-
-# %%
-# Prepare kwargs
-kwargs_wbf = {
-    "iou_thr_ensemble": 0.5,
-    "skip_box_thr": 0.0001,
-    "max_n_detections": 500,  # set default?
-}
-kwargs_wbf["image_width_height"] = np.array(
-    [
-        all_models_detections_ds.attrs[img_size]
-        for img_size in ["image_width", "image_height"]
-    ]
-)
-
-# Run WBF vectorized
-centroid_fused, shape_fused, confidence_fused, label_fused = xr.apply_ufunc(
-    soft_nms_wrapper_arrays_2,  # ------------#wbf_wrapper_arrays,
-    all_models_detections_ds.xy_min,  # the underlaying .data array is passed
-    all_models_detections_ds.xy_max,
-    all_models_detections_ds.confidence,
-    all_models_detections_ds.label,
-    kwargs=kwargs_wbf,
-    input_core_dims=[  # do not broadcast across these
-        ["model", "id", "space"],
-        ["model", "id", "space"],
-        ["model", "id"],
-        ["model", "id"],
-    ],
-    output_core_dims=[
-        ["space", "id"],
-        ["space", "id"],
-        ["id"],
-        ["id"],
-    ],
-    vectorize=True,
-    # loop over non-core dims (i.e. image_id);
-    # assumes function only takes arrays over core dims as input
-    exclude_dims={"id"},
-    # to allow dimensions that change size btw input and output
-)
+# plt.figure()
+# plt.hist(fused_detections_ds.confidence.values.reshape(-1,1))
+# plt.show()
 
+# 6 models: 51798
+# 5 models: 50041
+# 4 models: 48314
+# 3 models: 46102
+# 2 models: 40637
+# 1 model: 34917
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Define ground truth dataset
 
 # read annotations as a dataset
 print(annotations_file_path)
+
+# %timeit 1min 18s ± 87.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 gt_bboxes_ds = load_bboxes.from_files(annotations_file_path, format="COCO")
 
 # fix category ID (to be fixed in loader)
@@ -631,10 +380,16 @@ def soft_nms_wrapper_arrays_2(
 gt_bboxes_val_ds = gt_bboxes_ds.sel(image_id=list_image_ids_val)
 
 
+assert set(img_ids_val) == set(list_image_ids_val)
+
 # %%
+
+# %%
+# %timeit 1min 17s ± 60.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 # Alternatively: convert torch dataset into xarray detections dataset
 # .....
-# is it faster?
+# is it faster? nope...
+# maybe faster if I read image_ids from json file?
 val_ds = torch_dataset_to_xr_dataset(val_dataset)
 
 # # %%
@@ -665,20 +420,21 @@ def soft_nms_wrapper_arrays_2(
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Evaluate
-
-# ensemble model
+# Evaluate ensemble model
 fused_detections_ds, gt_bboxes_val_ds = compute_precision_recall_ds(
     pred_bboxes_ds=fused_detections_ds,
     gt_bboxes_ds=gt_bboxes_val_ds,
     iou_threshold=0.1,  # change to 0.5?
 )
 
+print(
+    f"Ensemble model with confidence threshold post fusion: {confidence_th_post_fusion}"
+)
 print(f"Precision: {fused_detections_ds.precision.mean().values:.4f}")
 print(f"Recall: {fused_detections_ds.recall.mean().values:.4f}")
 
-# %%
-# single models
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Evaluate single models
 list_detections_ds_eval = []
 for k, ds in enumerate(list_detections_ds):
     detections_ds, _ = compute_precision_recall_ds(
@@ -693,15 +449,125 @@ def soft_nms_wrapper_arrays_2(
     print(f"Recall: {detections_ds.recall.mean().values:.4f}")
     print("--------------------------------")
 
+# %%
+# Plot precision and recall for each model
+
+
+avg_precision_per_model = [
+    ds.precision.values.mean() for ds in list_detections_ds_eval
+]
+avg_recall_per_model = [
+    ds.recall.values.mean() for ds in list_detections_ds_eval
+]
+
+## precision and recall
+fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+# single models
+ax.plot(avg_precision_per_model, ".-", color="blue")
+# ensemble
+ax.axhline(
+    fused_detections_ds.precision.mean().values,
+    color="blue",
+    linestyle="--",
+    label="ensemble",
+)
+ax.set_ylim(0, 1)
+ax.set_ylabel("average precision per frame", color="blue")
+ax.set_xlabel("model")
+
+ax2 = ax.twinx()
+ax2.plot(avg_recall_per_model, ".-", color="red")
+ax2.axhline(
+    fused_detections_ds.recall.mean().values,
+    color="red",
+    linestyle="--",
+    label="ensemble",
+)
+ax2.legend()
+ax2.set_ylim(0, 1)
+ax2.set_ylabel("average recall per frame", color="red")
+
+ax.set_title(
+    "Ensemble vs individual models OOD "
+    f"(confidence th ensemble: {confidence_th_post_fusion})"
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Check calibration of ensemble model
+# see https://docs.xarray.dev/en/latest/user-guide/groupby.html#groupby
+
+bin_edges = np.arange(confidence_th_post_fusion, 1.01, 0.05)
+bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
+
+grouped_ds = fused_detections_ds.groupby_bins(
+    "confidence", bin_edges, restore_coord_dims=True
+)
+print(grouped_ds)
+# grouped_ds = list_detections_ds_eval[0].groupby_bins(
+#     "confidence", bin_edges, restore_coord_dims=True
+# )
+
+# list(grouped_ds) --> tuples (bin_label, ds)
+# list(grouped_ds)[0][1]
+
+# if group is empty: I need to add empty result to list!
+
+# %%
+# plot histogram
+fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+ax.bar(
+    bin_centers,
+    # [0,] + [g[1].tp.shape[0] for g in list(grouped_ds)],
+    [g[1].tp.shape[0] for g in list(grouped_ds)],
+    width=bin_edges[1] - bin_edges[0],
+    color="skyblue",
+    edgecolor="gray",
+)
+ax.set_xlabel("confidence")
+ax.set_ylabel("detections")
+ax.grid(True, alpha=0.3)
+ax.set_xlim(0, 1)
+
+# %%
+# plot precision per bin
+
+
+def compute_precision(ds_one_bin):
+    return sum(ds_one_bin.tp) / (sum(ds_one_bin.tp) + sum(ds_one_bin.fp))
+
+
+fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+# show bar edges
+ax.bar(
+    0.5 * (bin_edges[:-1] + bin_edges[1:]),
+    # [0,] + [compute_precision(g[1]) for g in list(grouped_ds)],
+    [compute_precision(g[1]) for g in list(grouped_ds)],
+    width=bin_edges[1] - bin_edges[0],
+    color="skyblue",
+    edgecolor="gray",
+)
+ax.plot(
+    0.5 * (bin_edges[:-1] + bin_edges[1:]),
+    0.5 * (bin_edges[:-1] + bin_edges[1:]),  # perfect calibration
+    color="red",
+    linewidth=2,
+    marker="o",
+)
+ax.set_xlabel("confidence")
+ax.set_ylabel("precision")
+ax.grid(True, alpha=0.3)
+ax.set_xlim(0, 1)
+
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# plot ensemble detections on a selected image
+# Plot ensemble detections on a selected image
 
 # idcs_low_precision = np.argwhere(fused_detections_ds.precision.data < 0.5)
 # idcs_high_precision = np.argwhere(fused_detections_ds.precision.data > 0.9)
 
-fused_detections_ds = detections_ds
+# fused_detections_ds = detections_ds------------<
 idcs_imgs_increasing_precision = np.argsort(fused_detections_ds.precision.data)
-step = 5  # 50
+step = 50  # 50
 
 
 # Get first image
@@ -727,7 +593,7 @@ def soft_nms_wrapper_arrays_2(
         ).confidence.values,
         image_id=gt_annotations["image_id"],
         output_dir=Path(
-            "/home/sminano/swc/project_ethology/0th-percentile-ood-aug2023"
+            f"/home/sminano/swc/project_ethology/ensemble-{confidence_th_post_fusion}-confth-ood-aug2023"
         ),
         extra_str=f"{i:03d}",
         precision=fused_detections_ds.isel(

From ea6431c8e604165e09a98d3cd669a7355bef65df Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 1 Aug 2025 18:04:09 +0100
Subject: [PATCH 58/72] Add notes to fix

---
 ethology/detectors/ensembles.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ethology/detectors/ensembles.py b/ethology/detectors/ensembles.py
index 4434fa35..d98359d1 100644
--- a/ethology/detectors/ensembles.py
+++ b/ethology/detectors/ensembles.py
@@ -319,6 +319,8 @@ def combine_detections_across_models_wbf(
     label_fused = label_fused.fillna(-1).astype(int)
 
     # Return a dataset
+    # FIX: why is id not a coordinate in the output dataset?
+    # FIX: order of dimensions should be image_id, space, id
     return xr.Dataset(
         data_vars={
             "position": centroid_fused,

From 2b18620576ab15d6cba5ee152c0796c401655475 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 1 Aug 2025 18:07:19 +0100
Subject: [PATCH 59/72] Combine detections_dict_as_ds and
 detections_dict_as_ds_batch

---
 ethology/detectors/inference.py |  9 +++++----
 ethology/detectors/utils.py     | 33 +++++++++++++++++++--------------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/ethology/detectors/inference.py b/ethology/detectors/inference.py
index 15a00070..25c43641 100644
--- a/ethology/detectors/inference.py
+++ b/ethology/detectors/inference.py
@@ -2,11 +2,11 @@
 
 import pandas as pd
 import torch
+import xarray as xr
 
 from ethology.detectors.utils import (
     concat_detections_ds,
     detections_dict_as_ds,
-    detections_dict_as_ds_batch,
 )
 
 
@@ -78,8 +78,8 @@ def run_detector_on_dataloader(
     model.eval()
 
     # Run detection for each sample in the dataset
-    list_detections_ds = []
-    list_image_ids = []
+    list_detections_ds: list[xr.Dataset] = []
+    list_image_ids: list[int] = []
     for image_batch, annotations_batch in dataloader:
         # Place image batch on device
         image_batch = tuple(image.to(device) for image in image_batch)
@@ -89,7 +89,8 @@ def run_detector_on_dataloader(
 
         # Format as xarray dataset
         # [0] to select single batch dimension
-        detections_ds_batch = detections_dict_as_ds_batch(detections_batch)
+        # detections_ds_batch = detections_dict_as_ds_batch(detections_batch)
+        detections_ds_batch = detections_dict_as_ds(detections_batch)
 
         # Extend lists
         list_detections_ds.extend(detections_ds_batch)
diff --git a/ethology/detectors/utils.py b/ethology/detectors/utils.py
index b30db5a9..b2228574 100644
--- a/ethology/detectors/utils.py
+++ b/ethology/detectors/utils.py
@@ -9,16 +9,13 @@
 def concat_detections_ds(
     list_detections_ds: list[xr.Dataset], index: pd.Index
 ) -> xr.Dataset:
-    """Concatenate detections datasets along new dimension."""
+    """Concatenate detections datasets along new dimension defined by index."""
     # Check index has name
     if index.name is None:
         raise ValueError("Index must have a name")
 
     # Concatenate along new dimension
-    ds = xr.concat(
-        list_detections_ds,
-        index,
-    )
+    ds = xr.concat(list_detections_ds, index=index)
 
     # ensure "label" array is padded with -1 rather than nan
     if "label" in ds.data_vars:
@@ -27,10 +24,10 @@ def concat_detections_ds(
     return ds
 
 
-def detections_dict_as_ds_batch(
-    list_detections: list[dict],
-) -> list[xr.Dataset]:
-    """Reshape list of detections dictionaries as xarray dataset.
+def detections_dict_as_ds(
+    detections: dict | list[dict],
+) -> xr.Dataset | list[xr.Dataset]:
+    """Reshape detections dictionary(ies) as xarray dataset(s).
 
     Input is list of detections dictionaries with keys:
     - "boxes": tensor of shape [N, 4], x1y1x2y2 in pixels
@@ -38,14 +35,22 @@ def detections_dict_as_ds_batch(
     - "labels": tensor of shape [N]
 
     Output is a list of xarray datasets, one for each image in the batch.
+
+    Pytorch models return a list of detection dictionaries, one for each image
+    in the batch.
     """
-    return [
-        detections_dict_as_ds(detections) for detections in list_detections
-    ]
+    if isinstance(detections, dict):
+        return _detections_dict_as_ds(detections)
+    elif isinstance(detections, list):
+        return [detections_dict_as_ds(det) for det in detections]
+    else:
+        raise ValueError(
+            "Detections must be a dictionary or list of dictionaries"
+        )
 
 
-def detections_dict_as_ds(detections: dict) -> xr.Dataset:
-    """Reshape detections dictionaryas xarray dataset.
+def _detections_dict_as_ds(detections: dict) -> xr.Dataset:
+    """Reshape detections dictionary for a single image as xarray dataset.
 
     Input is detections dictionary with keys:
     - "boxes": tensor of shape [N, 4], x1y1x2y2 in pixels

From 116f5429308ebc45cf49d0c012497a9ec47a944f Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 1 Aug 2025 18:08:30 +0100
Subject: [PATCH 60/72] Add transform from detection ds to movement-like ds.
 Add transform from single image ds to x1y1 x2y2 arrays

---
 ethology/detectors/utils.py | 79 +++++++++++++++++++++++++++++++++++--
 1 file changed, 75 insertions(+), 4 deletions(-)

diff --git a/ethology/detectors/utils.py b/ethology/detectors/utils.py
index b2228574..f10f549e 100644
--- a/ethology/detectors/utils.py
+++ b/ethology/detectors/utils.py
@@ -1,5 +1,7 @@
 """Utility functions for transforming detection datasets."""
 
+from typing import Literal
+
 import numpy as np
 import pandas as pd
 import torch
@@ -127,10 +129,10 @@ def detections_x1y1_x2y2_as_da_tuple(
 
 def detections_x1y1_x2y2_as_ds(
     x1y1_x2y2_array: np.ndarray,
-    scores_array: np.ndarray,
-    labels_array: np.ndarray,
+    scores_array: np.ndarray,  # rename to confidence
+    labels_array: np.ndarray,  # rename to category
 ) -> xr.Dataset:
-    """Reshape detections array as xarray dataset.
+    """Reshape detections array for a single image as xarray dataset.
 
     Input is detections array with shape [N, 4], x1y1x2y2 in pixels
     """
@@ -157,10 +159,42 @@ def detections_x1y1_x2y2_as_ds(
     )
 
 
+def detections_ds_as_x1y1_x2y2(
+    ds: xr.Dataset,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Express detections dataset for a single image as tuple of numpy arrays.
+
+    The output arrays are:
+    - x1y1_x2y2_array: numpy array of shape [N, 4], x1y1x2y2 in pixels
+    - scores_array: numpy array of shape [N]
+    - labels_array: numpy array of shape [N]
+    """
+    # Add xy_min and xy_max if not present
+    if all([var_str not in ds.variables for var_str in ["xy_min", "xy_max"]]):
+        ds = add_bboxes_min_max_corners(ds)
+
+    # Check dimensions are "space" and "id"
+    if ds.dims != {"space", "id"}:
+        raise ValueError(
+            "Detections dataset must have exactly two dimensions: space and id"
+        )
+
+    # Extract x1y1x2y2 array
+    x1y1_x2y2_array = np.c_[ds.xy_min.values.T, ds.xy_max.values.T]
+
+    # Remove nan rows
+    slc_nan_rows = np.any(np.isnan(x1y1_x2y2_array), axis=1)
+    x1y1_x2y2_array = x1y1_x2y2_array[~slc_nan_rows]
+    scores_array = ds.confidence.values[~slc_nan_rows]
+    labels_array = ds.label.values[~slc_nan_rows]
+
+    return x1y1_x2y2_array, scores_array, labels_array
+
+
 def add_bboxes_min_max_corners(ds):
     """Add xy_min and xy_max arrays to ds.
 
-    # Compare to box_convert in testing?
+    # Compare to torchvision.ops.box_convert in testing?
     box_convert(
         torch.from_numpy(np.c_[ds.position.T, ds.shape.T]),
         in_fmt="cxcywh",
@@ -170,3 +204,40 @@ def add_bboxes_min_max_corners(ds):
     ds["xy_min"] = ds.position - 0.5 * ds.shape
     ds["xy_max"] = ds.position + 0.5 * ds.shape
     return ds
+
+
+def detections_ds_to_movement_ds(
+    ds: xr.Dataset, type: Literal["poses", "bboxes"]
+) -> xr.Dataset:
+    """Convert detections dataset to movement dataset."""
+    # add id coordinate (FIX this)
+    # ds = ds.assign_coords(
+    #     id=np.arange(ds.sizes['id'])
+    # )
+
+    # ensure relevant dimensions exist
+    if not all(dim in ds.dims for dim in ["image_id", "space", "id"]):
+        raise ValueError(
+            "Detections dataset must have image_id, space, and id dimensions"
+        )
+
+    # if exporting as a poses dataset, add keypoint dimension
+    # as second-to-last dimension
+    if type == "poses":
+        ds = ds.expand_dims("keypoints", axis=-2).assign_coords(
+            keypoints=["centroid"]
+        )
+
+    # remove "label" data array
+    # ds = ds.drop_vars("label")
+
+    # rename dimensions
+    ds = ds.rename({"image_id": "time", "id": "individuals"})
+
+    # make time coordinate a float
+    ds["time"] = ds.time.astype(float)
+
+    # make individuals coordinate a string
+    ds["individuals"] = [f"id_{id}" for id in ds.individuals.values]
+
+    return ds

From 1d11c9577d9c57185438da0e05346837b590b460 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 1 Aug 2025 18:11:51 +0100
Subject: [PATCH 61/72] Fix wrong kwarg to concat

---
 ethology/detectors/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ethology/detectors/utils.py b/ethology/detectors/utils.py
index f10f549e..82ac82e6 100644
--- a/ethology/detectors/utils.py
+++ b/ethology/detectors/utils.py
@@ -17,7 +17,7 @@ def concat_detections_ds(
         raise ValueError("Index must have a name")
 
     # Concatenate along new dimension
-    ds = xr.concat(list_detections_ds, index=index)
+    ds = xr.concat(list_detections_ds, index)
 
     # ensure "label" array is padded with -1 rather than nan
     if "label" in ds.data_vars:

From b409308ffccf0d2cea927e40195ef612a54dff89 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 1 Aug 2025 18:12:46 +0100
Subject: [PATCH 62/72] Run in-domain; add some comments

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 137 ++++++++----------
 1 file changed, 62 insertions(+), 75 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 5cd3e53b..9646c61f 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -20,7 +20,6 @@
     collate_fn_varying_n_bboxes,
     concat_detections_ds,
     run_detector_on_dataloader,
-    # run_detector_on_dataset,
 )
 from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
 from ethology.detectors.utils import add_bboxes_min_max_corners
@@ -36,11 +35,9 @@
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Input data
 
-dataset_dir = Path("/home/sminano/swc/project_crabs/data/aug2023-full")
-# Path("/home/sminano/swc/project_crabs/data/sep2023-full")
-annotations_dir = Path(
-    "/home/sminano/swc/project_crabs/data/aug2023-full/annotations"
-)
+dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
+# Path("/home/sminano/swc/project_crabs/data/aug2023-full")
+annotations_dir = dataset_dir / "annotations"
 annotations_file_path = annotations_dir / "VIA_JSON_combined_coco_gen.json"
 
 experiment_ID = "617393114420881798"
@@ -282,11 +279,11 @@ def plot_and_save_ensemble_detections(
     split_dataset_crab_repo(
         dataset_coco,
         seed_n=ref_cli_args["seed_n"],
-        # config=ref_config,
-        config={
-            "train_fraction": 0.0,
-            "val_over_test_fraction": 1.0,
-        },  # only uses train_fraction and val_over_test_fraction
+        config=ref_config, #------
+        # config={
+        #     "train_fraction": 0.0,
+        #     "val_over_test_fraction": 1.0,
+        # },  # only uses train_fraction and val_over_test_fraction
     )
 )
 
@@ -320,7 +317,8 @@ def plot_and_save_ensemble_detections(
         dataloader=val_dataloader,
         device=device,
     )
-    detections_ds = add_bboxes_min_max_corners(detections_ds)
+    detections_ds = add_bboxes_min_max_corners(detections_ds)  
+    # -- this could be done after concatenating if we are not tracking
     list_detections_ds.append(detections_ds)
 
 
@@ -334,7 +332,7 @@ def plot_and_save_ensemble_detections(
 # Fuse detections across models
 confidence_th_post_fusion = 0.7
 fused_detections_ds = combine_detections_across_models_wbf(
-    all_models_detections_ds,
+    all_models_detections_ds.sel(model=[1, 2, 3, 4, 5]),
     kwargs_wbf={
         "iou_thr_ensemble": 0.5,
         "skip_box_thr": 0.0001,
@@ -361,28 +359,38 @@ def plot_and_save_ensemble_detections(
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Define ground truth dataset
 
-# read annotations as a dataset
+
 print(annotations_file_path)
 
-# %timeit 1min 18s ± 87.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
 gt_bboxes_ds = load_bboxes.from_files(annotations_file_path, format="COCO")
 
 # fix category ID (to be fixed in loader)
 gt_bboxes_ds["category"] = gt_bboxes_ds["category"].where(
     gt_bboxes_ds["category"] != 0, 1
 )
+gt_bboxes_val_ds = gt_bboxes_ds.sel(image_id=img_ids_val)
 
-# select only image_id in val_dataset
-# Note that the max number of annotations per image in the val_dataset
-# will stay as in the original dataset (also, category = -1 is not considered
-# an empty value for xarrays .dropna())
-list_image_ids_val = [annot["image_id"] for img, annot in val_dataset]
-gt_bboxes_val_ds = gt_bboxes_ds.sel(image_id=list_image_ids_val)
 
+# # read annotations as a dataset
+# print(annotations_file_path)
 
-assert set(img_ids_val) == set(list_image_ids_val)
+# # %timeit 1min 18s ± 87.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+# gt_bboxes_ds = load_bboxes.from_files(annotations_file_path, format="COCO")
 
-# %%
+# # fix category ID (to be fixed in loader)
+# gt_bboxes_ds["category"] = gt_bboxes_ds["category"].where(
+#     gt_bboxes_ds["category"] != 0, 1
+# )
+
+# # select only image_id in val_dataset
+# # Note that the max number of annotations per image in the val_dataset
+# # will stay as in the original dataset (also, category = -1 is not considered
+# # an empty value for xarrays .dropna())
+# list_image_ids_val = [annot["image_id"] for img, annot in val_dataset]
+# gt_bboxes_val_ds = gt_bboxes_ds.sel(image_id=list_image_ids_val)
+
+
+# assert set(img_ids_val) == set(list_image_ids_val)
 
 # %%
 # %timeit 1min 17s ± 60.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
@@ -392,31 +400,6 @@ def plot_and_save_ensemble_detections(
 # maybe faster if I read image_ids from json file?
 val_ds = torch_dataset_to_xr_dataset(val_dataset)
 
-# # %%
-# # check data arrays are the same but with annotations in different order
-# # There is no guarantee that annotation with id=15 is the same in the
-# # xr dataset computed from the annotations file and the one computed from
-# # the torch dataset.
-# for idx in range(len(val_ds.image_id.values)):
-#     idcs_sorted_x1 = np.lexsort(
-#         (
-#             gt_bboxes_val_ds.position.values[idx, 1, :],
-#             gt_bboxes_val_ds.position.values[idx, 0, :],
-#         )
-#     )  # sort by x, then y
-#     idcs_sorted_x2 = np.lexsort(
-#         (
-#             val_ds.position.values[idx, 1, :],
-#             val_ds.position.values[idx, 0, :],
-#         )
-#     )  # sort by x, then y
-#     assert np.allclose(
-#         gt_bboxes_val_ds.position.values[idx, :, idcs_sorted_x1],
-#         val_ds.position.values[idx, :, idcs_sorted_x2],
-#         equal_nan=True,
-#         rtol=1e-5,
-#         atol=1e-8,
-#     )
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -488,7 +471,7 @@ def plot_and_save_ensemble_detections(
 ax2.set_ylabel("average recall per frame", color="red")
 
 ax.set_title(
-    "Ensemble vs individual models OOD "
+    "Ensemble vs individual models in-domain "
     f"(confidence th ensemble: {confidence_th_post_fusion})"
 )
 
@@ -496,6 +479,7 @@ def plot_and_save_ensemble_detections(
 # Check calibration of ensemble model
 # see https://docs.xarray.dev/en/latest/user-guide/groupby.html#groupby
 
+# bins are defined from confidence cutoff
 bin_edges = np.arange(confidence_th_post_fusion, 1.01, 0.05)
 bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
 
@@ -536,6 +520,9 @@ def compute_precision(ds_one_bin):
     return sum(ds_one_bin.tp) / (sum(ds_one_bin.tp) + sum(ds_one_bin.fp))
 
 
+# grouped_ds.apply(compute_precision) throws an error
+
+# %%
 fig, ax = plt.subplots(1, 1, figsize=(10, 6))
 # show bar edges
 ax.bar(
@@ -557,17 +544,17 @@ def compute_precision(ds_one_bin):
 ax.set_ylabel("precision")
 ax.grid(True, alpha=0.3)
 ax.set_xlim(0, 1)
+ax.set_title(
+    "Ensemble model in-domain "
+    f"(confidence th ensemble: {confidence_th_post_fusion})"
+)
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Plot ensemble detections on a selected image
-
-# idcs_low_precision = np.argwhere(fused_detections_ds.precision.data < 0.5)
-# idcs_high_precision = np.argwhere(fused_detections_ds.precision.data > 0.9)
+# Plot ensemble detections on a selected images
 
-# fused_detections_ds = detections_ds------------<
 idcs_imgs_increasing_precision = np.argsort(fused_detections_ds.precision.data)
-step = 50  # 50
+step = 1  # 50
 
 
 # Get first image
@@ -593,7 +580,7 @@ def compute_precision(ds_one_bin):
         ).confidence.values,
         image_id=gt_annotations["image_id"],
         output_dir=Path(
-            f"/home/sminano/swc/project_ethology/ensemble-{confidence_th_post_fusion}-confth-ood-aug2023"
+            f"/home/sminano/swc/project_ethology/ensemble-{confidence_th_post_fusion}-confth-indomain-sep2023"
         ),
         extra_str=f"{i:03d}",
         precision=fused_detections_ds.isel(
@@ -605,25 +592,25 @@ def compute_precision(ds_one_bin):
 # %%
 # %matplotlib widget
 # %%
-fig, ax = plt.subplots()
-ax.hist(fused_detections_ds.precision.values)
-ax.axvline(
-    fused_detections_ds.precision.values.mean(), color="red", linestyle="--"
-)
-ax.set_xlim(0, 1)
-ax.set_xlabel("Precision per frame")
-ax.set_ylabel("count (frames)")
-ax.set_title(f"Precision OOD (n={fused_detections_ds.sizes['image_id']})")
-
-fig, ax = plt.subplots()
-ax.hist(fused_detections_ds.recall.values)
-ax.axvline(
-    fused_detections_ds.recall.values.mean(), color="red", linestyle="--"
-)
-ax.set_xlim(0, 1)
-ax.set_xlabel("Recall per frame")
-ax.set_ylabel("count (frames)")
-ax.set_title(f"Recall OOD (n={fused_detections_ds.sizes['image_id']})")
+# fig, ax = plt.subplots()
+# ax.hist(fused_detections_ds.precision.values)
+# ax.axvline(
+#     fused_detections_ds.precision.values.mean(), color="red", linestyle="--"
+# )
+# ax.set_xlim(0, 1)
+# ax.set_xlabel("Precision per frame")
+# ax.set_ylabel("count (frames)")
+# ax.set_title(f"Precision (n={fused_detections_ds.sizes['image_id']})")
+
+# fig, ax = plt.subplots()
+# ax.hist(fused_detections_ds.recall.values)
+# ax.axvline(
+#     fused_detections_ds.recall.values.mean(), color="red", linestyle="--"
+# )
+# ax.set_xlim(0, 1)
+# ax.set_xlabel("Recall per frame")
+# ax.set_ylabel("count (frames)")
+# ax.set_title(f"Recall (n={fused_detections_ds.sizes['image_id']})")
 
 # plt.show()
 # %%

From d8bee2372a7ef7872356a02953661e312cf1a1d4 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 1 Aug 2025 18:13:44 +0100
Subject: [PATCH 63/72] Add notebook to run ensemble on video (WIP)

---
 notebooks/notebook_run_ensemble_on_video.py | 370 ++++++++++++++++++++
 1 file changed, 370 insertions(+)
 create mode 100644 notebooks/notebook_run_ensemble_on_video.py

diff --git a/notebooks/notebook_run_ensemble_on_video.py b/notebooks/notebook_run_ensemble_on_video.py
new file mode 100644
index 00000000..5ef30408
--- /dev/null
+++ b/notebooks/notebook_run_ensemble_on_video.py
@@ -0,0 +1,370 @@
+# %%
+import csv
+from datetime import datetime
+from pathlib import Path
+
+import cv2
+import numpy as np
+import pandas as pd
+import torch
+import torchvision.transforms.v2 as transforms
+import xarray as xr
+from boxmot import BotSort
+from movement.io import save_poses
+from tqdm import tqdm
+
+from ethology.detectors.ensembles import combine_detections_across_models_wbf
+from ethology.detectors.inference import (
+    concat_detections_ds,
+    detections_dict_as_ds,
+)
+from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
+from ethology.detectors.utils import (
+    add_bboxes_min_max_corners,
+    detections_ds_as_x1y1_x2y2,
+    detections_ds_to_movement_ds,
+)
+from ethology.mlflow import (
+    read_cli_args_from_mlflow_params,
+    read_config_from_mlflow_params,
+    read_mlflow_params,
+)
+
+# Set xarray options
+xr.set_options(display_expand_attrs=False)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Input data
+
+video_path = Path(
+    "/home/sminano/swc/project_ethology/04.09.2023-04-Right_RE_test.mp4"
+)
+
+experiment_ID = "617393114420881798"
+ml_runs_experiment_dir = (
+    Path("/home/sminano/swc/project_crabs/ml-runs") / experiment_ID
+)
+
+# I pick seed 42 for each set of models
+models_dict = {
+    "above_0th": ml_runs_experiment_dir / "f348d9d196934073bece1b877cbc4d38",
+    "above_1st": ml_runs_experiment_dir / "879d2f77e2b24adcb06b87d2fede6a04",
+    "above_5th": ml_runs_experiment_dir / "75583ec227e3444ab692b99c64795325",
+    "above_10th": ml_runs_experiment_dir / "4acc37206b1e4f679d535c837bee2c2f",
+    "above_25th": ml_runs_experiment_dir / "fdcf88fcbcc84fbeb94b45ca6b6f8914",
+    "above_50th": ml_runs_experiment_dir / "daa05ded0ea047388c9134bf044061c5",
+}
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Set default device: CUDA if available, otherwise mps, otherwise CPU
+device = torch.device(
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+
+print(f"Using device: {device}")
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Helper functions
+def open_video(video_path: str | Path) -> cv2.VideoCapture:
+    """Open video file."""
+    video_object = cv2.VideoCapture(video_path)
+    if not video_object.isOpened():
+        raise Exception("Error opening video file")
+    return video_object
+
+
+def write_tracked_detections_to_csv(
+    csv_file_path: str,
+    tracked_bboxes_dict: dict,
+    frame_name_regexp: str = "frame_{frame_idx:08d}.png",
+    all_frames_size: int = 8888,
+):
+    """Write tracked detections to a csv file."""
+    # Initialise csv file
+    with open(csv_file_path, "w") as csv_file:
+        csv_writer = csv.writer(csv_file)
+
+        # write header following VIA convention
+        # https://www.robots.ox.ac.uk/~vgg/software/via/docs/face_track_annotation.html
+        csv_writer.writerow(
+            (
+                "filename",
+                "file_size",
+                "file_attributes",
+                "region_count",
+                "region_id",
+                "region_shape_attributes",
+                "region_attributes",
+            )
+        )
+
+        # write detections
+        # loop thru frames
+        for frame_idx in tracked_bboxes_dict:
+            # loop thru all boxes in frame
+            for bbox, id, pred_score in zip(
+                tracked_bboxes_dict[frame_idx]["tracked_boxes"],
+                tracked_bboxes_dict[frame_idx]["ids"],
+                tracked_bboxes_dict[frame_idx]["scores"],
+                strict=False,
+            ):
+                # extract shape
+                xmin, ymin, xmax, ymax = bbox
+                width_box = int(xmax - xmin)
+                height_box = int(ymax - ymin)
+
+                # Add to csv
+                csv_writer.writerow(
+                    (
+                        frame_name_regexp.format(
+                            frame_idx=frame_idx
+                        ),  # f"frame_{frame_idx:08d}.png",  # frame index!
+                        all_frames_size,  # frame size
+                        '{{"clip":{}}}'.format("123"),
+                        1,
+                        0,
+                        f'{{"name":"rect","x":{xmin},"y":{ymin},"width":{width_box},"height":{height_box}}}',
+                        f'{{"track":"{int(id)}", "confidence":"{pred_score}"}}',
+                    )
+                )
+
+    return csv_file_path
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Define list of models in ensemble
+
+list_models = []
+list_config = []
+list_cli_args = []
+for model_key in models_dict:
+    # Retrieve model config and CLI args from mlflow
+    trained_model_path = str(
+        models_dict[model_key] / "checkpoints" / "last.ckpt"
+    )
+
+    mlflow_params = read_mlflow_params(trained_model_path)
+    config = read_config_from_mlflow_params(mlflow_params)
+    cli_args = read_cli_args_from_mlflow_params(mlflow_params)
+
+    print(
+        f"Run name: {mlflow_params['run_name']}, trained on "
+        f"{Path(cli_args['dataset_dirs'][0]).name}, "
+        f"{Path(cli_args['annotation_files'][0]).name}"
+    )
+    # ------------------------------------
+    # Load model
+    model = load_fasterrcnn_resnet50_fpn_v2(
+        trained_model_path,
+        num_classes=config["num_classes"],
+        device=device,  # device
+    )
+    model.eval()
+    list_models.append(model)
+    list_config.append(config)
+    list_cli_args.append(cli_args)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Check that all models have the same dataset config
+ref_config = list_config[0]
+for key in ["train_fraction", "val_over_test_fraction"]:
+    assert all(config[key] == ref_config[key] for config in list_config)
+
+ref_cli_args = list_cli_args[0]
+assert all(
+    cli_args["seed_n"] == ref_cli_args["seed_n"] for cli_args in list_cli_args
+)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Define transforms for inference
+inference_transforms = transforms.Compose(
+    [
+        transforms.ToImage(),
+        transforms.ToDtype(torch.float32, scale=True),
+    ]
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Helper function: run detector on video
+
+
+def run_detector_on_video(
+    model: torch.nn.Module,
+    video_path: str | Path,
+    device: torch.device,
+    inference_transforms: transforms.Compose,
+) -> xr.Dataset:
+    """Run detection on a video."""
+
+    # Ensure model is in evaluation mode
+    model.eval()
+
+    # Loop over frames
+    list_detections_ds = []
+    list_image_ids = []
+    frame_idx = 0
+    input_video_object = open_video(video_path)
+    # total_n_frames = int(input_video_object.get(cv2.CAP_PROP_FRAME_COUNT))
+    while input_video_object.isOpened():
+        # Read frame
+        ret, frame = input_video_object.read()
+        if not ret:
+            break  # end of video or error
+
+        # Place image tensor on device and add batch dimension
+        image_tensor = inference_transforms(frame).to(device)[None]
+
+        # Run detection
+        with torch.no_grad():
+            # use [0] to select the one image in the batch
+            # Returns: dictionary with data of the predicted bounding boxes.
+            # The keys are: "boxes", "scores", and "labels". The labels
+            # refer to the class of the object detected, and not its ID.
+            detections = model(image_tensor)
+
+        # Format as xarray dataset
+        # [0] to select single batch dimension
+        detections_ds = detections_dict_as_ds(detections[0])
+
+        # Append to list
+        list_detections_ds.append(detections_ds)
+        list_image_ids.append(frame_idx)
+
+        # Update frame index
+        frame_idx += 1
+
+    # Concatenate all detections datasets along image_id dimension
+    detections_dataset = concat_detections_ds(
+        list_detections_ds,
+        pd.Index(list_image_ids, name="image_id"),
+    )
+
+    # Add image_width and image_height as attributes
+    # (we assume all images in the dataset have the same width and height
+    # as the last image)
+    detections_dataset.attrs["image_width"] = image_tensor.shape[-1]  # columns
+    detections_dataset.attrs["image_height"] = image_tensor.shape[-2]  # rows
+
+    return detections_dataset
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Run ensemble on video
+# can I make it faster?
+# can I vectorize this? (pytorch forum question)
+list_detections_ds = []
+for model in tqdm(list_models):
+    model.to(device)
+
+    detections_ds = run_detector_on_video(
+        model=model,
+        video_path=video_path,
+        device=device,
+        inference_transforms=inference_transforms,
+    )
+    # detections_ds = add_bboxes_min_max_corners(detections_ds)
+    list_detections_ds.append(detections_ds)
+
+# Concatenate detections across models
+all_models_detections_ds = concat_detections_ds(
+    list_detections_ds,
+    pd.Index(range(len(list_detections_ds)), name="model"),
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Fuse detections across models
+
+all_models_detections_ds = add_bboxes_min_max_corners(all_models_detections_ds)
+
+confidence_th_post_fusion = 0.4
+fused_detections_ds = combine_detections_across_models_wbf(
+    all_models_detections_ds,
+    kwargs_wbf={
+        "iou_thr_ensemble": 0.5,
+        "skip_box_thr": 0.0001,
+        "max_n_detections": 300,
+        "confidence_th_post_fusion": confidence_th_post_fusion,
+    },
+)
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Track detections using boxmot
+
+# Initialize the tracker
+tracker = BotSort(
+    reid_weights=Path("osnet_x0_25_msmt17.pt"),  # Path to ReID model
+    device=device,  # "0" # why not device? why is this in GPU if we then copy to CPU?
+    half=False,
+)
+
+# Consider detections with confidence > 0.5
+# confidence_th_tracker = 0.5
+# fused_detections_ds = fused_detections_ds.where(
+#     fused_detections_ds.confidence > confidence_th_tracker,
+#     drop=True,  # drops all nan entries
+# )
+
+
+# TODO: vectorize with apply_ufunc?
+list_tracked_ds = []
+for image_id in np.sort(fused_detections_ds.image_id.values):
+    # Convert detections to numpy arrays
+    detections_one_img_ds = fused_detections_ds.sel(image_id=image_id)
+    x1y1_x2y2_array, scores_array, labels_array = detections_ds_as_x1y1_x2y2(
+        detections_one_img_ds
+    )
+    # Update the tracker
+    tracked_boxes_array = tracker.update(
+        np.c_[x1y1_x2y2_array, scores_array, labels_array],
+        # frame, # how can I use image? pass video optionally?
+    )
+    # returns: M X 8, 8 being (x, y, x, y, id, conf, cls, ind)
+    # ind is the index of the corresponding detection in the detections_array
+
+    # Can I do away with reordering the predictions
+    ind = tracked_boxes_array[:, 7]
+    # reorder ids
+    detections_one_img_ds = detections_one_img_ds.reindex({"id": ind})
+    # reset index to 0
+    detections_one_img_ds = detections_one_img_ds.assign_coords(
+        {"id": range(len(ind))}
+    )
+
+    # ds = tracks_x1y1_x2y2_as_ds(
+    #     tracked_boxes_array[:, :4],  # x1y1x2y2
+    #     tracked_boxes_array[:, 4],  # id
+    #     tracked_boxes_array[:, 5],  # confidence
+    #     tracked_boxes_array[:, 6],  # label
+    # )
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Format detections dataset as a movement dataset
+
+# add id coordinate (FIX this)
+fused_detections_ds = fused_detections_ds.assign_coords(
+    id=np.arange(fused_detections_ds.sizes["id"])
+)
+
+# format as movement dataset
+fused_detections_as_movement_ds = detections_ds_to_movement_ds(
+    fused_detections_ds, type="poses"
+)
+
+
+# %%
+# save as sleap analysis file
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+save_poses.to_sleap_analysis_file(
+    fused_detections_as_movement_ds,
+    f"detections_ensemble_{video_path.stem}_{timestamp}.h5",
+)
+
+
+# %%

From 1113db31b0f6b0f3f1ccda73226e816586f18954 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 28 Aug 2025 19:13:35 +0100
Subject: [PATCH 64/72] Run ensemble on video first draft

---
 ethology/detectors/utils.py                 |  94 +++++++++++----
 notebooks/notebook_run_ensemble_on_video.py | 123 ++++++++++++++------
 2 files changed, 159 insertions(+), 58 deletions(-)

diff --git a/ethology/detectors/utils.py b/ethology/detectors/utils.py
index 82ac82e6..fea092b5 100644
--- a/ethology/detectors/utils.py
+++ b/ethology/detectors/utils.py
@@ -78,17 +78,21 @@ def _detections_dict_as_ds(detections: dict) -> xr.Dataset:
     )
 
 
-def detections_x1y1_x2y2_as_da_tuple(
+def x1y1_x2y2_as_da_tuple(
     x1y1_x2y2_array: np.ndarray,
     scores_array: np.ndarray,
     labels_array: np.ndarray,
+    id_array: np.ndarray | None = None,
 ) -> tuple[xr.DataArray, xr.DataArray, xr.DataArray, xr.DataArray]:
-    """Reshape detections array as xarray dataset.
+    """Reshape detections / tracks array as xarray dataset.
 
     Input is detections array with shape [N, 4], x1y1x2y2 in pixels
     """
-    # Create xarray dataset
     n_detections = x1y1_x2y2_array.shape[0]
+    if id_array is None:
+        id_array = np.arange(n_detections)
+
+    # centroid dataarray
     centroid_da = xr.DataArray(
         data=0.5
         * (
@@ -97,10 +101,11 @@ def detections_x1y1_x2y2_as_da_tuple(
         dims=["space", "id"],
         coords={
             "space": ["x", "y"],
-            "id": list(range(n_detections)),
+            "id": id_array,
         },
     )
 
+    # shape dataarray
     shape_da = xr.DataArray(
         data=(
             x1y1_x2y2_array[:, 2:4] - x1y1_x2y2_array[:, 0:2]
@@ -108,20 +113,22 @@ def detections_x1y1_x2y2_as_da_tuple(
         dims=["space", "id"],
         coords={
             "space": ["x", "y"],
-            "id": list(range(n_detections)),
+            "id": id_array,
         },
     )
 
+    # confidence dataarray
     confidence_da = xr.DataArray(
         data=scores_array,
         dims=["id"],
-        coords={"id": list(range(n_detections))},
+        coords={"id": id_array},
     )
 
+    # label dataarray
     label_da = xr.DataArray(
         data=labels_array,
         dims=["id"],
-        coords={"id": list(range(n_detections))},
+        coords={"id": id_array},
     )
 
     return centroid_da, shape_da, confidence_da, label_da
@@ -137,18 +144,50 @@ def detections_x1y1_x2y2_as_ds(
     Input is detections array with shape [N, 4], x1y1x2y2 in pixels
     """
     # Remove nan rows
-    slc_nan_rows = np.any(np.isnan(x1y1_x2y2_array), axis=1)
-    x1y1_x2y2_array = x1y1_x2y2_array[~slc_nan_rows]
-    scores_array = scores_array[~slc_nan_rows]
-    labels_array = labels_array[~slc_nan_rows]
+    x1y1_x2y2_array, scores_array, labels_array = (
+        remove_rows_with_nan_in_first_array(
+            [x1y1_x2y2_array, scores_array, labels_array]
+        )
+    )
 
     # Create dataarrays for dataset
-    centroid_da, shape_da, confidence_da, label_da = (
-        detections_x1y1_x2y2_as_da_tuple(
-            x1y1_x2y2_array, scores_array, labels_array
+    centroid_da, shape_da, confidence_da, label_da = x1y1_x2y2_as_da_tuple(
+        x1y1_x2y2_array, scores_array, labels_array
+    )
+
+    return xr.Dataset(
+        data_vars={
+            "position": centroid_da,
+            "shape": shape_da,
+            "confidence": confidence_da,
+            "label": label_da,
+        }
+    )
+
+
+def tracks_x1y1_x2y2_as_ds(
+    x1y1_x2y2_array: np.ndarray,
+    scores_array: np.ndarray,  # rename to confidence
+    labels_array: np.ndarray,  # rename to category
+    id_array: np.ndarray,
+) -> xr.Dataset:
+    """Reshape tracks array for a single image as xarray dataset.
+
+    Input is tracks array with shape [N, 4], x1y1x2y2 in pixels
+    and shape [N, 2], id and ind
+    """
+    # Remove nan rows
+    x1y1_x2y2_array, scores_array, labels_array, id_array = (
+        remove_rows_with_nan_in_first_array(
+            [x1y1_x2y2_array, scores_array, labels_array, id_array]
         )
     )
 
+    # Create dataarrays for dataset
+    centroid_da, shape_da, confidence_da, label_da = x1y1_x2y2_as_da_tuple(
+        x1y1_x2y2_array, scores_array, labels_array, id_array
+    )
+
     return xr.Dataset(
         data_vars={
             "position": centroid_da,
@@ -173,20 +212,21 @@ def detections_ds_as_x1y1_x2y2(
     if all([var_str not in ds.variables for var_str in ["xy_min", "xy_max"]]):
         ds = add_bboxes_min_max_corners(ds)
 
-    # Check dimensions are "space" and "id"
-    if ds.dims != {"space", "id"}:
-        raise ValueError(
-            "Detections dataset must have exactly two dimensions: space and id"
-        )
+    # # Check dimensions are "space" and "id"
+    # if ds.dims != {"space", "id"}:
+    #     raise ValueError(
+    #         "Detections dataset must have exactly two dimensions: space and id"
+    #     )
 
     # Extract x1y1x2y2 array
     x1y1_x2y2_array = np.c_[ds.xy_min.values.T, ds.xy_max.values.T]
 
     # Remove nan rows
-    slc_nan_rows = np.any(np.isnan(x1y1_x2y2_array), axis=1)
-    x1y1_x2y2_array = x1y1_x2y2_array[~slc_nan_rows]
-    scores_array = ds.confidence.values[~slc_nan_rows]
-    labels_array = ds.label.values[~slc_nan_rows]
+    x1y1_x2y2_array, scores_array, labels_array = (
+        remove_rows_with_nan_in_first_array(
+            [x1y1_x2y2_array, ds.confidence.values, ds.label.values]
+        )
+    )
 
     return x1y1_x2y2_array, scores_array, labels_array
 
@@ -241,3 +281,11 @@ def detections_ds_to_movement_ds(
     ds["individuals"] = [f"id_{id}" for id in ds.individuals.values]
 
     return ds
+
+
+def remove_rows_with_nan_in_first_array(
+    list_arrays: list[np.ndarray],
+) -> list[np.ndarray]:
+    """Remove rows with nan values from list of arrays."""
+    slc_nan_rows = np.any(np.isnan(list_arrays[0]), axis=1)
+    return [arr[~slc_nan_rows] for arr in list_arrays]
diff --git a/notebooks/notebook_run_ensemble_on_video.py b/notebooks/notebook_run_ensemble_on_video.py
index 5ef30408..5b75a457 100644
--- a/notebooks/notebook_run_ensemble_on_video.py
+++ b/notebooks/notebook_run_ensemble_on_video.py
@@ -9,7 +9,7 @@
 import torch
 import torchvision.transforms.v2 as transforms
 import xarray as xr
-from boxmot import BotSort
+from boxmot import BoostTrack
 from movement.io import save_poses
 from tqdm import tqdm
 
@@ -23,6 +23,7 @@
     add_bboxes_min_max_corners,
     detections_ds_as_x1y1_x2y2,
     detections_ds_to_movement_ds,
+    tracks_x1y1_x2y2_as_ds,
 )
 from ethology.mlflow import (
     read_cli_args_from_mlflow_params,
@@ -55,6 +56,12 @@
     "above_50th": ml_runs_experiment_dir / "daa05ded0ea047388c9134bf044061c5",
 }
 
+output_dir = Path("/home/sminano/swc/project_ethology/ensemble_tracking_output")
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Create output directory
+output_dir.mkdir(parents=True, exist_ok=True)
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Set default device: CUDA if available, otherwise mps, otherwise CPU
 device = torch.device(
@@ -293,77 +300,123 @@ def run_detector_on_video(
     },
 )
 
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Format detections as a movement dataset
+
+# add id coordinate (FIX this)
+fused_detections_ds = fused_detections_ds.assign_coords(
+    id=np.arange(fused_detections_ds.sizes["id"])
+)
+
+# format as movement dataset
+fused_detections_as_movement_ds = detections_ds_to_movement_ds(
+    fused_detections_ds, type="poses"
+)
+
+# save as sleap analysis file
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+save_poses.to_sleap_analysis_file(
+    fused_detections_as_movement_ds,
+    output_dir / f"detections_ensemble_{video_path.stem}_{timestamp}.h5",
+)
+
+
+
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Track detections using boxmot
 
 # Initialize the tracker
-tracker = BotSort(
-    reid_weights=Path("osnet_x0_25_msmt17.pt"),  # Path to ReID model
-    device=device,  # "0" # why not device? why is this in GPU if we then copy to CPU?
+# tracker = BotSort(
+#     reid_weights=Path("osnet_x0_25_msmt17.pt"),  # Path to ReID model
+#     device=device,  # "0" # why not device? why is this in GPU if we then copy to CPU?
+#     half=False,
+# )
+
+tracker = BoostTrack(
+    reid_weights=Path("osnet_x0_25_msmt17.pt"),
+    device=device,
     half=False,
+    max_age=1000, # frames
+    min_hits=1,
+    det_thresh=0,  # already filtered by confidence_th_post_fusion
+    iou_threshold=0.1,  # for association
+    aspect_ratio_thresh=1000,
+    min_box_area=0, # no minimum box area
 )
 
-# Consider detections with confidence > 0.5
-# confidence_th_tracker = 0.5
-# fused_detections_ds = fused_detections_ds.where(
-#     fused_detections_ds.confidence > confidence_th_tracker,
-#     drop=True,  # drops all nan entries
-# )
 
+# %%
 
 # TODO: vectorize with apply_ufunc?
 list_tracked_ds = []
+input_video_object = open_video(video_path)
 for image_id in np.sort(fused_detections_ds.image_id.values):
     # Convert detections to numpy arrays
     detections_one_img_ds = fused_detections_ds.sel(image_id=image_id)
     x1y1_x2y2_array, scores_array, labels_array = detections_ds_as_x1y1_x2y2(
         detections_one_img_ds
     )
+
+    # Get frame from video
+    ret, frame = input_video_object.read()
+    if not ret:
+        break
+
     # Update the tracker
+    #   INPUT:  M X (x, y, x, y, conf, cls)
+    #   OUTPUT: M X (x, y, x, y, id, conf, cls, ind)
+    # ind is the index of the corresponding detection in the detections_array
     tracked_boxes_array = tracker.update(
         np.c_[x1y1_x2y2_array, scores_array, labels_array],
-        # frame, # how can I use image? pass video optionally?
+        frame,
     )
-    # returns: M X 8, 8 being (x, y, x, y, id, conf, cls, ind)
-    # ind is the index of the corresponding detection in the detections_array
 
-    # Can I do away with reordering the predictions
-    ind = tracked_boxes_array[:, 7]
-    # reorder ids
-    detections_one_img_ds = detections_one_img_ds.reindex({"id": ind})
-    # reset index to 0
-    detections_one_img_ds = detections_one_img_ds.assign_coords(
-        {"id": range(len(ind))}
+    # # Can I do away with reordering the predictions
+    # ind = tracked_boxes_array[:, -1].astype(int)
+    # # select detections
+    # tracked_ds = detections_one_img_ds.sel(id=ind)
+    # # reorder ids per frame
+    # detections_one_img_ds = detections_one_img_ds.reindex({"id": ind})
+    # # reset index to 0
+    # detections_one_img_ds = detections_one_img_ds.assign_coords(
+    #     {"id": range(len(ind))}
+    # )
+
+    tracked_ds = tracks_x1y1_x2y2_as_ds(
+        tracked_boxes_array[:, :4],  # centroid x1y1x2y2
+        tracked_boxes_array[:, 5],  # confidence
+        tracked_boxes_array[:, 6].astype(int),  # label
+        tracked_boxes_array[:, 4].astype(int),  # id
     )
 
-    # ds = tracks_x1y1_x2y2_as_ds(
-    #     tracked_boxes_array[:, :4],  # x1y1x2y2
-    #     tracked_boxes_array[:, 4],  # id
-    #     tracked_boxes_array[:, 5],  # confidence
-    #     tracked_boxes_array[:, 6],  # label
-    # )
+    # add image_id coordinate
+    tracked_ds = tracked_ds.assign_coords(image_id=image_id)
 
+    list_tracked_ds.append(tracked_ds)
 
+# Concatenate all tracked detections datasets along image_id dimension
+tracked_ds_all_frames = concat_detections_ds(
+    list_tracked_ds, pd.Index(range(len(list_tracked_ds)), name="image_id")
+)
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Format detections dataset as a movement dataset
+# Format tracked detections dataset as a movement dataset
 
-# add id coordinate (FIX this)
-fused_detections_ds = fused_detections_ds.assign_coords(
-    id=np.arange(fused_detections_ds.sizes["id"])
+# reindex id coordinate to start from 0
+tracked_ds_all_frames = tracked_ds_all_frames.assign_coords(
+    id=np.arange(tracked_ds_all_frames.sizes["id"])
 )
 
 # format as movement dataset
-fused_detections_as_movement_ds = detections_ds_to_movement_ds(
-    fused_detections_ds, type="poses"
+tracks_as_movement_ds = detections_ds_to_movement_ds(
+    tracked_ds_all_frames, type="poses"
 )
 
 
-# %%
 # save as sleap analysis file
 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 save_poses.to_sleap_analysis_file(
-    fused_detections_as_movement_ds,
-    f"detections_ensemble_{video_path.stem}_{timestamp}.h5",
+    tracks_as_movement_ds,
+    output_dir / f"tracks_ensemble_{video_path.stem}_{timestamp}.h5",
 )
 
 

From ac935677328b4eb6311476890aff555cfe9479a1 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 1 Sep 2025 12:40:26 +0100
Subject: [PATCH 65/72] Define intervals in calibration curve

---
 notebooks/notebook_evaluate_binned_performance.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/notebooks/notebook_evaluate_binned_performance.py b/notebooks/notebook_evaluate_binned_performance.py
index caf1fd43..8539db15 100644
--- a/notebooks/notebook_evaluate_binned_performance.py
+++ b/notebooks/notebook_evaluate_binned_performance.py
@@ -342,7 +342,7 @@ def plot_missed_detections_per_bin(
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Compute bins using full GT annotations
+# Compute diagonal bins using full GT annotations
 # We bin the size of the bbox diagonal
 
 coco_full_gt = COCO(str(full_gt_annotations_file))
@@ -442,10 +442,11 @@ def plot_missed_detections_per_bin(
         Path(cli_args["annotation_files"][0]).name
         == "VIA_JSON_combined_coco_gen.json"
     ):
+        # need to use the old annotations file because the new one has 
+        # different image IDs
         dataset_coco = create_coco_dataset(
             images_dir=Path(dataset_dir) / "frames",
-            annotations_file=annotations_dir
-            / "VIA_JSON_combined_coco_gen.json",
+            annotations_file="/home/sminano/swc/project_ethology/sept2023_annotations.bk/VIA_JSON_combined_coco_gen.json",
             composed_transform=inference_transforms,
         )
     else:
@@ -536,6 +537,8 @@ def plot_missed_detections_per_bin(
 predictions_df["confidence_bins"] = pd.cut(
     predictions_df["confidence"],
     bins=bin_edges,
+    right=False,
+    include_lowest=True,
 )
 
 precision_per_confidence_bin = predictions_df.groupby(
@@ -563,6 +566,7 @@ def plot_missed_detections_per_bin(
     ax=ax,
 )
 
+# plot perfect calibration line
 ax.plot(
     np.arange(len(calibration_df)),  # bin indices
     (bin_edges[:-1] + bin_edges[1:]) / 2,  # perfect calibration
@@ -576,7 +580,7 @@ def plot_missed_detections_per_bin(
     f"{model_key} - calibration curve (n={precision_per_confidence_bin.sum()})"
 )
 ax.set_xlabel("confidence")
-ax.set_ylabel("Precision")
+ax.set_ylabel("precision")
 ax.tick_params(axis="x", rotation=45)
 ax.grid(True, alpha=0.3)
 

From 8df13758328774f41f6f704ca029f0db160e9d02 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 1 Sep 2025 12:40:55 +0100
Subject: [PATCH 66/72] Add description to module

---
 notebooks/notebook_mlflow_plots.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/notebooks/notebook_mlflow_plots.py b/notebooks/notebook_mlflow_plots.py
index b6b58ddf..b1819403 100644
--- a/notebooks/notebook_mlflow_plots.py
+++ b/notebooks/notebook_mlflow_plots.py
@@ -1,7 +1,6 @@
-"""Run detection on a Pytorch dataset and export results as a movement dataset.
+"""Plot precision and recall values for different models.
 
-A script to run detection only (no tracking) on a Pytorch dataset and
-export the results in a format that can be loaded in movement napari widget.
+Uses MLflow output csv files.
 """
 
 # %%
@@ -133,7 +132,7 @@
 # run_slurm_1103832_0_17_val_set_full -- evaluated using corrected full GT annotations
 # (with 0-based image ID)
 csv_file = Path(
-    "/home/sminano/swc/project_ethology/figs_subset_annotations/run_slurm_1103832_0_17_val_set_full.csv"
+    "/home/sminano/swc/project_ethology/figs_subset_annotations/run_slurm_1098903_0_17_full_Aug.csv"
 )
 
 # read csv
@@ -163,6 +162,8 @@
 # plot precision and recall
 fig, ax = plt.subplots(figsize=(10, 6))
 
+ymin = 0.4
+
 # Precision plot
 ax.scatter(
     df["percentile"],
@@ -181,7 +182,7 @@
     linewidth=4,
     color="blue",
 )
-ax.set_ylim(0.4, 1.00)
+ax.set_ylim(ymin, 1.00)
 ax.set_xlabel("model trained on bboxes > percentile")
 ax.set_ylabel(f"{eval_set} precision", color="blue")
 ax.tick_params(axis="y", labelcolor="blue")
@@ -204,7 +205,7 @@
     linewidth=4,
     color="red",
 )
-ax2.set_ylim(0.4, 1.00)
+ax2.set_ylim(ymin, 1.00)
 ax2.set_ylabel(f"{eval_set} recall", color="red")
 ax2.tick_params(axis="y", labelcolor="red")
 

From 9fe6826b3417352aa92f2a19b248a4d9917a4c75 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 1 Sep 2025 12:43:07 +0100
Subject: [PATCH 67/72] Working on ensemble on eval dataset

---
 .../notebook_run_ensemble_on_eval_dataset.py  | 43 +++++++++++++------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 9646c61f..931d2d20 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -279,7 +279,8 @@ def plot_and_save_ensemble_detections(
     split_dataset_crab_repo(
         dataset_coco,
         seed_n=ref_cli_args["seed_n"],
-        config=ref_config, #------
+        config=ref_config, 
+        # if Aug dataset, use:
         # config={
         #     "train_fraction": 0.0,
         #     "val_over_test_fraction": 1.0,
@@ -310,14 +311,23 @@ def plot_and_save_ensemble_detections(
 # can I vectorize this? (pytorch forum question)
 list_detections_ds = []
 for model in tqdm(list_models):
-    model.to(device)
+    # model.to(device)
 
+    # with dataloader
     detections_ds = run_detector_on_dataloader(
         model=model,
         dataloader=val_dataloader,
         device=device,
     )
-    detections_ds = add_bboxes_min_max_corners(detections_ds)  
+
+    # # compare with dataset
+    # detections_ds = run_detector_on_dataset(
+    #     model=model,
+    #     dataset=val_dataset,
+    #     device=device,
+    # )
+
+    detections_ds = add_bboxes_min_max_corners(detections_ds)
     # -- this could be done after concatenating if we are not tracking
     list_detections_ds.append(detections_ds)
 
@@ -330,9 +340,9 @@ def plot_and_save_ensemble_detections(
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Fuse detections across models
-confidence_th_post_fusion = 0.7
+confidence_th_post_fusion = 0.0
 fused_detections_ds = combine_detections_across_models_wbf(
-    all_models_detections_ds.sel(model=[1, 2, 3, 4, 5]),
+    all_models_detections_ds.sel(model=[1,2,3,4,5]),  # before: [1,2,3,4,5]
     kwargs_wbf={
         "iou_thr_ensemble": 0.5,
         "skip_box_thr": 0.0001,
@@ -401,7 +411,6 @@ def plot_and_save_ensemble_detections(
 val_ds = torch_dataset_to_xr_dataset(val_dataset)
 
 
-
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Evaluate ensemble model
 fused_detections_ds, gt_bboxes_val_ds = compute_precision_recall_ds(
@@ -410,6 +419,7 @@ def plot_and_save_ensemble_detections(
     iou_threshold=0.1,  # change to 0.5?
 )
 
+
 print(
     f"Ensemble model with confidence threshold post fusion: {confidence_th_post_fusion}"
 )
@@ -484,7 +494,11 @@ def plot_and_save_ensemble_detections(
 bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
 
 grouped_ds = fused_detections_ds.groupby_bins(
-    "confidence", bin_edges, restore_coord_dims=True
+    "confidence",
+    bin_edges,
+    restore_coord_dims=True,
+    right=False,
+    include_lowest=True,
 )
 print(grouped_ds)
 # grouped_ds = list_detections_ds_eval[0].groupby_bins(
@@ -501,8 +515,8 @@ def plot_and_save_ensemble_detections(
 fig, ax = plt.subplots(1, 1, figsize=(10, 6))
 ax.bar(
     bin_centers,
-    # [0,] + [g[1].tp.shape[0] for g in list(grouped_ds)],
-    [g[1].tp.shape[0] for g in list(grouped_ds)],
+    [0,] + [g[1].tp.shape[0] for g in list(grouped_ds)],
+    # [g[1].tp.shape[0] for g in list(grouped_ds)],
     width=bin_edges[1] - bin_edges[0],
     color="skyblue",
     edgecolor="gray",
@@ -512,8 +526,8 @@ def plot_and_save_ensemble_detections(
 ax.grid(True, alpha=0.3)
 ax.set_xlim(0, 1)
 
-# %%
-# plot precision per bin
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Plot calibration curve
 
 
 def compute_precision(ds_one_bin):
@@ -522,13 +536,14 @@ def compute_precision(ds_one_bin):
 
 # grouped_ds.apply(compute_precision) throws an error
 
-# %%
+# plot precision per bin
 fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+
 # show bar edges
 ax.bar(
     0.5 * (bin_edges[:-1] + bin_edges[1:]),
-    # [0,] + [compute_precision(g[1]) for g in list(grouped_ds)],
-    [compute_precision(g[1]) for g in list(grouped_ds)],
+    [0,] + [compute_precision(g[1]) for g in list(grouped_ds)],
+    # [compute_precision(g[1]) for g in list(grouped_ds)],
     width=bin_edges[1] - bin_edges[0],
     color="skyblue",
     edgecolor="gray",

From 3fd947954af3a5a407b94dd60a51a58533c7cc00 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 1 Sep 2025 12:44:32 +0100
Subject: [PATCH 68/72] Use botsort for ensemble detections

---
 ethology/detectors/ensembles.py             |  6 ++---
 ethology/detectors/inference.py             |  2 ++
 ethology/detectors/utils.py                 |  2 +-
 notebooks/notebook_run_ensemble_on_video.py | 25 +++++++++++++--------
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/ethology/detectors/ensembles.py b/ethology/detectors/ensembles.py
index d98359d1..9faa5db2 100644
--- a/ethology/detectors/ensembles.py
+++ b/ethology/detectors/ensembles.py
@@ -4,9 +4,7 @@
 import xarray as xr
 from ensemble_boxes import weighted_boxes_fusion
 
-from ethology.detectors.utils import (
-    detections_x1y1_x2y2_as_da_tuple,
-)
+from ethology.detectors.utils import x1y1_x2y2_as_da_tuple
 
 # def soft_nms_wrapper_arrays(
 #     bboxes_x1y1: np.ndarray,
@@ -233,7 +231,7 @@ def wbf_wrapper_arrays(
     )
 
     # Format output as xarray dataarrays
-    centroid, shape, confidence, label = detections_x1y1_x2y2_as_da_tuple(
+    centroid, shape, confidence, label = x1y1_x2y2_as_da_tuple(
         ensemble_x1y2_x2y2_scores_labels[:, 0:4],
         ensemble_x1y2_x2y2_scores_labels[:, 4],
         ensemble_x1y2_x2y2_scores_labels[:, 5],
diff --git a/ethology/detectors/inference.py b/ethology/detectors/inference.py
index 25c43641..faffe4b6 100644
--- a/ethology/detectors/inference.py
+++ b/ethology/detectors/inference.py
@@ -27,6 +27,7 @@ def run_detector_on_dataset(
     """
     # Ensure model is in evaluation mode
     model.eval()
+    model.to(device)
 
     # Run detection for each sample in the dataset
     list_detections_ds = []
@@ -76,6 +77,7 @@ def run_detector_on_dataloader(
     """
     # Ensure model is in evaluation mode
     model.eval()
+    model.to(device)
 
     # Run detection for each sample in the dataset
     list_detections_ds: list[xr.Dataset] = []
diff --git a/ethology/detectors/utils.py b/ethology/detectors/utils.py
index fea092b5..8aae0105 100644
--- a/ethology/detectors/utils.py
+++ b/ethology/detectors/utils.py
@@ -44,7 +44,7 @@ def detections_dict_as_ds(
     if isinstance(detections, dict):
         return _detections_dict_as_ds(detections)
     elif isinstance(detections, list):
-        return [detections_dict_as_ds(det) for det in detections]
+        return [_detections_dict_as_ds(det) for det in detections]
     else:
         raise ValueError(
             "Detections must be a dictionary or list of dictionaries"
diff --git a/notebooks/notebook_run_ensemble_on_video.py b/notebooks/notebook_run_ensemble_on_video.py
index 5b75a457..973b22be 100644
--- a/notebooks/notebook_run_ensemble_on_video.py
+++ b/notebooks/notebook_run_ensemble_on_video.py
@@ -56,7 +56,9 @@
     "above_50th": ml_runs_experiment_dir / "daa05ded0ea047388c9134bf044061c5",
 }
 
-output_dir = Path("/home/sminano/swc/project_ethology/ensemble_tracking_output")
+output_dir = Path(
+    "/home/sminano/swc/project_ethology/ensemble_tracking_output"
+)
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Create output directory
@@ -208,7 +210,6 @@ def run_detector_on_video(
     inference_transforms: transforms.Compose,
 ) -> xr.Dataset:
     """Run detection on a video."""
-
     # Ensure model is in evaluation mode
     model.eval()
 
@@ -301,7 +302,7 @@ def run_detector_on_video(
 )
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Format detections as a movement dataset
+# Format detections as a movement dataset and export
 
 # add id coordinate (FIX this)
 fused_detections_ds = fused_detections_ds.assign_coords(
@@ -321,28 +322,34 @@ def run_detector_on_video(
 )
 
 
-
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Track detections using boxmot
-
 # Initialize the tracker
 # tracker = BotSort(
 #     reid_weights=Path("osnet_x0_25_msmt17.pt"),  # Path to ReID model
-#     device=device,  # "0" # why not device? why is this in GPU if we then copy to CPU?
+#     device=device,
 #     half=False,
+#     track_high_thresh=0,  # already filtered by confidence_th_post_fusion
+#     track_low_thresh=0,  # already filtered by confidence_th_post_fusion
+#     new_track_thresh=0,  # already filtered by confidence_th_post_fusion
+#     track_buffer=1000,  # frames to keep a track alive after last detection
+#     match_thresh=0.8,  # default 0.8
+#     proximity_thresh=0.1,  # IoU threshold for first-round association
+#     frame_rate=30,
+#     with_reid=False,
 # )
 
 tracker = BoostTrack(
     reid_weights=Path("osnet_x0_25_msmt17.pt"),
     device=device,
     half=False,
-    max_age=1000, # frames
+    max_age=1000,  # frames
     min_hits=1,
     det_thresh=0,  # already filtered by confidence_th_post_fusion
     iou_threshold=0.1,  # for association
     aspect_ratio_thresh=1000,
-    min_box_area=0, # no minimum box area
-)
+    min_box_area=0,  # no minimum box area
+)  # seems better?
 
 
 # %%

From 777cc3db8208fb8d8ca5b01c000e8858c32a915e Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 1 Sep 2025 12:45:01 +0100
Subject: [PATCH 69/72] add boxmot as dep

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 5061bf16..a10292d4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
   "netCDF4",
   "torch",
   "ensemble-boxes",
+  "boxmot",
 ]
 
 [project.urls]

From 96daf683a4c7aae3064589830bdf48196039f725 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 1 Sep 2025 11:45:33 +0000
Subject: [PATCH 70/72] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ethology/detectors/evaluate.py                     |  3 +--
 notebooks/notebook_evaluate_binned_performance.py  |  2 +-
 notebooks/notebook_run_ensemble_on_eval_dataset.py | 14 ++++++++++----
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/ethology/detectors/evaluate.py b/ethology/detectors/evaluate.py
index e59ffbbe..b5bc5151 100644
--- a/ethology/detectors/evaluate.py
+++ b/ethology/detectors/evaluate.py
@@ -219,8 +219,7 @@ def compute_precision_recall_ds(
 
     # Compute precision and recall per image
     precision_per_img = pred_bboxes_ds.tp.sum(dim="id") / (
-        pred_bboxes_ds.tp.sum(dim="id")
-        + pred_bboxes_ds.fp.sum(dim="id")
+        pred_bboxes_ds.tp.sum(dim="id") + pred_bboxes_ds.fp.sum(dim="id")
     )
     recall_per_img = pred_bboxes_ds.tp.sum(dim="id") / (
         pred_bboxes_ds.tp.sum(dim="id") + gt_bboxes_ds.md.sum(dim="id")
diff --git a/notebooks/notebook_evaluate_binned_performance.py b/notebooks/notebook_evaluate_binned_performance.py
index 8539db15..240feb20 100644
--- a/notebooks/notebook_evaluate_binned_performance.py
+++ b/notebooks/notebook_evaluate_binned_performance.py
@@ -442,7 +442,7 @@ def plot_missed_detections_per_bin(
         Path(cli_args["annotation_files"][0]).name
         == "VIA_JSON_combined_coco_gen.json"
     ):
-        # need to use the old annotations file because the new one has 
+        # need to use the old annotations file because the new one has
         # different image IDs
         dataset_coco = create_coco_dataset(
             images_dir=Path(dataset_dir) / "frames",
diff --git a/notebooks/notebook_run_ensemble_on_eval_dataset.py b/notebooks/notebook_run_ensemble_on_eval_dataset.py
index 931d2d20..cc176d7b 100644
--- a/notebooks/notebook_run_ensemble_on_eval_dataset.py
+++ b/notebooks/notebook_run_ensemble_on_eval_dataset.py
@@ -279,7 +279,7 @@ def plot_and_save_ensemble_detections(
     split_dataset_crab_repo(
         dataset_coco,
         seed_n=ref_cli_args["seed_n"],
-        config=ref_config, 
+        config=ref_config,
         # if Aug dataset, use:
         # config={
         #     "train_fraction": 0.0,
@@ -342,7 +342,7 @@ def plot_and_save_ensemble_detections(
 # Fuse detections across models
 confidence_th_post_fusion = 0.0
 fused_detections_ds = combine_detections_across_models_wbf(
-    all_models_detections_ds.sel(model=[1,2,3,4,5]),  # before: [1,2,3,4,5]
+    all_models_detections_ds.sel(model=[1, 2, 3, 4, 5]),  # before: [1,2,3,4,5]
     kwargs_wbf={
         "iou_thr_ensemble": 0.5,
         "skip_box_thr": 0.0001,
@@ -515,7 +515,10 @@ def plot_and_save_ensemble_detections(
 fig, ax = plt.subplots(1, 1, figsize=(10, 6))
 ax.bar(
     bin_centers,
-    [0,] + [g[1].tp.shape[0] for g in list(grouped_ds)],
+    [
+        0,
+    ]
+    + [g[1].tp.shape[0] for g in list(grouped_ds)],
     # [g[1].tp.shape[0] for g in list(grouped_ds)],
     width=bin_edges[1] - bin_edges[0],
     color="skyblue",
@@ -542,7 +545,10 @@ def compute_precision(ds_one_bin):
 # show bar edges
 ax.bar(
     0.5 * (bin_edges[:-1] + bin_edges[1:]),
-    [0,] + [compute_precision(g[1]) for g in list(grouped_ds)],
+    [
+        0,
+    ]
+    + [compute_precision(g[1]) for g in list(grouped_ds)],
     # [compute_precision(g[1]) for g in list(grouped_ds)],
     width=bin_edges[1] - bin_edges[0],
     color="skyblue",

From a6abeaeadaa4d62c509d592c9839ccc8c731d087 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 1 Sep 2025 12:47:00 +0100
Subject: [PATCH 71/72] Add detect only notebooks

---
 ethology/trackers/inference.py                |  13 +
 ...otebook_detect_with_ensemble_vectorized.py | 304 ++++++++++++++++++
 .../notebook_run_detection_on_eval_dataset.py |  40 +--
 3 files changed, 326 insertions(+), 31 deletions(-)
 create mode 100644 ethology/trackers/inference.py
 create mode 100644 notebooks/notebook_detect_with_ensemble_vectorized.py

diff --git a/ethology/trackers/inference.py b/ethology/trackers/inference.py
new file mode 100644
index 00000000..6c5eedc4
--- /dev/null
+++ b/ethology/trackers/inference.py
@@ -0,0 +1,13 @@
+from typing import Any
+
+import torch
+import xarray as xr
+
+
+def run_tracker_on_detections_ds(
+    detections_ds: xr.Dataset,
+    tracker: Any,
+    device: torch.device,
+) -> xr.Dataset:
+    """Run tracker on detections dataset."""
+    pass
\ No newline at end of file
diff --git a/notebooks/notebook_detect_with_ensemble_vectorized.py b/notebooks/notebook_detect_with_ensemble_vectorized.py
new file mode 100644
index 00000000..c1842943
--- /dev/null
+++ b/notebooks/notebook_detect_with_ensemble_vectorized.py
@@ -0,0 +1,304 @@
+# %%
+import copy
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torchvision.transforms.v2 as transforms
+import xarray as xr
+from torch import vmap
+from torch.func import functional_call, stack_module_state
+from torch.utils.data import random_split
+
+from ethology.datasets.create import create_coco_dataset
+from ethology.detectors.inference import run_detector_on_dataset
+from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
+from ethology.mlflow import (
+    read_cli_args_from_mlflow_params,
+    read_config_from_mlflow_params,
+    read_mlflow_params,
+)
+
+# Set xarray options
+xr.set_options(display_expand_attrs=False)
+
+
+# %%
+# Input data
+
+dataset_dir = Path("/home/sminano/swc/project_crabs/data/sep2023-full")
+annotations_dir = Path("/home/sminano/swc/project_ethology/large_annotations")
+annotations_file_path = (
+    annotations_dir / "VIA_JSON_combined_coco_gen_sorted_imageIDs.json"
+)
+
+experiment_ID = "617393114420881798"
+ml_runs_experiment_dir = (
+    Path("/home/sminano/swc/project_crabs/ml-runs") / experiment_ID
+)
+
+# I pick seed 42 for each set of models
+models_dict = {
+    # "above_0th": ml_runs_experiment_dir / "f348d9d196934073bece1b877cbc4d38",
+    # "above_1st": ml_runs_experiment_dir / "879d2f77e2b24adcb06b87d2fede6a04",
+    # "above_5th": ml_runs_experiment_dir / "75583ec227e3444ab692b99c64795325",
+    # "above_10th": ml_runs_experiment_dir / "4acc37206b1e4f679d535c837bee2c2f",
+    "above_25th": ml_runs_experiment_dir / "fdcf88fcbcc84fbeb94b45ca6b6f8914",
+    "above_50th": ml_runs_experiment_dir / "daa05ded0ea047388c9134bf044061c5",
+}
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# Set default device: CUDA if available, otherwise mps, otherwise CPU
+device = torch.device(
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+
+print(f"Using device: {device}")
+
+
+# %%
+# Helper functions
+def split_dataset_crab_repo(dataset_coco, seed_n, config):
+    """Split dataset like in crabs repo."""
+    # Create one seeded generator per split (both use the same seed)
+    rng_train_split = torch.Generator().manual_seed(seed_n)
+    rng_val_split = torch.Generator().manual_seed(seed_n)
+
+    # Split train and test-val sets
+    train_dataset, test_val_dataset = random_split(
+        dataset_coco,
+        [config["train_fraction"], 1 - config["train_fraction"]],
+        generator=rng_train_split,
+    )
+
+    # Split test/val sets from the remainder
+    test_dataset, val_dataset = random_split(
+        test_val_dataset,
+        [
+            1 - config["val_over_test_fraction"],
+            config["val_over_test_fraction"],
+        ],
+        generator=rng_val_split,
+    )
+
+    print(f"Seed: {seed_n}")
+    print(f"Number of training samples: {len(train_dataset)}")  # images
+    print(f"Number of validation samples: {len(val_dataset)}")  # images
+    print(f"Number of test samples: {len(test_dataset)}")  # images
+
+    return train_dataset, val_dataset, test_dataset
+
+
+# %%
+# Define list of models in ensemble
+
+list_models = []
+list_config = []
+list_cli_args = []
+for model_key in models_dict:
+    # Retrieve model config and CLI args from mlflow
+    trained_model_path = str(
+        models_dict[model_key] / "checkpoints" / "last.ckpt"
+    )
+
+    mlflow_params = read_mlflow_params(trained_model_path)
+    config = read_config_from_mlflow_params(mlflow_params)
+    cli_args = read_cli_args_from_mlflow_params(mlflow_params)
+
+    # ------------------------------------
+    # Load model
+    model = load_fasterrcnn_resnet50_fpn_v2(
+        trained_model_path,
+        num_classes=config["num_classes"],
+        device=None,  # device
+    )
+    model.eval()
+    list_models.append(model)
+    list_config.append(config)
+    list_cli_args.append(cli_args)
+
+
+# %%
+# Check that all models have the same dataset config
+ref_config = list_config[0]
+for key in ["train_fraction", "val_over_test_fraction"]:
+    assert all(config[key] == ref_config[key] for config in list_config)
+
+ref_cli_args = list_cli_args[0]
+assert all(
+    cli_args["seed_n"] == ref_cli_args["seed_n"] for cli_args in list_cli_args
+)
+
+# %%
+# Define common dataset for ensemble
+
+# Define transforms for inference
+inference_transforms = transforms.Compose(
+    [
+        transforms.ToImage(),
+        transforms.ToDtype(torch.float32, scale=True),
+    ]
+)
+
+# Create COCO dataset
+dataset_coco = create_coco_dataset(
+    images_dir=Path(dataset_dir) / "frames",
+    annotations_file=annotations_file_path,
+    composed_transform=inference_transforms,
+)
+
+# Split dataset like in crabs repo
+train_dataset, val_dataset, test_dataset = split_dataset_crab_repo(
+    dataset_coco,
+    seed_n=ref_cli_args["seed_n"],
+    config=ref_config,  # only uses train_fraction and val_over_test_fraction
+)
+
+
+# %%
+# Check output of a single model
+
+# model = list_models[0]
+# model.to(device)
+
+# img, annot = val_dataset[0]
+
+# with torch.no_grad():
+#     detections = model(img.to(device)[None])
+
+# a list with one dict per element in the batch, each with keys:
+# - boxes: tensor of shape [N, 4]
+# - scores: tensor of shape [N]
+# - labels: tensor of shape [N]
+
+
+# %%
+# Naive
+
+# predictions_val_set_per_model = []
+# with torch.no_grad():
+#     for model in list_models:
+#         model.to(device)
+#         predictions_val_set_per_model.append(
+#             [model(img.to(device)[None])[0] for img, _annot in val_dataset_subset]
+#         )
+# # [None]  # [1, C, H, W] -- add batch dimension
+
+# %%
+# Vectorized
+
+# set a max number of detections per image
+max_detections_per_image = 200
+
+# stack params and buffers across models
+# Given a list of M nn.Modules of the same class,
+# returns two dictionaries that stack all of their parameters
+# and buffers together, indexed by name
+params, buffers = stack_module_state(list_models)
+
+
+# copy of one model on the "meta" device: base module for functional_call
+single_meta_model = copy.deepcopy(list_models[0]).to("meta")
+
+
+def wrapper_model(params, buffers, img):
+    # Performs a functional call on the module by replacing the
+    # module parameters and buffers with the provided ones.
+    # Returns the result of calling `single_meta_model`
+    list_detection_dicts = functional_call(
+        single_meta_model,
+        (params, buffers),
+        (img,),  # [B, C, H, W]
+        strict=True,
+    )  # one dict per element in the batch
+
+    # pad each output with NaN (labels: -1) up to max_detections_per_image
+    list_detection_dicts_padded = []
+    for detection_dict in list_detection_dicts:
+        n_detections = detection_dict["boxes"].shape[0]
+        detection_dict_padded = {}
+
+        detection_dict_padded["boxes"] = F.pad(
+            detection_dict["boxes"],
+            (0, 0, 0, max_detections_per_image - n_detections),
+            mode="constant",
+            value=np.nan,
+        )
+        detection_dict_padded["scores"] = F.pad(
+            detection_dict["scores"],
+            (0, max_detections_per_image - n_detections),
+            mode="constant",
+            value=np.nan,
+        )
+        detection_dict_padded["labels"] = F.pad(
+            detection_dict["labels"],
+            (0, max_detections_per_image - n_detections),
+            mode="constant",
+            value=-1,
+        )
+
+        list_detection_dicts_padded.append(detection_dict_padded)
+
+    return list_detection_dicts_padded
+
+
+# %%
+# Run wrapper function on single model
+
+model = list_models[0]
+
+model.eval() #
+
+params_one_model = dict(model.named_parameters())
+buffers_one_model = dict(model.named_buffers())
+
+# place on device
+params_one_model = {k: v.to(device) for k, v in params_one_model.items()}
+buffers_one_model = {k: v.to(device) for k, v in buffers_one_model.items()}
+
+# get data
+val_dataset_subset = torch.utils.data.Subset(val_dataset, range(1))
+val_dataset_images = torch.stack(
+    [img.to(device) for img, _annot in val_dataset_subset]
+)
+
+# %%
+out = wrapper_model(params_one_model, buffers_one_model, val_dataset_images)
+# %%
+
+
+
+
+# %%
+# place params and buffers on device
+# (rather than models  + params and buffers)
+params = {k: v.to(device) for k, v in params.items()}
+buffers = {k: v.to(device) for k, v in buffers.items()}
+
+
+# prepare data for vmap
+val_dataset_subset = torch.utils.data.Subset(val_dataset, range(1))
+
+val_dataset_images = torch.stack(
+    [img.to(device) for img, _annot in val_dataset_subset]
+)
+
+
+# %%
+
+# compute predictions using vmap
+# in_dims=(0, 0, None): map `wrapper_model` over dim 0 of the stacked params and buffers; the images are not mapped.
+predictions_val_set_per_model_vmap = vmap(wrapper_model, in_dims=(0, 0, None))(
+    params,
+    buffers,
+    val_dataset_images,
+)
+
+
+
+# %%
diff --git a/notebooks/notebook_run_detection_on_eval_dataset.py b/notebooks/notebook_run_detection_on_eval_dataset.py
index 66f5d96d..0e0010aa 100644
--- a/notebooks/notebook_run_detection_on_eval_dataset.py
+++ b/notebooks/notebook_run_detection_on_eval_dataset.py
@@ -39,12 +39,12 @@
 
 # I pick seed 42 for each set of models
 models_dict = {
-    "above_0th": ml_runs_experiment_dir / "f348d9d196934073bece1b877cbc4d38",
+    # "above_0th": ml_runs_experiment_dir / "f348d9d196934073bece1b877cbc4d38",
     "above_1st": ml_runs_experiment_dir / "879d2f77e2b24adcb06b87d2fede6a04",
-    "above_5th": ml_runs_experiment_dir / "75583ec227e3444ab692b99c64795325",
-    "above_10th": ml_runs_experiment_dir / "4acc37206b1e4f679d535c837bee2c2f",
-    "above_25th": ml_runs_experiment_dir / "fdcf88fcbcc84fbeb94b45ca6b6f8914",
-    "above_50th": ml_runs_experiment_dir / "daa05ded0ea047388c9134bf044061c5",
+    # "above_5th": ml_runs_experiment_dir / "75583ec227e3444ab692b99c64795325",
+    # "above_10th": ml_runs_experiment_dir / "4acc37206b1e4f679d535c837bee2c2f",
+    # "above_25th": ml_runs_experiment_dir / "fdcf88fcbcc84fbeb94b45ca6b6f8914",
+    # "above_50th": ml_runs_experiment_dir / "daa05ded0ea047388c9134bf044061c5",
 }
 
 output_dir = Path(
@@ -175,34 +175,12 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
     detections_ds.attrs["coco_crabs_dataset_split"] = "val"
 
     # ------------------------------------
-    # Save detections dataset
-    detections_ds.to_netcdf(
-        output_dir
-        / f"{model_key}_detections_val_set_seed_{cli_args['seed_n']}_{timestamp}.nc"
-    )
-
-    # # Save evaluation dataset with pickle
-    # with open(
+    # # Save detections dataset
+    # detections_ds.to_netcdf(
     #     output_dir
-    #     / f"{model_key}_evaluation_val_set_seed_{cli_args['seed_n']}_{timestamp}.pkl",
-    #     "wb",
-    # ) as f:
-    #     pickle.dump(val_dataset, f)
+    #     / f"{model_key}_detections_val_set_seed_{cli_args['seed_n']}_{timestamp}.nc"
+    # )
 
-# %%
-# # reshape
-# detections_per_validation_sample = {}
-# for val_idx in range(len(val_dataset)):
-#     detections_dict = detections_dict_per_sample[val_idx]
-#     bboxes_xyxy = detections_dict["boxes"].cpu().numpy()
-
-#     detections_per_validation_sample[val_idx] = {
-#         "bbox_xyxy": bboxes_xyxy,
-#         "bbox_centroids": (bboxes_xyxy[:, 0:2] + bboxes_xyxy[:, 2:4]) / 2,
-#         "bbox_shapes": bboxes_xyxy[:, 2:4] - bboxes_xyxy[:, 0:2],
-#         "bbox_confidences": detections_dict["scores"].cpu().numpy(),
-#         "bbox_labels": detections_dict["labels"].cpu().numpy(),
-#     }
 
 
 # %%%%%%%%%%%%%%%%%%%%%%%

From 5d6095997dafaca9053143745e75a326804e210f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 1 Sep 2025 11:47:57 +0000
Subject: [PATCH 72/72] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ethology/trackers/inference.py                        | 2 +-
 notebooks/notebook_detect_with_ensemble_vectorized.py | 6 +-----
 notebooks/notebook_run_detection_on_eval_dataset.py   | 1 -
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/ethology/trackers/inference.py b/ethology/trackers/inference.py
index 6c5eedc4..a0d52b12 100644
--- a/ethology/trackers/inference.py
+++ b/ethology/trackers/inference.py
@@ -10,4 +10,4 @@ def run_tracker_on_detections_ds(
     device: torch.device,
 ) -> xr.Dataset:
     """Run tracker on detections dataset."""
-    pass
\ No newline at end of file
+    pass
diff --git a/notebooks/notebook_detect_with_ensemble_vectorized.py b/notebooks/notebook_detect_with_ensemble_vectorized.py
index c1842943..0e4a9db1 100644
--- a/notebooks/notebook_detect_with_ensemble_vectorized.py
+++ b/notebooks/notebook_detect_with_ensemble_vectorized.py
@@ -12,7 +12,6 @@
 from torch.utils.data import random_split
 
 from ethology.datasets.create import create_coco_dataset
-from ethology.detectors.inference import run_detector_on_dataset
 from ethology.detectors.load import load_fasterrcnn_resnet50_fpn_v2
 from ethology.mlflow import (
     read_cli_args_from_mlflow_params,
@@ -252,7 +251,7 @@ def wrapper_model(params, buffers, img):
 
 model = list_models[0]
 
-model.eval() #
+model.eval()  #
 
 params_one_model = dict(model.named_parameters())
 buffers_one_model = dict(model.named_buffers())
@@ -272,8 +271,6 @@ def wrapper_model(params, buffers, img):
 # %%
 
 
-
-
 # %%
 # place params and buffers on device
 # (rather than models  + params and buffers)
@@ -300,5 +297,4 @@ def wrapper_model(params, buffers, img):
 )
 
 
-
 # %%
diff --git a/notebooks/notebook_run_detection_on_eval_dataset.py b/notebooks/notebook_run_detection_on_eval_dataset.py
index 0e0010aa..be82aa81 100644
--- a/notebooks/notebook_run_detection_on_eval_dataset.py
+++ b/notebooks/notebook_run_detection_on_eval_dataset.py
@@ -182,7 +182,6 @@ def split_dataset_crab_repo(dataset_coco, seed_n, config):
     # )
 
 
-
 # %%%%%%%%%%%%%%%%%%%%%%%
 # %%time
 # Use dataloader to run detection on validation set