neuroinformatics-unit · sfmig · Dec 12, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 12, 2025
diff --git a/.gitignore b/.gitignore
@@ -86,3 +86,6 @@ uv.lock
 
 # written by setuptools_scm
 **/_version.py
+
+# uv
+uv.lock
diff --git a/ethology/io/annotations/save_bboxes.py b/ethology/io/annotations/save_bboxes.py
@@ -102,10 +102,7 @@ def _get_raw_df_from_ds(ds: xr.Dataset) -> pd.DataFrame:
     """Get preliminary dataframe from a dataset of bounding boxes annotations.
 
     If the dataset has an "image_shape" array, the returned dataframe
-    will have "image_shape_x" and "image_shape_y" columns. The returned
-    dataframe will have a "category" column, filled with the relevant category
-    values, or filled with -1 if no category array was present in the
-    original dataset.
+    will have "image_shape_x" and "image_shape_y" columns.
 
     The returned dataframe is not COCO-exportable.
 
@@ -128,15 +125,15 @@ def _get_raw_df_from_ds(ds: xr.Dataset) -> pd.DataFrame:
     # (where at least one of the specified columns contains a NaN value.)
     df_raw = df_raw.dropna(subset=["position", "shape"])
 
-    # Add "category" column if not present
-    if "category" not in df_raw.columns:
-        df_raw["category"] = -1
-
     # Pivot the dataframe to get position_x, position_y, shape_x, shape_y, etc.
-    index_cols = ["image_id", "id", "category"]
-    pivot_values = ["position", "shape"]
-    if "image_shape" in df_raw.columns:
-        pivot_values.append("image_shape")
+    # pivot_values: variables with x and y values
+    # index_cols: variables **without** x and y values
+    pivot_values = [
+        c for c in ["position", "shape", "image_shape"] if c in df_raw.columns
+    ]
+    index_cols = [
+        c for c in df_raw.columns if c not in {*pivot_values, "space"}
+    ]
 
     df_raw = df_raw.pivot_table(
         index=index_cols,
@@ -238,17 +235,22 @@ def _add_COCO_data_to_df(
         ]
     )
 
-    # Rename "category" to "category_id" (in dataset it is an integer)
-    # and compute "category" as string from "category_id"
-    map_category_to_str = ds_attrs["map_category_to_str"]
+    # Rename "category" to "category_id"
+    # (in the input dataset "category" is an integer, but in COCO it is a str)
     df.rename(columns={"category": "category_id"}, inplace=True)
-    df["category"] = df["category_id"].map(map_category_to_str)
+    # Compute "category" as a string from "category_id"
+    map_category_to_str = ds_attrs["map_category_to_str"]
+    df["category"] = df["category_id"].map(
+        lambda x: map_category_to_str.get(x, "")
+    )  # set value to "" if category ID is not defined in map_category_to_str
 
-    # supercategory
+    # Set supercategory to empty string if not defined
     if "supercategory" not in df.columns:
         df["supercategory"] = ""
+    else:
+        df["supercategory"] = df["supercategory"].astype(str)
 
-    # other
+    # Set iscrowd always to 0
     df["iscrowd"] = 0
 
     # Set index name and add "annotation_id" as column

diff --git a/ethology/validators/annotations.py b/ethology/validators/annotations.py
@@ -219,9 +219,10 @@ class ValidBboxAnnotationsDataset(ValidDataset):
 
     - is an xarray Dataset,
     - has ``image_id``, ``space``, ``id`` as dimensions,
-    - has ``position`` and ``shape`` as data variables,
-    - both data variables span at least the dimensions ``image_id``,
+    - has ``position``, ``shape`` and ``category`` as data variables,
+    - ``position`` and ``shape`` span at least the dimensions ``image_id``,
       ``space`` and ``id``.
+    - ``category`` spans at least the dimensions ``image_id`` and ``id``.
 
 
     Attributes
@@ -237,6 +238,7 @@ class ValidBboxAnnotationsDataset(ValidDataset):
 
         - ``position`` maps to ``image_id``, ``space`` and ``id``,
         - ``shape`` maps to ``image_id``, ``space`` and ``id``.
+        - ``category`` maps to ``image_id`` and ``id``.
 
     Raises
     ------
@@ -259,6 +261,7 @@ class ValidBboxAnnotationsDataset(ValidDataset):
     required_data_vars: ClassVar[dict[str, set]] = {
         "position": {"image_id", "space", "id"},
         "shape": {"image_id", "space", "id"},
+        "category": {"image_id", "id"},
     }
 
 

diff --git a/ethology/validators/detections.py b/ethology/validators/detections.py
@@ -15,10 +15,12 @@ class ValidBboxDetectionsDataset(ValidDataset):
 
     - is an xarray Dataset,
     - has ``image_id``, ``space``, ``id`` as dimensions,
-    - has ``position``, ``shape`` and ``confidence`` as data variables,
+    - has ``position``, ``shape``, ``category`` and ``confidence`` as data
+      variables,
     - ``position`` and ``shape`` span at least the dimensions ``image_id``,
       ``space`` and ``id``,
-    - ``confidence`` spans at least the dimensions ``image_id`` and ``id``.
+    - ``category`` and ``confidence`` span at least the dimensions
+      ``image_id`` and ``id``.
 
 
     Attributes
@@ -34,6 +36,7 @@ class ValidBboxDetectionsDataset(ValidDataset):
 
         - ``position`` maps to ``image_id``, ``space`` and ``id``,
         - ``shape`` maps to ``image_id``, ``space`` and ``id``,
+        - ``category`` maps to ``image_id`` and ``id``,
         - ``confidence`` maps to ``image_id`` and ``id``.
 
     Raises
@@ -57,5 +60,6 @@ class ValidBboxDetectionsDataset(ValidDataset):
     required_data_vars: ClassVar[dict[str, set]] = {
         "position": {"image_id", "space", "id"},
         "shape": {"image_id", "space", "id"},
+        "category": {"image_id", "id"},
         "confidence": {"image_id", "id"},
     }
diff --git a/tests/fixtures/annotations.py b/tests/fixtures/annotations.py
@@ -130,12 +130,14 @@ def valid_bbox_annotations_dataset():
         (len(image_ids), len(space_dims), len(annotation_ids))
     )
     shape_data = np.copy(position_data)
+    category_data = np.ones((len(image_ids), len(annotation_ids)))
 
     # Create the dataset
     ds = xr.Dataset(
         data_vars={
             "position": (["image_id", "space", "id"], position_data),
             "shape": (["image_id", "space", "id"], shape_data),
+            "category": (["image_id", "id"], category_data),
         },
         coords={
             "image_id": image_ids,

diff --git a/tests/test_unit/test_io_annotations/test_save_bboxes.py b/tests/test_unit/test_io_annotations/test_save_bboxes.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import pandera.pandas as pa
 import pytest
+import xarray as xr
 
 from ethology.io.annotations.load_bboxes import from_files
 from ethology.io.annotations.save_bboxes import (
@@ -238,30 +239,25 @@ def test_validate_bboxes_df_COCO(
 def test_get_raw_df_from_ds(
     annotations_test_data: dict, input_file: str, drop_variables: bool
 ):
-    """Test the function that gets the raw dataframe derived from the xarray
-    dataset fills in the appropriate category values, and includes the image
-    shape columns if present in the original dataset.
+    """Test that the function that computes the raw dataframe from the xarray
+    dataset includes the image shape columns, if they are present in the
+    original dataset.
     """
+    # Read input dataset
     input_file = annotations_test_data[input_file]
     format: Literal["VIA", "COCO"] = (
         "VIA" if "VIA" in str(input_file) else "COCO"
     )
     ds = from_files(input_file, format=format)
 
-    # Drop data arrays if specified
+    # Drop "image_shape" data array if required
     if drop_variables:
-        vars_to_drop = [
-            var
-            for var in ["category", "image_shape"]
-            if var in list(ds.data_vars.keys())
-        ]
-        ds = ds.drop_vars(vars_to_drop)  # type: ignore
+        ds = ds.drop_vars("image_shape")  # type: ignore
 
     # Get raw dataframe
     df_raw = _get_raw_df_from_ds(ds)
 
-    # The "category" column should always be present in the raw dataframe,
-    # even if the category array was not present in the original dataset
+    # List of expected columns
     list_expected_columns = [
         "image_id",
         "id",
@@ -350,6 +346,56 @@ def test_add_COCO_data_to_df(annotations_test_data: dict):
     assert all(df_output["iscrowd"] == 0)
 
 
+def test_add_COCO_data_to_df_empty_category(annotations_test_data):
+    """Test that if the category ID is not included in map_category_to_str
+    the category name is mapped to an empty string.
+    """
+    # Read input file as bboxes dataset
+    input_file = annotations_test_data["small_bboxes_COCO.json"]
+    ds = from_files(input_file, format="COCO")
+
+    # Replace the map from category IDs to strings with one whose only
+    # key is a category ID that is not present in the dataset
+    assert 999 not in ds.map_category_to_str
+    ds.attrs["map_category_to_str"] = {999: "foo"}
+
+    # Get raw dataframe
+    df_raw = _get_raw_df_from_ds(ds)
+
+    # Fill in missing columns with defaults
+    df_output = _add_COCO_data_to_df(df_raw, ds.attrs)
+
+    # Check category name is an empty string
+    assert all(df_output["category"] == "")
+
+
+@pytest.mark.parametrize("supercategory_value", [999, "foo", True])
+def test_add_COCO_data_to_df_empty_supercategory(
+    annotations_test_data, supercategory_value
+):
+    """Test that if defined, the supercategory data variable is cast to str."""
+    # Read input file as bbox annotations dataset
+    input_file = annotations_test_data["small_bboxes_COCO.json"]
+    ds = from_files(input_file, format="COCO")
+
+    # Fill dataset with supercategory as data variable
+    ds["supercategory"] = xr.full_like(
+        ds.category,
+        fill_value=supercategory_value,
+        dtype=object,
+    )
+
+    # Get raw dataframe
+    df_raw = _get_raw_df_from_ds(ds)
+
+    # Fill in missing columns with defaults
+    df_output = _add_COCO_data_to_df(df_raw, ds.attrs)
+
+    # Check supercategory name has expected value and is cast as string
+    assert df_output["supercategory"].apply(lambda x: isinstance(x, str)).all()
+    assert all(df_output["supercategory"] == str(supercategory_value))
+
+
 def test_create_COCO_dict(sample_bboxes_df: Callable):
     """Test the function that transforms the modified bboxes dataframe to
     a COCO dictionary.

diff --git a/tests/test_unit/test_validators/test_annotations.py b/tests/test_unit/test_validators/test_annotations.py
@@ -238,6 +238,10 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict):
                         ["image_id", "space", "id", "foo"],
                         np.zeros((3, 2, 2, 1)),
                     ),
+                    "category": (
+                        ["image_id", "id", "foo"],
+                        np.ones((3, 2, 1)),
+                    ),
                 },
             ),
             does_not_raise(),
@@ -260,6 +264,10 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict):
                         ["image_id", "space", "id"],
                         np.zeros((3, 2, 2)),
                     ),
+                    "category": (
+                        ["image_id", "id"],
+                        np.ones((3, 2)),
+                    ),
                 },
             ),
             pytest.raises(ValueError),
@@ -280,14 +288,18 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict):
                 },
             ),
             pytest.raises(ValueError),
-            "Missing required data variables: ['position', 'shape']",
+            (
+                "Missing required data variables: "
+                "['category', 'position', 'shape']"
+            ),
         ),
         (
             xr.Dataset(
                 coords={"image_id": np.arange(3), "id": np.arange(2)},
                 data_vars={
                     "position": (["image_id", "id"], np.zeros((3, 2))),
                     "shape": (["image_id", "id"], np.zeros((3, 2))),
+                    "category": (["image_id", "id"], np.ones((3, 2))),
                 },
             ),
             pytest.raises(ValueError),
@@ -309,6 +321,10 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict):
                         ["foo", "bar", "id"],
                         np.zeros((3, 2, 2)),
                     ),
+                    "category": (
+                        ["foo", "id"],
+                        np.ones((3, 2)),
+                    ),
                 },
             ),
             pytest.raises(ValueError),
@@ -330,6 +346,10 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict):
                         ["image_id", "id"],
                         np.zeros((3, 2)),
                     ),
+                    "category": (
+                        ["image_id", "id"],
+                        np.ones((3, 2)),
+                    ),
                 },
             ),
             pytest.raises(ValueError),
@@ -377,4 +397,5 @@ def test_validator_bbox_annotations_dataset(
         assert validator.required_data_vars == {
             "position": {"id", "image_id", "space"},
             "shape": {"id", "image_id", "space"},
+            "category": {"id", "image_id"},
         }
-Original file line number
+Diff line change
@@ Expand Up / @@ -86,3 +86,6 @@ uv.lock @@
     # written by setuptools_scm
     **/_version.py
+    # uv
+    uv.lock