diff --git a/.gitignore b/.gitignore index 53fa4926..2e0f2e6e 100644 --- a/.gitignore +++ b/.gitignore @@ -86,3 +86,6 @@ uv.lock # written by setuptools_scm **/_version.py + +# uv +uv.lock diff --git a/ethology/io/annotations/save_bboxes.py b/ethology/io/annotations/save_bboxes.py index a21ecf93..65a3b268 100644 --- a/ethology/io/annotations/save_bboxes.py +++ b/ethology/io/annotations/save_bboxes.py @@ -102,10 +102,7 @@ def _get_raw_df_from_ds(ds: xr.Dataset) -> pd.DataFrame: """Get preliminary dataframe from a dataset of bounding boxes annotations. If the dataset has an "image_shape" array, the returned dataframe - will have "image_shape_x" and "image_shape_y" columns. The returned - dataframe will have a "category" column, filled with the relevant category - values, or filled with -1 if no category array was present in the - original dataset. + will have "image_shape_x" and "image_shape_y" columns. The returned dataframe is not COCO-exportable. @@ -128,15 +125,15 @@ def _get_raw_df_from_ds(ds: xr.Dataset) -> pd.DataFrame: # (where at least one of the specified columns contains a NaN value.) df_raw = df_raw.dropna(subset=["position", "shape"]) - # Add "category" column if not present - if "category" not in df_raw.columns: - df_raw["category"] = -1 - # Pivot the dataframe to get position_x, position_y, shape_x, shape_y, etc. 
- index_cols = ["image_id", "id", "category"] - pivot_values = ["position", "shape"] - if "image_shape" in df_raw.columns: - pivot_values.append("image_shape") + # pivot_values: variables with x and y values + # index_cols: variables **without** x and y values + pivot_values = [ + c for c in ["position", "shape", "image_shape"] if c in df_raw.columns + ] + index_cols = [ + c for c in df_raw.columns if c not in {*pivot_values, "space"} + ] df_raw = df_raw.pivot_table( index=index_cols, @@ -238,17 +235,22 @@ def _add_COCO_data_to_df( ] ) - # Rename "category" to "category_id" (in dataset it is an integer) - # and compute "category" as string from "category_id" - map_category_to_str = ds_attrs["map_category_to_str"] + # Rename "category" to "category_id" + # (in input dataset "category" is an integer, but in COCO it is a str) df.rename(columns={"category": "category_id"}, inplace=True) - df["category"] = df["category_id"].map(map_category_to_str) + # and compute "category" as a string from "category_id" + map_category_to_str = ds_attrs["map_category_to_str"] + df["category"] = df["category_id"].map( + lambda x: map_category_to_str.get(x, "") + ) # set value to "" if category ID is not defined in map_category_to_str - # supercategory + # Set supercategory to empty string if not defined if "supercategory" not in df.columns: df["supercategory"] = "" + else: + df["supercategory"] = df["supercategory"].astype(str) - # other + # Set iscrowd always to 0 df["iscrowd"] = 0 # Set index name and add "annotation_id" as column diff --git a/ethology/validators/annotations.py b/ethology/validators/annotations.py index 427440c3..4507e0ae 100644 --- a/ethology/validators/annotations.py +++ b/ethology/validators/annotations.py @@ -219,9 +219,10 @@ class ValidBboxAnnotationsDataset(ValidDataset): - is an xarray Dataset, - has ``image_id``, ``space``, ``id`` as dimensions, - - has ``position`` and ``shape`` as data variables, - - both data variables span at least the dimensions 
``image_id``, + - has ``position``, ``shape`` and ``category`` as data variables, + - ``position`` and ``shape`` span at least the dimensions ``image_id``, ``space`` and ``id``. + - ``category`` spans at least the dimensions ``image_id`` and ``id``. Attributes @@ -237,6 +238,7 @@ class ValidBboxAnnotationsDataset(ValidDataset): - ``position`` maps to ``image_id``, ``space`` and ``id``, - ``shape`` maps to ``image_id``, ``space`` and ``id``. + - ``category`` maps to ``image_id`` and ``id``. Raises ------ @@ -259,6 +261,7 @@ class ValidBboxAnnotationsDataset(ValidDataset): required_data_vars: ClassVar[dict[str, set]] = { "position": {"image_id", "space", "id"}, "shape": {"image_id", "space", "id"}, + "category": {"image_id", "id"}, } diff --git a/ethology/validators/detections.py b/ethology/validators/detections.py index 8b75f5b2..d46c9802 100644 --- a/ethology/validators/detections.py +++ b/ethology/validators/detections.py @@ -15,10 +15,12 @@ class ValidBboxDetectionsDataset(ValidDataset): - is an xarray Dataset, - has ``image_id``, ``space``, ``id`` as dimensions, - - has ``position``, ``shape`` and ``confidence`` as data variables, + - has ``position``, ``shape``, ``category`` and ``confidence`` as data + variables, - ``position`` and ``shape`` span at least the dimensions ``image_id``, ``space`` and ``id``, - - ``confidence`` spans at least the dimensions ``image_id`` and ``id``. + - ``category`` and ``confidence`` span at least the dimensions + ``image_id`` and ``id``. Attributes @@ -34,6 +36,7 @@ class ValidBboxDetectionsDataset(ValidDataset): - ``position`` maps to ``image_id``, ``space`` and ``id``, - ``shape`` maps to ``image_id``, ``space`` and ``id``, + - ``category`` maps to ``image_id`` and ``id``, - ``confidence`` maps to ``image_id`` and ``id``. 
Raises @@ -57,5 +60,6 @@ class ValidBboxDetectionsDataset(ValidDataset): required_data_vars: ClassVar[dict[str, set]] = { "position": {"image_id", "space", "id"}, "shape": {"image_id", "space", "id"}, + "category": {"image_id", "id"}, "confidence": {"image_id", "id"}, } diff --git a/tests/fixtures/annotations.py b/tests/fixtures/annotations.py index eb3830fa..49cc9415 100644 --- a/tests/fixtures/annotations.py +++ b/tests/fixtures/annotations.py @@ -130,12 +130,14 @@ def valid_bbox_annotations_dataset(): (len(image_ids), len(space_dims), len(annotation_ids)) ) shape_data = np.copy(position_data) + category_data = np.ones((len(image_ids), len(annotation_ids))) # Create the dataset ds = xr.Dataset( data_vars={ "position": (["image_id", "space", "id"], position_data), "shape": (["image_id", "space", "id"], shape_data), + "category": (["image_id", "id"], category_data), }, coords={ "image_id": image_ids, diff --git a/tests/test_unit/test_io_annotations/test_save_bboxes.py b/tests/test_unit/test_io_annotations/test_save_bboxes.py index 21a0c60f..b668c4a0 100644 --- a/tests/test_unit/test_io_annotations/test_save_bboxes.py +++ b/tests/test_unit/test_io_annotations/test_save_bboxes.py @@ -8,6 +8,7 @@ import pandas as pd import pandera.pandas as pa import pytest +import xarray as xr from ethology.io.annotations.load_bboxes import from_files from ethology.io.annotations.save_bboxes import ( @@ -238,30 +239,25 @@ def test_validate_bboxes_df_COCO( def test_get_raw_df_from_ds( annotations_test_data: dict, input_file: str, drop_variables: bool ): - """Test the function that gets the raw dataframe derived from the xarray - dataset fills in the appropriate category values, and includes the image - shape columns if present in the original dataset. + """Test that the function that computes the raw dataframe from the xarray + dataset includes the image shape columns, if they are present in the + original dataset. 
""" + # Read input dataset input_file = annotations_test_data[input_file] format: Literal["VIA", "COCO"] = ( "VIA" if "VIA" in str(input_file) else "COCO" ) ds = from_files(input_file, format=format) - # Drop data arrays if specified + # Drop "image_shape" data array if required if drop_variables: - vars_to_drop = [ - var - for var in ["category", "image_shape"] - if var in list(ds.data_vars.keys()) - ] - ds = ds.drop_vars(vars_to_drop) # type: ignore + ds = ds.drop_vars("image_shape") # type: ignore # Get raw dataframe df_raw = _get_raw_df_from_ds(ds) - # The "category" column should always be present in the raw dataframe, - # even if the category array was not present in the original dataset + # List of expected columns list_expected_columns = [ "image_id", "id", @@ -350,6 +346,56 @@ def test_add_COCO_data_to_df(annotations_test_data: dict): assert all(df_output["iscrowd"] == 0) +def test_add_COCO_data_to_df_empty_category(annotations_test_data): + """Test that if the category ID is not included in map_category_to_str + the category name is mapped to an empty string. 
+ """ + # Read input file as bboxes dataset + input_file = annotations_test_data["small_bboxes_COCO.json"] + ds = from_files(input_file, format="COCO") + + # Replace the category ID -> name map with one whose only + # key is a category ID that is not present in the dataset + assert 999 not in ds.map_category_to_str + ds.attrs["map_category_to_str"] = {999: "foo"} + + # Get raw dataframe + df_raw = _get_raw_df_from_ds(ds) + + # Fill in missing columns with defaults + df_output = _add_COCO_data_to_df(df_raw, ds.attrs) + + # Check category name is an empty string + assert all(df_output["category"] == "") + + +@pytest.mark.parametrize("supercategory_value", [999, "foo", True]) +def test_add_COCO_data_to_df_empty_supercategory( + annotations_test_data, supercategory_value +): + """Test that if defined, the supercategory data variable is cast to str.""" + # Read input file as bbox annotations dataset + input_file = annotations_test_data["small_bboxes_COCO.json"] + ds = from_files(input_file, format="COCO") + + # Fill dataset with supercategory as data variable + ds["supercategory"] = xr.full_like( + ds.category, + fill_value=supercategory_value, + dtype=object, + ) + + # Get raw dataframe + df_raw = _get_raw_df_from_ds(ds) + + # Fill in missing columns with defaults + df_output = _add_COCO_data_to_df(df_raw, ds.attrs) + + # Check supercategory name has expected value and is cast as string + assert df_output["supercategory"].apply(lambda x: isinstance(x, str)).all() + assert all(df_output["supercategory"] == str(supercategory_value)) + + def test_create_COCO_dict(sample_bboxes_df: Callable): """Test the function that transforms the modified bboxes dataframe to a COCO dictionary. 
diff --git a/tests/test_unit/test_validators/test_annotations.py b/tests/test_unit/test_validators/test_annotations.py index fbcc77ee..93a15713 100644 --- a/tests/test_unit/test_validators/test_annotations.py +++ b/tests/test_unit/test_validators/test_annotations.py @@ -238,6 +238,10 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): ["image_id", "space", "id", "foo"], np.zeros((3, 2, 2, 1)), ), + "category": ( + ["image_id", "id", "foo"], + np.ones((3, 2, 1)), + ), }, ), does_not_raise(), @@ -260,6 +264,10 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): ["image_id", "space", "id"], np.zeros((3, 2, 2)), ), + "category": ( + ["image_id", "id"], + np.ones((3, 2)), + ), }, ), pytest.raises(ValueError), @@ -280,7 +288,10 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): }, ), pytest.raises(ValueError), - "Missing required data variables: ['position', 'shape']", + ( + "Missing required data variables: " + "['category', 'position', 'shape']" + ), ), ( xr.Dataset( @@ -288,6 +299,7 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): data_vars={ "position": (["image_id", "id"], np.zeros((3, 2))), "shape": (["image_id", "id"], np.zeros((3, 2))), + "category": (["image_id", "id"], np.ones((3, 2))), }, ), pytest.raises(ValueError), @@ -309,6 +321,10 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): ["foo", "bar", "id"], np.zeros((3, 2, 2)), ), + "category": ( + ["foo", "id"], + np.ones((3, 2)), + ), }, ), pytest.raises(ValueError), @@ -330,6 +346,10 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): ["image_id", "id"], np.zeros((3, 2)), ), + "category": ( + ["image_id", "id"], + np.ones((3, 2)), + ), }, ), pytest.raises(ValueError), @@ -377,4 +397,5 @@ def test_validator_bbox_annotations_dataset( assert validator.required_data_vars == { "position": {"id", "image_id", "space"}, "shape": {"id", "image_id", "space"}, + "category": {"id", "image_id"}, } diff --git 
a/tests/test_unit/test_validators/test_detections.py b/tests/test_unit/test_validators/test_detections.py index d053d6ef..53f704f9 100644 --- a/tests/test_unit/test_validators/test_detections.py +++ b/tests/test_unit/test_validators/test_detections.py @@ -19,6 +19,7 @@ def valid_bbox_detections_dataset(): (len(image_ids), len(space_dims), len(annotation_ids)) ) shape_data = np.copy(position_data) + category_data = np.ones((len(image_ids), len(annotation_ids))) confidence_data = np.zeros((len(image_ids), len(annotation_ids))) # Create the dataset @@ -26,6 +27,7 @@ def valid_bbox_detections_dataset(): data_vars={ "position": (["image_id", "space", "id"], position_data), "shape": (["image_id", "space", "id"], shape_data), + "category": (["image_id", "id"], category_data), "confidence": (["image_id", "id"], confidence_data), }, coords={ @@ -42,6 +44,11 @@ def valid_bbox_detections_dataset(): def valid_bbox_detections_dataset_extra_vars_and_dims( valid_bbox_detections_dataset: xr.Dataset, ) -> xr.Dataset: + """Create a valid bbox detections dataset for validation. + + The dataset is valid but contains more variables and dimensions than + the minimum required for a bbox detections dataset. 
+ """ ds = valid_bbox_detections_dataset.copy(deep=True) ds.coords["extra_dim"] = [10, 20, 30] ds["extra_var_1"] = (["image_id"], np.random.rand(len(ds.image_id))) @@ -78,6 +85,10 @@ def valid_bbox_detections_dataset_extra_vars_and_dims( ["image_id", "space", "id", "foo"], np.zeros((3, 2, 2, 1)), ), + "category": ( + ["image_id", "id"], + np.ones((3, 2)), + ), "confidence": ( ["image_id", "id"], np.zeros((3, 2)), @@ -108,6 +119,10 @@ def valid_bbox_detections_dataset_extra_vars_and_dims( ["image_id", "space", "id"], np.zeros((3, 2, 2)), ), + "category": ( + ["image_id", "id"], + np.ones((3, 2)), + ), }, ), pytest.raises(ValueError), @@ -125,6 +140,10 @@ def valid_bbox_detections_dataset_extra_vars_and_dims( ["image_id", "space", "id"], np.zeros((3, 2, 2)), ), + "category": ( + ["image_id", "id"], + np.ones((3, 2)), + ), }, ), pytest.raises(ValueError), @@ -136,6 +155,7 @@ def valid_bbox_detections_dataset_extra_vars_and_dims( data_vars={ "position": (["image_id", "id"], np.zeros((3, 2))), "shape": (["image_id", "id"], np.zeros((3, 2))), + "category": (["image_id", "id"], np.ones((3, 2))), "confidence": (["image_id", "id"], np.zeros((3, 2))), }, ), @@ -158,6 +178,10 @@ def valid_bbox_detections_dataset_extra_vars_and_dims( ["foo", "bar", "id"], np.zeros((3, 2, 2)), ), + "category": ( + ["foo", "id"], + np.ones((3, 2)), + ), "confidence": ( ["foo", "id"], np.zeros((3, 2)), @@ -183,6 +207,10 @@ def valid_bbox_detections_dataset_extra_vars_and_dims( ["image_id", "id"], np.zeros((3, 2)), ), + "category": ( + ["image_id", "id"], + np.ones((3, 2)), + ), "confidence": ( ["image_id", "id"], np.zeros((3, 2)), @@ -234,5 +262,6 @@ def test_validator_bbox_detections_dataset( assert validator.required_data_vars == { "position": {"image_id", "space", "id"}, "shape": {"image_id", "space", "id"}, + "category": {"image_id", "id"}, "confidence": {"image_id", "id"}, }