From a2c0be36262b1781f90723da2f6a72457951740e Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 19 Nov 2025 19:43:44 +0000 Subject: [PATCH 01/13] Add detections validator --- ethology/io/annotations/validate.py | 75 ++--------------- ethology/io/detections/validate.py | 48 +++++++++++ ethology/io/validate.py | 121 ++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+), 68 deletions(-) create mode 100644 ethology/io/detections/validate.py create mode 100644 ethology/io/validate.py diff --git a/ethology/io/annotations/validate.py b/ethology/io/annotations/validate.py index 2e00ab92..04b81a60 100644 --- a/ethology/io/annotations/validate.py +++ b/ethology/io/annotations/validate.py @@ -1,13 +1,10 @@ """Validators for annotation files and datasets.""" import json -from collections.abc import Callable -from functools import wraps from pathlib import Path import pandas as pd import pandera.pandas as pa -import xarray as xr from attrs import define, field from pandera.typing import Index @@ -17,6 +14,7 @@ _check_required_keys_in_dict, _get_default_schema, ) +from ethology.io.validate import ValidDataset @define @@ -227,7 +225,7 @@ def _file_contains_unique_image_IDs(self, attribute, value): @define -class ValidBboxesDataset: +class ValidBboxAnnotationsDataset(ValidDataset): """Class for valid ``ethology`` bounding box annotations datasets. It checks that the input dataset has: @@ -239,6 +237,10 @@ class ValidBboxesDataset: ---------- dataset : xarray.Dataset The xarray dataset to validate. + required_dims : set + Set of required dimension names. + required_data_vars : set + Set of required data variable names. 
Raises ------ @@ -254,9 +256,7 @@ class ValidBboxesDataset: """ - dataset: xr.Dataset = field() - - # Minimum requirements for annotations datasets holding bboxes + # Minimum requirements for a bbox dataset holding annotations required_dims: set = field( default={"image_id", "space", "id"}, init=False, @@ -266,32 +266,6 @@ class ValidBboxesDataset: init=False, ) - @dataset.validator - def _check_dataset_type(self, attribute, value): - """Ensure the input is an xarray Dataset.""" - if not isinstance(value, xr.Dataset): - raise TypeError( - f"Expected an xarray Dataset, but got {type(value)}." - ) - - @dataset.validator - def _check_required_data_variables(self, attribute, value): - """Ensure the dataset has all required data variables.""" - missing_vars = self.required_data_vars - set(value.data_vars) - if missing_vars: - raise ValueError( - f"Missing required data variables: {sorted(missing_vars)}" - ) - - @dataset.validator - def _check_required_dimensions(self, attribute, value): - """Ensure the dataset has all required dimensions.""" - missing_dims = self.required_dims - set(value.dims) - if missing_dims: - raise ValueError( - f"Missing required dimensions: {sorted(missing_dims)}" - ) - class ValidBboxesDataFrame(pa.DataFrameModel): """Class for valid bounding boxes intermediate dataframes. @@ -573,38 +547,3 @@ def check_idx_and_annotation_id(cls, df: pd.DataFrame) -> bool: """ return all(df.index == df["annotation_id"]) - - -def _check_output(validator: type): - """Return a decorator that validates the output of a function.""" - - def decorator(function: Callable) -> Callable: - @wraps(function) # to preserve function metadata - def wrapper(*args, **kwargs): - result = function(*args, **kwargs) - validator(result) - return result - - return wrapper - - return decorator - - -def _check_input(validator: type, input_index: int = 0): - """Return a decorator that validates a specific input of a function. - - By default, the first input is validated. 
If the input index is - larger than the number of inputs, no validation is performed. - """ - - def decorator(function: Callable) -> Callable: - @wraps(function) - def wrapper(*args, **kwargs): - if len(args) > input_index: - validator(args[input_index]) - result = function(*args, **kwargs) - return result - - return wrapper - - return decorator diff --git a/ethology/io/detections/validate.py b/ethology/io/detections/validate.py new file mode 100644 index 00000000..7ef6285d --- /dev/null +++ b/ethology/io/detections/validate.py @@ -0,0 +1,48 @@ +"""Validators for detection datasets.""" + +from attrs import define, field + +from ethology.io.validate import ValidDataset + + +@define +class ValidBboxDetectionsDataset(ValidDataset): + """Class for valid ``ethology`` bounding box detections datasets. + + It checks that the input dataset has: + + - ``image_id``, ``space``, ``id`` as dimensions + - ``position``, ``shape`` and ``confidence`` as data variables + + Attributes + ---------- + dataset : xarray.Dataset + The xarray dataset to validate. + required_dims : set + Set of required dimension names. + required_data_vars : set + Set of required data variable names. + + Raises + ------ + TypeError + If the input is not an xarray Dataset. + ValueError + If the dataset is missing required data variables or dimensions. + + Notes + ----- + The dataset can have other data variables and dimensions, but only the + required ones are checked. 
+ + """ + + # Minimum requirements for a bbox dataset holding detections + required_dims: set = field( + default={"image_id", "space", "id"}, + init=False, + ) + required_data_vars: set = field( + default={"position", "shape", "confidence"}, + init=False, + ) diff --git a/ethology/io/validate.py b/ethology/io/validate.py new file mode 100644 index 00000000..22c215f9 --- /dev/null +++ b/ethology/io/validate.py @@ -0,0 +1,121 @@ +"""Utils for validating `ethology` objects.""" + +from abc import ABC, abstractmethod +from collections.abc import Callable +from functools import wraps + +import xarray as xr +from attrs import define, field + + +@define +class ValidDataset(ABC): + """An abstract base class for valid ``ethology`` datasets. + + It checks that the input dataset has: + + - required dimensions + - required data variables + + Subclasses must define ``required_dims`` and ``required_data_vars`` + attributes. + + Attributes + ---------- + dataset : xarray.Dataset + The xarray dataset to validate. + required_dims : set + Set of required dimension names (defined by subclasses). + required_data_vars : set + Set of required data variable names (defined by subclasses). + + Raises + ------ + TypeError + If the input is not an xarray Dataset. + ValueError + If the dataset is missing required data variables or dimensions. + + Notes + ----- + The dataset can have other data variables and dimensions, but only the + required ones are checked. 
+ + """ + + dataset: xr.Dataset = field() + + # Subclasses should override these abstract properties + @property + @abstractmethod + def required_dims(self) -> set: + """Subclasses must provide a required_dims property.""" + pass + + @property + @abstractmethod + def required_data_vars(self) -> set: + """Subclasses must provide a required_data_vars property.""" + pass + + # Validators + @dataset.validator + def _check_dataset_type(self, attribute, value): + """Ensure the input is an xarray Dataset.""" + if not isinstance(value, xr.Dataset): + raise TypeError( + f"Expected an xarray Dataset, but got {type(value)}." + ) + + @dataset.validator + def _check_required_data_variables(self, attribute, value): + """Ensure the dataset has all required data variables.""" + missing_vars = self.required_data_vars - set(value.data_vars) + if missing_vars: + raise ValueError( + f"Missing required data variables: {sorted(missing_vars)}" + ) + + @dataset.validator + def _check_required_dimensions(self, attribute, value): + """Ensure the dataset has all required dimensions.""" + missing_dims = self.required_dims - set(value.dims) + if missing_dims: + raise ValueError( + f"Missing required dimensions: {sorted(missing_dims)}" + ) + + +def _check_output(validator: type): + """Return a decorator that validates the output of a function.""" + + def decorator(function: Callable) -> Callable: + @wraps(function) # to preserve function metadata + def wrapper(*args, **kwargs): + result = function(*args, **kwargs) + validator(result) + return result + + return wrapper + + return decorator + + +def _check_input(validator: type, input_index: int = 0): + """Return a decorator that validates a specific input of a function. + + By default, the first input is validated. If the input index is + larger than the number of inputs, no validation is performed. 
+ """ + + def decorator(function: Callable) -> Callable: + @wraps(function) + def wrapper(*args, **kwargs): + if len(args) > input_index: + validator(args[input_index]) + result = function(*args, **kwargs) + return result + + return wrapper + + return decorator From 9e4b70fb0a758b448760595678c94ae5b033222c Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 15:55:27 +0000 Subject: [PATCH 02/13] Rename validators for consistency --- ethology/io/annotations/load_bboxes.py | 24 ++++++++++-------- ethology/io/annotations/save_bboxes.py | 25 ++++++++++--------- ethology/io/annotations/validate.py | 4 +-- .../test_io_annotations/test_save_bboxes.py | 8 +++--- .../test_io_annotations/test_validators.py | 6 ++--- 5 files changed, 35 insertions(+), 32 deletions(-) diff --git a/ethology/io/annotations/load_bboxes.py b/ethology/io/annotations/load_bboxes.py index d59d0abc..f70de2d4 100644 --- a/ethology/io/annotations/load_bboxes.py +++ b/ethology/io/annotations/load_bboxes.py @@ -11,15 +11,15 @@ from pandera.typing.pandas import DataFrame from ethology.io.annotations.validate import ( - ValidBboxesDataFrame, - ValidBboxesDataset, + ValidBboxAnnotationsDataFrame, + ValidBboxAnnotationsDataset, ValidCOCO, ValidVIA, - _check_output, ) +from ethology.io.validate import _check_output -@_check_output(ValidBboxesDataset) +@_check_output(ValidBboxAnnotationsDataset) def from_files( file_paths: Path | str | list[Path | str], format: Literal["VIA", "COCO"], @@ -138,7 +138,7 @@ def from_files( def _get_map_attributes_from_df( - df: DataFrame[ValidBboxesDataFrame], + df: DataFrame[ValidBboxAnnotationsDataFrame], ) -> tuple[dict, dict]: """Get the map attributes from the dataframe. 
@@ -179,7 +179,7 @@ def _get_map_attributes_from_df( @pa.check_types def _df_from_multiple_files( list_filepaths: list[Path | str], format: Literal["VIA", "COCO"] -) -> DataFrame[ValidBboxesDataFrame]: +) -> DataFrame[ValidBboxAnnotationsDataFrame]: """Read annotations from multiple files as a valid intermediate dataframe. Parameters @@ -242,7 +242,7 @@ def _df_from_multiple_files( @pa.check_types def _df_from_single_file( file_path: Path | str, format: Literal["VIA", "COCO"] -) -> DataFrame[ValidBboxesDataFrame]: +) -> DataFrame[ValidBboxAnnotationsDataFrame]: """Read annotations from a single file as a valid intermediate dataframe. Parameters @@ -374,7 +374,7 @@ def _df_rows_from_valid_VIA_file(file_path: Path) -> list[dict]: else: supercategory, category, category_id = ( - ValidBboxesDataFrame.get_empty_values()[key] + ValidBboxAnnotationsDataFrame.get_empty_values()[key] for key in ["supercategory", "category", "category_id"] ) @@ -428,7 +428,7 @@ def _get_image_shape_attr_as_integer( ValidBboxesDataFrame.get_empty_values(). """ - default_value = ValidBboxesDataFrame.get_empty_values()[ + default_value = ValidBboxAnnotationsDataFrame.get_empty_values()[ f"image_{attr_name}" ] try: @@ -557,7 +557,9 @@ def _df_rows_from_valid_COCO_file(file_path: Path) -> list[dict]: @pa.check_types -def _df_to_xarray_ds(df: DataFrame[ValidBboxesDataFrame]) -> xr.Dataset: +def _df_to_xarray_ds( + df: DataFrame[ValidBboxAnnotationsDataFrame], +) -> xr.Dataset: """Convert a bounding box annotations dataframe to an xarray dataset. 
Parameters @@ -585,7 +587,7 @@ def _df_to_xarray_ds(df: DataFrame[ValidBboxesDataFrame]) -> xr.Dataset: """ # Drop columns if all values in that column are empty - default_values = ValidBboxesDataFrame.get_empty_values() + default_values = ValidBboxAnnotationsDataFrame.get_empty_values() list_empty_cols = [ col for col in default_values if all(df[col] == default_values[col]) ] diff --git a/ethology/io/annotations/save_bboxes.py b/ethology/io/annotations/save_bboxes.py index bf9e09ef..d79d6ed4 100644 --- a/ethology/io/annotations/save_bboxes.py +++ b/ethology/io/annotations/save_bboxes.py @@ -12,16 +12,15 @@ from pandera.typing.pandas import DataFrame from ethology.io.annotations.validate import ( - ValidBboxesDataFrameCOCO, - ValidBboxesDataset, + ValidBboxAnnotationsCOCO, + ValidBboxAnnotationsDataset, ValidCOCO, - _check_input, - _check_output, ) +from ethology.io.validate import _check_input, _check_output -@_check_input(validator=ValidBboxesDataset) -@_check_output(validator=ValidCOCO) # check output is ethology importable +@_check_input(validator=ValidBboxAnnotationsDataset) +@_check_output(validator=ValidCOCO) # check output is ethology-importable def to_COCO_file(dataset: xr.Dataset, output_filepath: str | Path): """Save an ``ethology`` bounding box annotations dataset to a COCO file. @@ -56,11 +55,11 @@ def to_COCO_file(dataset: xr.Dataset, output_filepath: str | Path): return output_filepath -@_check_input(validator=ValidBboxesDataset) +@_check_input(validator=ValidBboxAnnotationsDataset) @pa.check_types def _to_COCO_exportable_df( ds: xr.Dataset, -) -> DataFrame[ValidBboxesDataFrameCOCO]: +) -> DataFrame[ValidBboxAnnotationsCOCO]: """Convert dataset of bounding boxes annotations to a COCO-exportable df. The returned dataframe is validated using ValidBBoxesDataFrameCOCO. 
@@ -98,7 +97,7 @@ def _to_COCO_exportable_df( return df[cols_to_select] -@_check_input(validator=ValidBboxesDataset) +@_check_input(validator=ValidBboxAnnotationsDataset) def _get_raw_df_from_ds(ds: xr.Dataset) -> pd.DataFrame: """Get preliminary dataframe from a dataset of bounding boxes annotations. @@ -164,7 +163,7 @@ def _get_raw_df_from_ds(ds: xr.Dataset) -> pd.DataFrame: @pa.check_types def _add_COCO_data_to_df( df: pd.DataFrame, ds_attrs: dict -) -> DataFrame[ValidBboxesDataFrameCOCO]: +) -> DataFrame[ValidBboxAnnotationsCOCO]: """Add COCO-required data to preliminary dataframe. The input dataframe is obtained from a dataset of bounding boxes @@ -266,7 +265,9 @@ def _add_COCO_data_to_df( @pa.check_types -def _create_COCO_dict(df: DataFrame[ValidBboxesDataFrameCOCO]) -> dict: +def _create_COCO_dict( + df: DataFrame[ValidBboxAnnotationsCOCO], +) -> dict: """Extract COCO dictionary from a COCO-exportable dataframe. Parameters @@ -282,7 +283,7 @@ def _create_COCO_dict(df: DataFrame[ValidBboxesDataFrameCOCO]) -> dict: """ COCO_dict: dict[str, Any] = {} map_columns_to_COCO_fields = ( - ValidBboxesDataFrameCOCO.map_df_columns_to_COCO_fields() + ValidBboxAnnotationsCOCO.map_df_columns_to_COCO_fields() ) for sections in ["images", "categories", "annotations"]: # Extract and rename required columns for this section diff --git a/ethology/io/annotations/validate.py b/ethology/io/annotations/validate.py index 04b81a60..35a82550 100644 --- a/ethology/io/annotations/validate.py +++ b/ethology/io/annotations/validate.py @@ -267,7 +267,7 @@ class ValidBboxAnnotationsDataset(ValidDataset): ) -class ValidBboxesDataFrame(pa.DataFrameModel): +class ValidBboxAnnotationsDataFrame(pa.DataFrameModel): """Class for valid bounding boxes intermediate dataframes. 
We use this dataframe internally as an intermediate step in the process of @@ -396,7 +396,7 @@ def get_empty_values() -> dict: } -class ValidBboxesDataFrameCOCO(pa.DataFrameModel): +class ValidBboxAnnotationsCOCO(pa.DataFrameModel): """Class for COCO-exportable bounding box annotations dataframes. The validation checks the required columns exist and their types are diff --git a/tests/test_unit/test_io_annotations/test_save_bboxes.py b/tests/test_unit/test_io_annotations/test_save_bboxes.py index fb32e978..1cab673c 100644 --- a/tests/test_unit/test_io_annotations/test_save_bboxes.py +++ b/tests/test_unit/test_io_annotations/test_save_bboxes.py @@ -16,7 +16,7 @@ _get_raw_df_from_ds, to_COCO_file, ) -from ethology.io.annotations.validate import ValidBboxesDataFrameCOCO +from ethology.io.annotations.validate import ValidBboxAnnotationsCOCO def read_JSON_as_dict(file_path: str | Path) -> dict: @@ -146,7 +146,7 @@ def _sample_bboxes_df_drop( ).set_index("annotation_id", drop=False) # Validate as COCO-exportable - df = ValidBboxesDataFrameCOCO.validate(df) + df = ValidBboxAnnotationsCOCO.validate(df) # Drop columns if specified if columns_to_drop: @@ -216,7 +216,7 @@ def test_validate_bboxes_df_COCO( df_factory = request.getfixturevalue(df) df = df_factory() with expected_exception as excinfo: - ValidBboxesDataFrameCOCO(df) + ValidBboxAnnotationsCOCO(df) if excinfo: assert expected_error_message in str(excinfo.value) @@ -366,7 +366,7 @@ def test_create_COCO_dict(sample_bboxes_df: Callable): # Check keys in each section map_df_columns_to_coco = copy.deepcopy( - ValidBboxesDataFrameCOCO.map_df_columns_to_COCO_fields() + ValidBboxAnnotationsCOCO.map_df_columns_to_COCO_fields() ) for section, section_mapping in map_df_columns_to_coco.items(): assert all( diff --git a/tests/test_unit/test_io_annotations/test_validators.py b/tests/test_unit/test_io_annotations/test_validators.py index d054da27..84de5793 100644 --- a/tests/test_unit/test_io_annotations/test_validators.py +++ 
b/tests/test_unit/test_io_annotations/test_validators.py @@ -11,7 +11,7 @@ _extract_properties_keys, ) from ethology.io.annotations.validate import ( - ValidBboxesDataset, + ValidBboxAnnotationsDataset, ValidCOCO, ValidVIA, ) @@ -557,7 +557,7 @@ def test_valid_bboxes_dataset_validation( expected_error_message: str, request: pytest.FixtureRequest, ): - """Test ValidBboxesDataset validation with various input scenarios.""" + """Test bbox annotations dataset validation in various input scenarios.""" # Get dataset to validate if isinstance(sample_dataset, str): dataset = request.getfixturevalue(sample_dataset) @@ -566,7 +566,7 @@ def test_valid_bboxes_dataset_validation( # Run validation and check exception with expected_exception as excinfo: - validator = ValidBboxesDataset(dataset=dataset) + validator = ValidBboxAnnotationsDataset(dataset=dataset) if excinfo: error_msg = str(excinfo.value) From 9c68fa4f5a05b55934550938bdeed1f7c277c482 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 16:28:29 +0000 Subject: [PATCH 03/13] Rename fixtures and test to highlight annotations vs detection datasets. 
Add test for bbox detections dataset validation --- tests/fixtures/annotations.py | 19 +- tests/test_unit/test_datasets/test_split.py | 48 ++--- .../test_io_annotations/test_validators.py | 20 +- .../test_io_detections/test_validators.py | 183 ++++++++++++++++++ 4 files changed, 229 insertions(+), 41 deletions(-) create mode 100644 tests/test_unit/test_io_detections/test_validators.py diff --git a/tests/fixtures/annotations.py b/tests/fixtures/annotations.py index 5fd95a8f..fd8efb35 100644 --- a/tests/fixtures/annotations.py +++ b/tests/fixtures/annotations.py @@ -135,8 +135,8 @@ def default_COCO_schema() -> dict: # ----------------- Bboxes dataset validation fixtures ----------------- @pytest.fixture -def valid_bboxes_dataset(): - """Create a valid xarray dataset for bboxes validation.""" +def valid_bbox_annotations_dataset(): + """Create a valid bbox annotations dataset for validation.""" image_ids = [1, 2, 3] annotation_ids = [0, 1, 2] # three per frame space_dims = ["x", "y"] @@ -145,13 +145,13 @@ def valid_bboxes_dataset(): position_data = np.zeros( (len(image_ids), len(space_dims), len(annotation_ids)) ) - shape_data = np.zeros((len(image_ids), len(annotation_ids))) + shape_data = np.copy(position_data) # Create the dataset ds = xr.Dataset( data_vars={ "position": (["image_id", "space", "id"], position_data), - "shape": (["image_id", "id"], shape_data), + "shape": (["image_id", "space", "id"], shape_data), }, coords={ "image_id": image_ids, @@ -164,10 +164,15 @@ def valid_bboxes_dataset(): @pytest.fixture -def valid_bboxes_dataset_extra_vars_and_dims( - valid_bboxes_dataset: xr.Dataset, +def valid_bbox_annotations_dataset_extra_vars_and_dims( + valid_bbox_annotations_dataset: xr.Dataset, ) -> xr.Dataset: - ds = valid_bboxes_dataset.copy(deep=True) + """Create a valid bbox annotations dataset for validation. + + The dataset is valid but contains more variables and dimensions than + the minimum required for a bbox annotations dataset. 
+ """ + ds = valid_bbox_annotations_dataset.copy(deep=True) ds.coords["extra_dim"] = [10, 20, 30] ds["extra_var_1"] = (["image_id"], np.random.rand(len(ds.image_id))) ds["extra_var_2"] = (["id"], np.random.rand(len(ds.id))) diff --git a/tests/test_unit/test_datasets/test_split.py b/tests/test_unit/test_datasets/test_split.py index c9846262..539b448b 100644 --- a/tests/test_unit/test_datasets/test_split.py +++ b/tests/test_unit/test_datasets/test_split.py @@ -23,12 +23,12 @@ def split_at_any_delimiter(text: str, delimiters: list[str]) -> list[str]: @pytest.fixture -def valid_bboxes_dataset_to_split_1(valid_bboxes_dataset): +def valid_bbox_annotations_ds_to_split_1(valid_bbox_annotations_dataset): # We add a `foo` variable to the dataset that is # one-dimensional along the `image_id` dimension to # use for grouping by. # Note: len(valid_bboxes_dataset.image_id) = 3 - ds = valid_bboxes_dataset.copy(deep=True) + ds = valid_bbox_annotations_dataset.copy(deep=True) ds["foo"] = ( ["image_id"], np.array([0, 1, 1]), @@ -37,14 +37,14 @@ def valid_bboxes_dataset_to_split_1(valid_bboxes_dataset): @pytest.fixture -def valid_bboxes_dataset_to_split_2(valid_bboxes_dataset): +def valid_bbox_annotations_ds_to_split_2(valid_bbox_annotations_dataset): # We add a `foo` variable to the dataset that is # one-dimensional along the `image_id` dimension to # use for grouping by. In this case we ensure we have # 3 groups to be able to split using 3 folds (with # GroupKFold we cannot have more folds than groups). 
# Note: len(valid_bboxes_dataset.image_id) = 3 - ds = valid_bboxes_dataset.copy(deep=True) + ds = valid_bbox_annotations_dataset.copy(deep=True) ds["foo"] = ( ["image_id"], np.array([0, 1, 2]), @@ -150,12 +150,12 @@ def test_approximate_subset_sum(inputs, expected_subset_dict): "inputs", [ { - "dataset": "valid_bboxes_dataset_to_split_1", + "dataset": "valid_bbox_annotations_ds_to_split_1", "list_fractions": [0.334, 0.666], "samples_coordinate": "image_id", }, # fractions in increasing order { - "dataset": "valid_bboxes_dataset_to_split_1", + "dataset": "valid_bbox_annotations_ds_to_split_1", "list_fractions": [0.666, 0.334], "samples_coordinate": "image_id", }, # fractions in decreasing order @@ -207,12 +207,12 @@ def test_split_dataset_group_by_apss(inputs, request): "inputs", [ { - "dataset": "valid_bboxes_dataset_to_split_2", + "dataset": "valid_bbox_annotations_ds_to_split_2", "list_fractions": [0.334, 0.666], "samples_coordinate": "image_id", }, # fractions in increasing order { - "dataset": "valid_bboxes_dataset_to_split_2", + "dataset": "valid_bbox_annotations_ds_to_split_2", "list_fractions": [0.666, 0.334], "samples_coordinate": "image_id", }, # fractions in decreasing order @@ -258,10 +258,10 @@ def test_split_dataset_group_by_kfold(inputs, request): ) -def test_split_dataset_group_by_kfold_seed(valid_bboxes_dataset_to_split_2): +def test_split_dataset_group_by_kfold_seed(valid_bbox_annotations_ds_to_split_2): """Test the behaviour of the seed when using the `kfold` method.""" # prepare inputs - dataset = valid_bboxes_dataset_to_split_2 + dataset = valid_bbox_annotations_ds_to_split_2 list_fractions = [0.334, 0.666] samples_coordinate = "image_id" group_by_var = "foo" @@ -313,7 +313,7 @@ def test_split_dataset_group_by_kfold_seed(valid_bboxes_dataset_to_split_2): ], ) def test_split_dataset_group_by( - method, function_to_mock, valid_bboxes_dataset_to_split_1 + method, function_to_mock, valid_bbox_annotations_ds_to_split_1 ): """Test the wrapper function 
dispatches to the appropriate method.""" # Create mock return datasets @@ -322,7 +322,7 @@ def test_split_dataset_group_by( # Patch the internal function and call the wrapper with patch(function_to_mock, return_value=mock_return_value) as mock: _ds_subset_1, _ds_subset_2 = split_dataset_group_by( - dataset=valid_bboxes_dataset_to_split_1, + dataset=valid_bbox_annotations_ds_to_split_1, group_by_var="foo", list_fractions=[0.334, 0.666], samples_coordinate="image_id", @@ -336,8 +336,8 @@ def test_split_dataset_group_by( @pytest.mark.parametrize( "dataset, expected_method", [ - ("valid_bboxes_dataset_to_split_1", "apss"), - ("valid_bboxes_dataset_to_split_2", "kfold"), + ("valid_bbox_annotations_ds_to_split_1", "apss"), + ("valid_bbox_annotations_ds_to_split_2", "kfold"), ], ) def test_split_dataset_group_by_auto(dataset, expected_method, request): @@ -365,12 +365,12 @@ def test_split_dataset_group_by_auto(dataset, expected_method, request): def test_split_dataset_group_by_unknown_method( - valid_bboxes_dataset_to_split_1, + valid_bbox_annotations_ds_to_split_1, ): """Test that an unknown method raises a ValueError.""" with pytest.raises(ValueError, match="Unknown method"): split_dataset_group_by( - dataset=valid_bboxes_dataset_to_split_1, + dataset=valid_bbox_annotations_ds_to_split_1, group_by_var="foo", list_fractions=[0.5, 0.5], method="unknown_method", @@ -381,17 +381,17 @@ def test_split_dataset_group_by_unknown_method( "inputs", [ { - "dataset": "valid_bboxes_dataset_to_split_1", + "dataset": "valid_bbox_annotations_ds_to_split_1", "list_fractions": [0.334, 0.666], "samples_coordinate": "image_id", }, # fractions in increasing order { - "dataset": "valid_bboxes_dataset_to_split_1", + "dataset": "valid_bbox_annotations_ds_to_split_1", "list_fractions": [0.666, 0.334], "samples_coordinate": "image_id", }, # fractions in decreasing order { - "dataset": "valid_bboxes_dataset_to_split_1", + "dataset": "valid_bbox_annotations_ds_to_split_1", "list_fractions": [1 / 3, 
1 / 3, 1 / 3], "samples_coordinate": "image_id", }, # more than two fractions @@ -444,26 +444,26 @@ def test_split_dataset_random(inputs, request): [ ( "auto", - "valid_bboxes_dataset_to_split_1", + "valid_bbox_annotations_ds_to_split_1", # dataset that will trigger auto-selection of apss # with the requested fractions 0.334 and 0.666 "Auto-selected approximate subset-sum method", ), ( "auto", - "valid_bboxes_dataset_to_split_2", + "valid_bbox_annotations_ds_to_split_2", # dataset with 3 groups so kfold method can be used "Using group k-fold method with", ), ( "kfold", - "valid_bboxes_dataset_to_split_2", + "valid_bbox_annotations_ds_to_split_2", # dataset with 3 groups so kfold method can be used "Using group k-fold method with", ), ( "apss", - "valid_bboxes_dataset_to_split_2", + "valid_bbox_annotations_ds_to_split_2", # dataset with 3 groups so apss method can be used "Using approximate subset-sum method with", ), @@ -639,7 +639,7 @@ def test_split_dataset_warning_empty_subset( ): """Test that a warning is thrown when at least one subset is empty.""" # Get dataset to split - ds = request.getfixturevalue("valid_bboxes_dataset_to_split_1") + ds = request.getfixturevalue("valid_bbox_annotations_ds_to_split_1") inputs["dataset"] = ds # We use fractions that will cause an empty subset diff --git a/tests/test_unit/test_io_annotations/test_validators.py b/tests/test_unit/test_io_annotations/test_validators.py index 84de5793..823ea003 100644 --- a/tests/test_unit/test_io_annotations/test_validators.py +++ b/tests/test_unit/test_io_annotations/test_validators.py @@ -460,12 +460,12 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): "sample_dataset, expected_exception, expected_error_message", [ ( - "valid_bboxes_dataset", + "valid_bbox_annotations_dataset", does_not_raise(), "", ), ( - "valid_bboxes_dataset_extra_vars_and_dims", + "valid_bbox_annotations_dataset_extra_vars_and_dims", does_not_raise(), "", ), @@ -542,16 +542,16 @@ def 
test_COCO_non_unique_image_IDs(annotations_test_data: dict): ), ], ids=[ - "valid_bboxes_dataset", - "valid_bboxes_dataset_extra_vars_and_dims", - "invalid_bboxes_dataset_type", - "invalid_bboxes_dataset_missing_data_var", - "invalid_bboxes_dataset_missing_multiple_data_vars", - "invalid_bboxes_dataset_missing_dimension", - "invalid_bboxes_dataset_missing_multiple_dimensions", + "valid_bbox_annotations_dataset", + "valid_bbox_annotations_dataset_extra_vars_and_dims", + "invalid_bbox_annotations_dataset_type", + "invalid_bbox_annotations_dataset_missing_data_var", + "invalid_bbox_annotations_dataset_missing_multiple_data_vars", + "invalid_bbox_annotations_dataset_missing_dimension", + "invalid_bbox_annotations_dataset_missing_multiple_dimensions", ], ) -def test_valid_bboxes_dataset_validation( +def test_validator_bbox_annotations_dataset( sample_dataset: str | dict, expected_exception: pytest.raises, expected_error_message: str, diff --git a/tests/test_unit/test_io_detections/test_validators.py b/tests/test_unit/test_io_detections/test_validators.py new file mode 100644 index 00000000..2878696a --- /dev/null +++ b/tests/test_unit/test_io_detections/test_validators.py @@ -0,0 +1,183 @@ +from contextlib import nullcontext as does_not_raise + +import numpy as np +import pytest +import xarray as xr + +from ethology.io.detections.validate import ValidBboxDetectionsDataset + + +@pytest.fixture +def valid_bbox_detections_dataset(): + """Create a valid bbox detections dataset for validation.""" + image_ids = [1, 2, 3] + annotation_ids = [0, 1, 2] # max 3 bboxes per frame + space_dims = ["x", "y"] + + # Create position, shape and confidence data all zeros + position_data = np.zeros( + (len(image_ids), len(space_dims), len(annotation_ids)) + ) + shape_data = np.copy(position_data) + confidence_data = np.zeros((len(image_ids), len(annotation_ids))) + + # Create the dataset + ds = xr.Dataset( + data_vars={ + "position": (["image_id", "space", "id"], position_data), + "shape": 
(["image_id", "space", "id"], shape_data), + "confidence": (["image_id", "id"], confidence_data), + }, + coords={ + "image_id": image_ids, + "space": ["x", "y"], + "id": annotation_ids, + }, + ) + + return ds + + +@pytest.fixture +def valid_bbox_detections_dataset_extra_vars_and_dims( + valid_bbox_detections_dataset: xr.Dataset, +) -> xr.Dataset: + ds = valid_bbox_detections_dataset.copy(deep=True) + ds.coords["extra_dim"] = [10, 20, 30] + ds["extra_var_1"] = (["image_id"], np.random.rand(len(ds.image_id))) + ds["extra_var_2"] = (["id"], np.random.rand(len(ds.id))) + return ds + + +@pytest.mark.parametrize( + "sample_dataset, expected_exception, expected_error_message", + [ + ( + "valid_bbox_detections_dataset", + does_not_raise(), + "", + ), + ( + "valid_bbox_detections_dataset_extra_vars_and_dims", + does_not_raise(), + "", + ), + ( + {"position": [1, 2, 3], "shape": [4, 5, 6]}, + pytest.raises(TypeError), + "Expected an xarray Dataset, but got <class 'dict'>.", + ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": ["x", "y"], + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + }, + ), + pytest.raises(ValueError), + "Missing required data variables: ['confidence']", + ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": ["x", "y"], + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + }, + ), + pytest.raises(ValueError), + "Missing required data variables: ['confidence', 'shape']", + ), + ( + xr.Dataset( + coords={"image_id": np.arange(3), "id": np.arange(2)}, + data_vars={ + "position": (["image_id", "id"], np.zeros((3, 2))), + "shape": (["image_id", "id"], np.zeros((3, 2))), + "confidence": (["image_id", "id"], np.zeros((3, 2))), + }, + ), + pytest.raises(ValueError), + "Missing required dimensions: ['space']", + ), + ( + xr.Dataset( +
coords={ + "foo": np.arange(3), + "bar": ["x", "y"], + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["foo", "bar", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["foo", "bar", "id"], + np.zeros((3, 2, 2)), + ), + "confidence": ( + ["foo", "id"], + np.zeros((3, 2)), + ), + }, + ), + pytest.raises(ValueError), + "Missing required dimensions: ['image_id', 'space']", + ), + ], + ids=[ + "valid_bbox_detections_dataset", + "valid_bbox_detections_dataset_extra_vars_and_dims", + "invalid_bbox_detections_dataset_type", + "invalid_bbox_detections_dataset_missing_data_var", + "invalid_bbox_detections_missing_multiple_data_vars", + "invalid_bbox_detections_missing_dimension", + "invalid_bbox_detections_missing_multiple_dimensions", + ], +) +def test_validator_bbox_detections_dataset( + sample_dataset: str | dict, + expected_exception: pytest.raises, + expected_error_message: str, + request: pytest.FixtureRequest, +): + """Test bbox detections dataset validation in various input scenarios.""" + # Get dataset to validate + if isinstance(sample_dataset, str): + dataset = request.getfixturevalue(sample_dataset) + else: + dataset = sample_dataset + + # Run validation and check exception + with expected_exception as excinfo: + validator = ValidBboxDetectionsDataset(dataset=dataset) + + if excinfo: + error_msg = str(excinfo.value) + assert error_msg in expected_error_message + else: + assert validator.dataset is dataset + assert validator.required_dims == {"image_id", "space", "id"} + assert validator.required_data_vars == { + "confidence", + "position", + "shape", + } From 78eadb93e68fc288acc0784739dbe95d0a981364 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Nov 2025 16:29:06 +0000 Subject: [PATCH 04/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_unit/test_datasets/test_split.py | 4 +++- 1 file changed, 3 insertions(+), 
1 deletion(-) diff --git a/tests/test_unit/test_datasets/test_split.py b/tests/test_unit/test_datasets/test_split.py index 539b448b..23674cdf 100644 --- a/tests/test_unit/test_datasets/test_split.py +++ b/tests/test_unit/test_datasets/test_split.py @@ -258,7 +258,9 @@ def test_split_dataset_group_by_kfold(inputs, request): ) -def test_split_dataset_group_by_kfold_seed(valid_bbox_annotations_ds_to_split_2): +def test_split_dataset_group_by_kfold_seed( + valid_bbox_annotations_ds_to_split_2, +): """Test the behaviour of the seed when using the `kfold` method.""" # prepare inputs dataset = valid_bbox_annotations_ds_to_split_2 From ccc9ca945ce8c76fa72d456a97b6dc8c651bf02d Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 17:15:03 +0000 Subject: [PATCH 05/13] Extend validator to check minimum dimensions per data variable too. Extend tests --- ethology/io/annotations/validate.py | 11 +-- ethology/io/detections/validate.py | 12 ++-- ethology/io/validate.py | 20 +++++- .../test_io_annotations/test_validators.py | 66 +++++++++++++++--- .../test_io_detections/test_validators.py | 67 +++++++++++++++++-- 5 files changed, 152 insertions(+), 24 deletions(-) diff --git a/ethology/io/annotations/validate.py b/ethology/io/annotations/validate.py index 35a82550..87229879 100644 --- a/ethology/io/annotations/validate.py +++ b/ethology/io/annotations/validate.py @@ -239,8 +239,8 @@ class ValidBboxAnnotationsDataset(ValidDataset): The xarray dataset to validate. required_dims : set Set of required dimension names. - required_data_vars : set - Set of required data variable names. + required_data_vars : dict[str, set] + A dictionary mapping data variable names to their required dimensions. 
Raises ------ @@ -261,8 +261,11 @@ class ValidBboxAnnotationsDataset(ValidDataset): default={"image_id", "space", "id"}, init=False, ) - required_data_vars: set = field( - default={"position", "shape"}, + required_data_vars: dict = field( + default={ + "position": {"image_id", "space", "id"}, + "shape": {"image_id", "space", "id"}, + }, init=False, ) diff --git a/ethology/io/detections/validate.py b/ethology/io/detections/validate.py index 7ef6285d..ceff454b 100644 --- a/ethology/io/detections/validate.py +++ b/ethology/io/detections/validate.py @@ -20,8 +20,8 @@ class ValidBboxDetectionsDataset(ValidDataset): The xarray dataset to validate. required_dims : set Set of required dimension names. - required_data_vars : set - Set of required data variable names. + required_data_vars : dict[str, set] + A dictionary mapping data variable names to their required dimensions. Raises ------ @@ -42,7 +42,11 @@ class ValidBboxDetectionsDataset(ValidDataset): default={"image_id", "space", "id"}, init=False, ) - required_data_vars: set = field( - default={"position", "shape", "confidence"}, + required_data_vars: dict = field( + default={ + "position": {"image_id", "space", "id"}, + "shape": {"image_id", "space", "id"}, + "confidence": {"image_id", "id"}, + }, init=False, ) diff --git a/ethology/io/validate.py b/ethology/io/validate.py index 22c215f9..7c981dce 100644 --- a/ethology/io/validate.py +++ b/ethology/io/validate.py @@ -54,7 +54,7 @@ def required_dims(self) -> set: @property @abstractmethod - def required_data_vars(self) -> set: + def required_data_vars(self) -> dict[str, set]: """Subclasses must provide a required_data_vars property.""" pass @@ -70,7 +70,7 @@ def _check_dataset_type(self, attribute, value): @dataset.validator def _check_required_data_variables(self, attribute, value): """Ensure the dataset has all required data variables.""" - missing_vars = self.required_data_vars - set(value.data_vars) + missing_vars = self.required_data_vars.keys() - 
set(value.data_vars) if missing_vars: raise ValueError( f"Missing required data variables: {sorted(missing_vars)}" @@ -85,6 +85,22 @@ def _check_required_dimensions(self, attribute, value): f"Missing required dimensions: {sorted(missing_dims)}" ) + @dataset.validator + def _check_dimensions_per_data_variable(self, attribute, value): + """Ensure the dataset has all required dimensions.""" + for ( + data_var, + required_dims_in_data_var, + ) in self.required_data_vars.items(): + missing_dims = required_dims_in_data_var - set( + value.data_vars[data_var].coords + ) + if missing_dims: + raise ValueError( + f"Missing required dimensions ({sorted(missing_dims)}) " + f"in data variable '{data_var}'." + ) + def _check_output(validator: type): """Return a decorator that validates the output of a function.""" diff --git a/tests/test_unit/test_io_annotations/test_validators.py b/tests/test_unit/test_io_annotations/test_validators.py index 823ea003..553514ab 100644 --- a/tests/test_unit/test_io_annotations/test_validators.py +++ b/tests/test_unit/test_io_annotations/test_validators.py @@ -469,6 +469,27 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): does_not_raise(), "", ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": np.arange(2), + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["image_id", "space", "id", "foo"], + np.zeros((3, 2, 2, 1)), + ), + }, + ), + does_not_raise(), + "", + ), ( {"position": [1, 2, 3], "shape": [4, 5, 6]}, pytest.raises(TypeError), @@ -540,15 +561,41 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): pytest.raises(ValueError), "Missing required dimensions: ['image_id', 'space']", ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": np.arange(2), + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["image_id", "id"], 
+ np.zeros((3, 2)), + ), + }, + ), + pytest.raises(ValueError), + ( + "Missing required dimensions (['space']) " + "in data variable 'shape'." + ), + ), ], ids=[ - "valid_bbox_annotations_dataset", - "valid_bbox_annotations_dataset_extra_vars_and_dims", - "invalid_bbox_annotations_dataset_type", - "invalid_bbox_annotations_dataset_missing_data_var", - "invalid_bbox_annotations_dataset_missing_multiple_data_vars", - "invalid_bbox_annotations_dataset_missing_dimension", - "invalid_bbox_annotations_dataset_missing_multiple_dimensions", + "valid_bbox_annotations", + "valid_bbox_annotations_extra_vars_and_dims", + "valid_bbox_detections_extra_dims_in_shape_var", + "invalid_bbox_annotations_type", + "invalid_bbox_annotations_missing_data_var", + "invalid_bbox_annotations_missing_multiple_data_vars", + "invalid_bbox_annotations_missing_dimension", + "invalid_bbox_annotations_missing_multiple_dimensions", + "invalid_bbox_annotations_missing_dimension_in_data_var", ], ) def test_validator_bbox_annotations_dataset( @@ -574,4 +621,7 @@ def test_validator_bbox_annotations_dataset( else: assert validator.dataset is dataset assert validator.required_dims == {"image_id", "space", "id"} - assert validator.required_data_vars == {"position", "shape"} + assert validator.required_data_vars == { + "position": {"id", "image_id", "space"}, + "shape": {"id", "image_id", "space"}, + } diff --git a/tests/test_unit/test_io_detections/test_validators.py b/tests/test_unit/test_io_detections/test_validators.py index 2878696a..b26834aa 100644 --- a/tests/test_unit/test_io_detections/test_validators.py +++ b/tests/test_unit/test_io_detections/test_validators.py @@ -62,6 +62,31 @@ def valid_bbox_detections_dataset_extra_vars_and_dims( does_not_raise(), "", ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": np.arange(2), + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["image_id", "space", "id", 
"foo"], + np.zeros((3, 2, 2, 1)), + ), + "confidence": ( + ["image_id", "id"], + np.zeros((3, 2)), + ), + }, + ), + does_not_raise(), + "", + ), ( {"position": [1, 2, 3], "shape": [4, 5, 6]}, pytest.raises(TypeError), @@ -142,15 +167,45 @@ def valid_bbox_detections_dataset_extra_vars_and_dims( pytest.raises(ValueError), "Missing required dimensions: ['image_id', 'space']", ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": np.arange(2), + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["image_id", "id"], + np.zeros((3, 2)), + ), + "confidence": ( + ["image_id", "id"], + np.zeros((3, 2)), + ), + }, + ), + pytest.raises(ValueError), + ( + "Missing required dimensions (['space']) " + "in data variable 'shape'." + ), + ), ], ids=[ - "valid_bbox_detections_dataset", - "valid_bbox_detections_dataset_extra_vars_and_dims", - "invalid_bbox_detections_dataset_type", + "valid_bbox_detections", + "valid_bbox_detections_extra_vars_and_dims", + "valid_bbox_detections_extra_dims_in_shape_var", + "invalid_bbox_detections_type", "invalid_bbox_detections_dataset_missing_data_var", "invalid_bbox_detections_missing_multiple_data_vars", "invalid_bbox_detections_missing_dimension", "invalid_bbox_detections_missing_multiple_dimensions", + "invalid_bbox_detections_missing_dimension_in_data_var", ], ) def test_validator_bbox_detections_dataset( @@ -177,7 +232,7 @@ def test_validator_bbox_detections_dataset( assert validator.dataset is dataset assert validator.required_dims == {"image_id", "space", "id"} assert validator.required_data_vars == { - "confidence", - "position", - "shape", + "position": {"image_id", "space", "id"}, + "shape": {"image_id", "space", "id"}, + "confidence": {"image_id", "id"}, } From 575c56b434fae05a811d5a26af3d796bc4ef740f Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 17:27:45 +0000 Subject: [PATCH 06/13] 
Improve error message --- ethology/io/validate.py | 20 +++++++++++-------- .../test_io_annotations/test_validators.py | 4 ++-- .../test_io_detections/test_validators.py | 4 ++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/ethology/io/validate.py b/ethology/io/validate.py index 7c981dce..d82d2378 100644 --- a/ethology/io/validate.py +++ b/ethology/io/validate.py @@ -88,19 +88,23 @@ def _check_required_dimensions(self, attribute, value): @dataset.validator def _check_dimensions_per_data_variable(self, attribute, value): """Ensure the dataset has all required dimensions.""" - for ( - data_var, - required_dims_in_data_var, - ) in self.required_data_vars.items(): - missing_dims = required_dims_in_data_var - set( + error_messages = [] + for data_var, dims_per_data_var in self.required_data_vars.items(): + missing_dims = dims_per_data_var - set( value.data_vars[data_var].coords ) if missing_dims: - raise ValueError( - f"Missing required dimensions ({sorted(missing_dims)}) " - f"in data variable '{data_var}'." + error_messages.append( + f"data variable '{data_var}' is missing " + f"dimensions {sorted(missing_dims)}" ) + if error_messages: + raise ValueError( + "Some data variables are missing required dimensions:\n - " + + "\n - ".join(error_messages) + ) + def _check_output(validator: type): """Return a decorator that validates the output of a function.""" diff --git a/tests/test_unit/test_io_annotations/test_validators.py b/tests/test_unit/test_io_annotations/test_validators.py index 553514ab..599f942a 100644 --- a/tests/test_unit/test_io_annotations/test_validators.py +++ b/tests/test_unit/test_io_annotations/test_validators.py @@ -581,8 +581,8 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): ), pytest.raises(ValueError), ( - "Missing required dimensions (['space']) " - "in data variable 'shape'." 
+ "Some data variables are missing required dimensions:" + "\n - data variable 'shape' is missing dimensions ['space']" ), ), ], diff --git a/tests/test_unit/test_io_detections/test_validators.py b/tests/test_unit/test_io_detections/test_validators.py index b26834aa..aa4abf9b 100644 --- a/tests/test_unit/test_io_detections/test_validators.py +++ b/tests/test_unit/test_io_detections/test_validators.py @@ -191,8 +191,8 @@ def valid_bbox_detections_dataset_extra_vars_and_dims( ), pytest.raises(ValueError), ( - "Missing required dimensions (['space']) " - "in data variable 'shape'." + "Some data variables are missing required dimensions:" + "\n - data variable 'shape' is missing dimensions ['space']" ), ), ], From bbb9c5b00d0a372460658bed313307a277001289 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 17:30:13 +0000 Subject: [PATCH 07/13] Ignore code cov in abstract base class --- ethology/io/validate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ethology/io/validate.py b/ethology/io/validate.py index d82d2378..4739f32d 100644 --- a/ethology/io/validate.py +++ b/ethology/io/validate.py @@ -50,13 +50,13 @@ class ValidDataset(ABC): @abstractmethod def required_dims(self) -> set: """Subclasses must provide a required_dims property.""" - pass + pass # pragma: no cover @property @abstractmethod def required_data_vars(self) -> dict[str, set]: """Subclasses must provide a required_data_vars property.""" - pass + pass # pragma: no cover # Validators @dataset.validator From d0f8991fef7efbbd2dac6b4322a06e427bfe68dd Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 17:55:19 +0000 Subject: [PATCH 08/13] Factor out validators module --- ethology/io/annotations/load_bboxes.py | 4 +- ethology/io/annotations/save_bboxes.py | 4 +- .../validate.py => validators/annotations.py} | 4 +- .../validate.py => validators/detections.py} | 2 +- 
.../json_schemas/__init__.py | 0 .../json_schemas/schemas/COCO_schema.json | 0 .../json_schemas/schemas/README.md | 0 .../json_schemas/schemas/VIA_schema.json | 0 .../json_schemas/utils.py | 0 .../{io/validate.py => validators/utils.py} | 0 tests/fixtures/annotations.py | 16 -- .../test_io_annotations/test_save_bboxes.py | 2 +- .../test_annotations.py} | 249 +--------------- .../test_detections.py} | 2 +- .../test_validators/test_json_schemas.py | 268 ++++++++++++++++++ 15 files changed, 278 insertions(+), 273 deletions(-) rename ethology/{io/annotations/validate.py => validators/annotations.py} (99%) rename ethology/{io/detections/validate.py => validators/detections.py} (96%) rename ethology/{io/annotations => validators}/json_schemas/__init__.py (100%) rename ethology/{io/annotations => validators}/json_schemas/schemas/COCO_schema.json (100%) rename ethology/{io/annotations => validators}/json_schemas/schemas/README.md (100%) rename ethology/{io/annotations => validators}/json_schemas/schemas/VIA_schema.json (100%) rename ethology/{io/annotations => validators}/json_schemas/utils.py (100%) rename ethology/{io/validate.py => validators/utils.py} (100%) rename tests/test_unit/{test_io_annotations/test_validators.py => test_validators/test_annotations.py} (58%) rename tests/test_unit/{test_io_detections/test_validators.py => test_validators/test_detections.py} (99%) create mode 100644 tests/test_unit/test_validators/test_json_schemas.py diff --git a/ethology/io/annotations/load_bboxes.py b/ethology/io/annotations/load_bboxes.py index f70de2d4..47f01fe4 100644 --- a/ethology/io/annotations/load_bboxes.py +++ b/ethology/io/annotations/load_bboxes.py @@ -10,13 +10,13 @@ import xarray as xr from pandera.typing.pandas import DataFrame -from ethology.io.annotations.validate import ( +from ethology.validators.annotations import ( ValidBboxAnnotationsDataFrame, ValidBboxAnnotationsDataset, ValidCOCO, ValidVIA, ) -from ethology.io.validate import _check_output +from 
ethology.validators.utils import _check_output @_check_output(ValidBboxAnnotationsDataset) diff --git a/ethology/io/annotations/save_bboxes.py b/ethology/io/annotations/save_bboxes.py index d79d6ed4..a21ecf93 100644 --- a/ethology/io/annotations/save_bboxes.py +++ b/ethology/io/annotations/save_bboxes.py @@ -11,12 +11,12 @@ import xarray as xr from pandera.typing.pandas import DataFrame -from ethology.io.annotations.validate import ( +from ethology.validators.annotations import ( ValidBboxAnnotationsCOCO, ValidBboxAnnotationsDataset, ValidCOCO, ) -from ethology.io.validate import _check_input, _check_output +from ethology.validators.utils import _check_input, _check_output @_check_input(validator=ValidBboxAnnotationsDataset) diff --git a/ethology/io/annotations/validate.py b/ethology/validators/annotations.py similarity index 99% rename from ethology/io/annotations/validate.py rename to ethology/validators/annotations.py index 87229879..84653110 100644 --- a/ethology/io/annotations/validate.py +++ b/ethology/validators/annotations.py @@ -8,13 +8,13 @@ from attrs import define, field from pandera.typing import Index -from ethology.io.annotations.json_schemas.utils import ( +from ethology.validators.json_schemas.utils import ( _check_file_is_json, _check_file_matches_schema, _check_required_keys_in_dict, _get_default_schema, ) -from ethology.io.validate import ValidDataset +from ethology.validators.utils import ValidDataset @define diff --git a/ethology/io/detections/validate.py b/ethology/validators/detections.py similarity index 96% rename from ethology/io/detections/validate.py rename to ethology/validators/detections.py index ceff454b..62bb609e 100644 --- a/ethology/io/detections/validate.py +++ b/ethology/validators/detections.py @@ -2,7 +2,7 @@ from attrs import define, field -from ethology.io.validate import ValidDataset +from ethology.validators.utils import ValidDataset @define diff --git a/ethology/io/annotations/json_schemas/__init__.py 
b/ethology/validators/json_schemas/__init__.py similarity index 100% rename from ethology/io/annotations/json_schemas/__init__.py rename to ethology/validators/json_schemas/__init__.py diff --git a/ethology/io/annotations/json_schemas/schemas/COCO_schema.json b/ethology/validators/json_schemas/schemas/COCO_schema.json similarity index 100% rename from ethology/io/annotations/json_schemas/schemas/COCO_schema.json rename to ethology/validators/json_schemas/schemas/COCO_schema.json diff --git a/ethology/io/annotations/json_schemas/schemas/README.md b/ethology/validators/json_schemas/schemas/README.md similarity index 100% rename from ethology/io/annotations/json_schemas/schemas/README.md rename to ethology/validators/json_schemas/schemas/README.md diff --git a/ethology/io/annotations/json_schemas/schemas/VIA_schema.json b/ethology/validators/json_schemas/schemas/VIA_schema.json similarity index 100% rename from ethology/io/annotations/json_schemas/schemas/VIA_schema.json rename to ethology/validators/json_schemas/schemas/VIA_schema.json diff --git a/ethology/io/annotations/json_schemas/utils.py b/ethology/validators/json_schemas/utils.py similarity index 100% rename from ethology/io/annotations/json_schemas/utils.py rename to ethology/validators/json_schemas/utils.py diff --git a/ethology/io/validate.py b/ethology/validators/utils.py similarity index 100% rename from ethology/io/validate.py rename to ethology/validators/utils.py diff --git a/tests/fixtures/annotations.py b/tests/fixtures/annotations.py index fd8efb35..eb3830fa 100644 --- a/tests/fixtures/annotations.py +++ b/tests/fixtures/annotations.py @@ -117,22 +117,6 @@ def small_schema() -> dict: } -@pytest.fixture() -def default_VIA_schema() -> dict: - """Get default VIA schema.""" - from ethology.io.annotations.json_schemas.utils import _get_default_schema - - return _get_default_schema("VIA") - - -@pytest.fixture() -def default_COCO_schema() -> dict: - """Get default COCO schema.""" - from 
ethology.io.annotations.json_schemas.utils import _get_default_schema - - return _get_default_schema("COCO") - - # ----------------- Bboxes dataset validation fixtures ----------------- @pytest.fixture def valid_bbox_annotations_dataset(): diff --git a/tests/test_unit/test_io_annotations/test_save_bboxes.py b/tests/test_unit/test_io_annotations/test_save_bboxes.py index 1cab673c..21a0c60f 100644 --- a/tests/test_unit/test_io_annotations/test_save_bboxes.py +++ b/tests/test_unit/test_io_annotations/test_save_bboxes.py @@ -16,7 +16,7 @@ _get_raw_df_from_ds, to_COCO_file, ) -from ethology.io.annotations.validate import ValidBboxAnnotationsCOCO +from ethology.validators.annotations import ValidBboxAnnotationsCOCO def read_JSON_as_dict(file_path: str | Path) -> dict: diff --git a/tests/test_unit/test_io_annotations/test_validators.py b/tests/test_unit/test_validators/test_annotations.py similarity index 58% rename from tests/test_unit/test_io_annotations/test_validators.py rename to tests/test_unit/test_validators/test_annotations.py index 599f942a..fbcc77ee 100644 --- a/tests/test_unit/test_io_annotations/test_validators.py +++ b/tests/test_unit/test_validators/test_annotations.py @@ -5,12 +5,7 @@ import pytest import xarray as xr -from ethology.io.annotations.json_schemas.utils import ( - _check_required_keys_in_dict, - _check_required_properties_keys, - _extract_properties_keys, -) -from ethology.io.annotations.validate import ( +from ethology.validators.annotations import ( ValidBboxAnnotationsDataset, ValidCOCO, ValidVIA, @@ -101,248 +96,6 @@ def test_validators_invalid_input_files( assert invalid_json_file.name in str(excinfo.value) -@pytest.mark.parametrize( - "schema, expected_properties_keys", - [ - ("small_schema", ["a", "b", "b/b1", "c", "c/c1", "c/c2"]), - ( - "default_VIA_schema", - [ - "_via_attributes", - "_via_attributes/file", - "_via_attributes/region", - "_via_attributes/region/default_options", - "_via_attributes/region/description", - 
"_via_attributes/region/options", - "_via_attributes/region/type", - "_via_data_format_version", - "_via_image_id_list", - "_via_img_metadata", - "_via_img_metadata/file_attributes", - "_via_img_metadata/filename", - "_via_img_metadata/regions", - "_via_img_metadata/regions/region_attributes", - "_via_img_metadata/regions/shape_attributes", - "_via_img_metadata/regions/shape_attributes/height", - "_via_img_metadata/regions/shape_attributes/name", - "_via_img_metadata/regions/shape_attributes/width", - "_via_img_metadata/regions/shape_attributes/x", - "_via_img_metadata/regions/shape_attributes/y", - "_via_img_metadata/size", - "_via_settings", - "_via_settings/core", - "_via_settings/project", - "_via_settings/ui", - ], - ), - ( - "default_COCO_schema", - [ - "annotations", - "annotations/area", - "annotations/bbox", - "annotations/category_id", - "annotations/id", - "annotations/image_id", - "annotations/iscrowd", - "categories", - "categories/id", - "categories/name", - "categories/supercategory", - "images", - "images/file_name", - "images/height", - "images/id", - "images/width", - "info", - "licenses", - ], - ), - ], -) -def test_extract_properties_keys( - schema: dict, - expected_properties_keys: list, - request: pytest.FixtureRequest, -): - """Test the _extract_properties_keys helper function.""" - schema = request.getfixturevalue(schema) - assert _extract_properties_keys(schema) == sorted(expected_properties_keys) - - -@pytest.mark.parametrize( - ( - "list_required_keys, input_dict, additional_message, " - "expected_exception, expected_message" - ), - [ - ( - ["images", "annotations", "categories"], - { - "images": [1, 2, 3], - "annotations": [1, 2, 3], - "categories": [1, 2, 3], - }, - "", - does_not_raise(), - "", - ), # zero missing keys, and all keys map to non-empty values - ( - ["images", "annotations", "categories"], - { - "images": [], - "annotations": [1, 2, 3], - "categories": [1, 2, 3], - }, - "", - pytest.raises(ValueError), - "Empty value(s) 
found for the required key(s) ['images'].", - ), # zero missing keys, but one ("images") maps to empty values - ( - ["images", "annotations", "categories"], - { - "images": [], - "annotations": {}, - "categories": [1, 2, 3], - }, - "", - pytest.raises(ValueError), - ( - "Empty value(s) found for the required key(s) " - "['annotations', 'images']." - ), - ), # zero missing keys, but two keys map to empty values - ( - ["images", "annotations", "categories"], - {"annotations": "", "categories": ""}, - "", - pytest.raises(ValueError), - "Required key(s) ['images'] not found.", - ), # one missing key - ( - ["images", "annotations", "categories"], - {"annotations": ""}, - "", - pytest.raises(ValueError), - "Required key(s) ['categories', 'images'] not found.", - ), # two missing keys - ( - ["images", "annotations", "categories"], - {"annotations": "", "categories": ""}, - "FOO", - pytest.raises(ValueError), - "Required key(s) ['images'] not foundFOO.", - ), # one missing key with additional message for missing keys - ], -) -def test_check_required_keys_in_dict( - list_required_keys: list, - input_dict: dict, - additional_message: str, - expected_exception: pytest.raises, - expected_message: str, -): - """Test the _check_required_keys_in_dict helper function. - - The check verifies that the required keys are defined in the input - dictionary and if they are, it checks that they do not map to empty - values. - """ - with expected_exception as excinfo: - _check_required_keys_in_dict( - list_required_keys, input_dict, additional_message - ) - - # Check error message - if excinfo: - assert expected_message in str(excinfo.value) - - -def test_check_required_properties_keys(small_schema: dict): - """Test the _check_required_keys helper function.""" - # Define a sample schema from "small_schema" - # with a "properties" key missing (e.g. 
"c/c2") - small_schema["properties"]["c"]["properties"].pop("c2") - - # Define required "properties" keys - required_keys = ["a", "b", "c/c2"] - - # Run check - with pytest.raises(ValueError) as excinfo: - _check_required_properties_keys(required_keys, small_schema) - - # Check error message - assert "Required key(s) ['c/c2'] not found in schema" in str(excinfo.value) - - -@pytest.mark.parametrize( - "input_file,", - [ - "VIA_JSON_sample_1.json", - "VIA_JSON_sample_2.json", - ], -) -def test_required_keys_in_provided_VIA_schema( - input_file: str, default_VIA_schema: dict, annotations_test_data: dict -): - """Check the provided VIA schema contains the ValidVIA required keys.""" - # Get required keys from a VIA valid file - filepath = annotations_test_data[input_file] - valid_VIA = ValidVIA(path=filepath) - required_VIA_keys = valid_VIA.required_keys - - # Map required keys to "properties" keys in schema - map_required_to_properties_keys = { - "main": "", - "images": "_via_img_metadata", - "regions": "_via_img_metadata/regions", - "shape_attributes": "_via_img_metadata/regions/shape_attributes", - } - - # Express required keys as required "properties" keys - required_property_keys = [ - val if ky == "main" else f"{map_required_to_properties_keys[ky]}/{val}" - for ky, values in required_VIA_keys.items() - for val in values - ] - - # Run check - _check_required_properties_keys( - required_property_keys, - default_VIA_schema, - ) - - -@pytest.mark.parametrize( - "input_file,", - [ - "COCO_JSON_sample_1.json", - "COCO_JSON_sample_2.json", - ], -) -def test_required_keys_in_provided_COCO_schema( - input_file: str, default_COCO_schema: dict, annotations_test_data: dict -): - """Check the provided COCO schema contains the ValidCOCO required keys.""" - # Get required keys from a COCO valid file - filepath = annotations_test_data[input_file] - valid_COCO = ValidCOCO(path=filepath) - required_COCO_keys = valid_COCO.required_keys - - # Prepare list of required "properties" 
keys with full paths - required_properties_keys = [ - f"{level}/{ky}" if level != "main" else ky - for level, required_keys in required_COCO_keys.items() - for ky in required_keys - ] - - # Run check - _check_required_properties_keys( - required_properties_keys, - default_COCO_schema, - ) - - @pytest.mark.parametrize( "validator, input_file, expected_exception", [ diff --git a/tests/test_unit/test_io_detections/test_validators.py b/tests/test_unit/test_validators/test_detections.py similarity index 99% rename from tests/test_unit/test_io_detections/test_validators.py rename to tests/test_unit/test_validators/test_detections.py index aa4abf9b..d053d6ef 100644 --- a/tests/test_unit/test_io_detections/test_validators.py +++ b/tests/test_unit/test_validators/test_detections.py @@ -4,7 +4,7 @@ import pytest import xarray as xr -from ethology.io.detections.validate import ValidBboxDetectionsDataset +from ethology.validators.detections import ValidBboxDetectionsDataset @pytest.fixture diff --git a/tests/test_unit/test_validators/test_json_schemas.py b/tests/test_unit/test_validators/test_json_schemas.py new file mode 100644 index 00000000..f496043f --- /dev/null +++ b/tests/test_unit/test_validators/test_json_schemas.py @@ -0,0 +1,268 @@ +from contextlib import nullcontext as does_not_raise + +import pytest + +from ethology.validators.annotations import ValidCOCO, ValidVIA +from ethology.validators.json_schemas.utils import ( + _check_required_keys_in_dict, + _check_required_properties_keys, + _extract_properties_keys, +) + + +@pytest.fixture() +def default_VIA_schema() -> dict: + """Get default VIA schema.""" + from ethology.validators.json_schemas.utils import _get_default_schema + + return _get_default_schema("VIA") + + +@pytest.fixture() +def default_COCO_schema() -> dict: + """Get default COCO schema.""" + from ethology.validators.json_schemas.utils import _get_default_schema + + return _get_default_schema("COCO") + + +@pytest.mark.parametrize( + "schema, 
expected_properties_keys", + [ + ("small_schema", ["a", "b", "b/b1", "c", "c/c1", "c/c2"]), + ( + "default_VIA_schema", + [ + "_via_attributes", + "_via_attributes/file", + "_via_attributes/region", + "_via_attributes/region/default_options", + "_via_attributes/region/description", + "_via_attributes/region/options", + "_via_attributes/region/type", + "_via_data_format_version", + "_via_image_id_list", + "_via_img_metadata", + "_via_img_metadata/file_attributes", + "_via_img_metadata/filename", + "_via_img_metadata/regions", + "_via_img_metadata/regions/region_attributes", + "_via_img_metadata/regions/shape_attributes", + "_via_img_metadata/regions/shape_attributes/height", + "_via_img_metadata/regions/shape_attributes/name", + "_via_img_metadata/regions/shape_attributes/width", + "_via_img_metadata/regions/shape_attributes/x", + "_via_img_metadata/regions/shape_attributes/y", + "_via_img_metadata/size", + "_via_settings", + "_via_settings/core", + "_via_settings/project", + "_via_settings/ui", + ], + ), + ( + "default_COCO_schema", + [ + "annotations", + "annotations/area", + "annotations/bbox", + "annotations/category_id", + "annotations/id", + "annotations/image_id", + "annotations/iscrowd", + "categories", + "categories/id", + "categories/name", + "categories/supercategory", + "images", + "images/file_name", + "images/height", + "images/id", + "images/width", + "info", + "licenses", + ], + ), + ], +) +def test_extract_properties_keys( + schema: dict, + expected_properties_keys: list, + request: pytest.FixtureRequest, +): + """Test the _extract_properties_keys helper function.""" + schema = request.getfixturevalue(schema) + assert _extract_properties_keys(schema) == sorted(expected_properties_keys) + + +@pytest.mark.parametrize( + ( + "list_required_keys, input_dict, additional_message, " + "expected_exception, expected_message" + ), + [ + ( + ["images", "annotations", "categories"], + { + "images": [1, 2, 3], + "annotations": [1, 2, 3], + "categories": [1, 2, 
3], + }, + "", + does_not_raise(), + "", + ), # zero missing keys, and all keys map to non-empty values + ( + ["images", "annotations", "categories"], + { + "images": [], + "annotations": [1, 2, 3], + "categories": [1, 2, 3], + }, + "", + pytest.raises(ValueError), + "Empty value(s) found for the required key(s) ['images'].", + ), # zero missing keys, but one ("images") maps to empty values + ( + ["images", "annotations", "categories"], + { + "images": [], + "annotations": {}, + "categories": [1, 2, 3], + }, + "", + pytest.raises(ValueError), + ( + "Empty value(s) found for the required key(s) " + "['annotations', 'images']." + ), + ), # zero missing keys, but two keys map to empty values + ( + ["images", "annotations", "categories"], + {"annotations": "", "categories": ""}, + "", + pytest.raises(ValueError), + "Required key(s) ['images'] not found.", + ), # one missing key + ( + ["images", "annotations", "categories"], + {"annotations": ""}, + "", + pytest.raises(ValueError), + "Required key(s) ['categories', 'images'] not found.", + ), # two missing keys + ( + ["images", "annotations", "categories"], + {"annotations": "", "categories": ""}, + "FOO", + pytest.raises(ValueError), + "Required key(s) ['images'] not foundFOO.", + ), # one missing key with additional message for missing keys + ], +) +def test_check_required_keys_in_dict( + list_required_keys: list, + input_dict: dict, + additional_message: str, + expected_exception: pytest.raises, + expected_message: str, +): + """Test the _check_required_keys_in_dict helper function. + + The check verifies that the required keys are defined in the input + dictionary and if they are, it checks that they do not map to empty + values. 
+ """ + with expected_exception as excinfo: + _check_required_keys_in_dict( + list_required_keys, input_dict, additional_message + ) + + # Check error message + if excinfo: + assert expected_message in str(excinfo.value) + + +def test_check_required_properties_keys(small_schema: dict): + """Test the _check_required_properties_keys helper function.""" + # Define a sample schema from "small_schema" + # with a "properties" key missing (e.g. "c/c2") + small_schema["properties"]["c"]["properties"].pop("c2") + + # Define required "properties" keys + required_keys = ["a", "b", "c/c2"] + + # Run check + with pytest.raises(ValueError) as excinfo: + _check_required_properties_keys(required_keys, small_schema) + + # Check error message + assert "Required key(s) ['c/c2'] not found in schema" in str(excinfo.value) + + +@pytest.mark.parametrize( + "input_file,", + [ + "VIA_JSON_sample_1.json", + "VIA_JSON_sample_2.json", + ], +) +def test_required_keys_in_provided_VIA_schema( + input_file: str, default_VIA_schema: dict, annotations_test_data: dict +): + """Check the provided VIA schema contains the ValidVIA required keys.""" + # Get required keys from a VIA valid file + filepath = annotations_test_data[input_file] + valid_VIA = ValidVIA(path=filepath) + required_VIA_keys = valid_VIA.required_keys + + # Map required keys to "properties" keys in schema + map_required_to_properties_keys = { + "main": "", + "images": "_via_img_metadata", + "regions": "_via_img_metadata/regions", + "shape_attributes": "_via_img_metadata/regions/shape_attributes", + } + + # Express required keys as required "properties" keys + required_property_keys = [ + val if ky == "main" else f"{map_required_to_properties_keys[ky]}/{val}" + for ky, values in required_VIA_keys.items() + for val in values + ] + + # Run check + _check_required_properties_keys( + required_property_keys, + default_VIA_schema, + ) + + +@pytest.mark.parametrize( + "input_file,", + [ + "COCO_JSON_sample_1.json", + "COCO_JSON_sample_2.json", + ], +)
+def test_required_keys_in_provided_COCO_schema( + input_file: str, default_COCO_schema: dict, annotations_test_data: dict +): + """Check the provided COCO schema contains the ValidCOCO required keys.""" + # Get required keys from a COCO valid file + filepath = annotations_test_data[input_file] + valid_COCO = ValidCOCO(path=filepath) + required_COCO_keys = valid_COCO.required_keys + + # Prepare list of required "properties" keys with full paths + required_properties_keys = [ + f"{level}/{ky}" if level != "main" else ky + for level, required_keys in required_COCO_keys.items() + for ky in required_keys + ] + + # Run check + _check_required_properties_keys( + required_properties_keys, + default_COCO_schema, + ) From f3774ea23d9fe45a3e5ddc801a12ad9d061463da Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 17:58:27 +0000 Subject: [PATCH 09/13] Fix manifest --- MANIFEST.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 6d0b1021..b4a7fbe2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -12,5 +12,5 @@ recursive-exclude examples * recursive-include docs * # Include json schemas -recursive-include ethology/io/annotations/json_schemas/schemas *.json -recursive-include ethology/io/annotations/json_schemas/schemas *.md +recursive-include ethology/validatorss/json_schemas/schemas *.json +recursive-include ethology/validators/json_schemas/schemas *.md From ae1dff364861c45eb1267542dfe20ec05e67e9f3 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 18:02:25 +0000 Subject: [PATCH 10/13] Fix typo in manifest --- MANIFEST.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index b4a7fbe2..c6d258bd 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -12,5 +12,5 @@ recursive-exclude examples * recursive-include docs * # Include json schemas -recursive-include 
ethology/validatorss/json_schemas/schemas *.json +recursive-include ethology/validators/json_schemas/schemas *.json recursive-include ethology/validators/json_schemas/schemas *.md From 2e6ef3af8c455d6646cc5caab0c2b8cdce193f52 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:30:37 +0000 Subject: [PATCH 11/13] Remove docs warnings --- docs/source/_templates/autosummary/class.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/source/_templates/autosummary/class.rst b/docs/source/_templates/autosummary/class.rst index 07889c22..4c075c91 100644 --- a/docs/source/_templates/autosummary/class.rst +++ b/docs/source/_templates/autosummary/class.rst @@ -3,11 +3,9 @@ .. currentmodule:: {{ module }} .. autoclass:: {{ objname }} - :members: - :show-inheritance: - :inherited-members: - :exclude-members: Config - + {% if objname != 'ValidDataset' %}:members:{% endif %} + {% if objname != 'ValidDataset' %}:inherited-members:{% endif %} + {% if objname == 'ValidBboxAnnotationsDataFrame' %}:exclude-members: Config{% endif %} {% block methods %} {% set ns = namespace(has_public_methods=false) %} From 47976f4f5631ef442e3e352a2204395c8da5dbeb Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:54:37 +0000 Subject: [PATCH 12/13] Update to autodoc defaults new syntax and simplify --- docs/source/conf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index a06e7427..6db7d86d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -70,7 +70,7 @@ # Automatically generate stub pages for API autosummary_generate = True autosummary_generate_overwrite = False -autodoc_default_flags = ["members", "inherited-members"] +autodoc_default_options = {"show-inheritance": True} # applies to all classes # Prefix section labels with the document name autosectionlabel_prefix_document = True @@ 
-182,6 +182,10 @@ "pandera": ("https://pandera.readthedocs.io/en/stable/", None), "movement": ("https://movement.neuroinformatics.dev/latest/", None), "sklearn": ("https://scikit-learn.org/stable/", None), + "jsonschema": ( + "https://python-jsonschema.readthedocs.io/en/stable/", + None, + ), } From 10fc95ba45105e2d965cf9dd26a87793446e1795 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:07:09 +0000 Subject: [PATCH 13/13] Review docstrings --- ethology/validators/annotations.py | 24 +++++++++++++++++------- ethology/validators/detections.py | 24 ++++++++++++++++++------ ethology/validators/utils.py | 26 ++++++++++++++++---------- 3 files changed, 51 insertions(+), 23 deletions(-) diff --git a/ethology/validators/annotations.py b/ethology/validators/annotations.py index 84653110..0ecb886c 100644 --- a/ethology/validators/annotations.py +++ b/ethology/validators/annotations.py @@ -228,26 +228,36 @@ def _file_contains_unique_image_IDs(self, attribute, value): class ValidBboxAnnotationsDataset(ValidDataset): """Class for valid ``ethology`` bounding box annotations datasets. - It checks that the input dataset has: + This class validates that the input dataset: + + - is an xarray Dataset, + - has ``image_id``, ``space``, ``id`` as dimensions, + - has ``position`` and ``shape`` as data variables, + - both data variables span at least the dimensions ``image_id``, + ``space`` and ``id``. - - ``image_id``, ``space``, ``id`` as dimensions - - ``position`` and ``shape`` as data variables Attributes ---------- dataset : xarray.Dataset The xarray dataset to validate. - required_dims : set - Set of required dimension names. + required_dims : set[str] + The set of required dimension names: ``image_id``, ``space`` and + ``id``. required_data_vars : dict[str, set] - A dictionary mapping data variable names to their required dimensions. 
+ A dictionary mapping data variable names to their required minimum + dimensions: + + - ``position`` maps to ``image_id``, ``space`` and ``id``, + - ``shape`` maps to ``image_id``, ``space`` and ``id``. Raises ------ TypeError If the input is not an xarray Dataset. ValueError - If the dataset is missing required data variables or dimensions. + If the dataset is missing required data variables or dimensions, + or if any required dimensions are missing for any data variable. Notes ----- diff --git a/ethology/validators/detections.py b/ethology/validators/detections.py index 62bb609e..1f6d9df6 100644 --- a/ethology/validators/detections.py +++ b/ethology/validators/detections.py @@ -9,26 +9,38 @@ class ValidBboxDetectionsDataset(ValidDataset): """Class for valid ``ethology`` bounding box detections datasets. - It checks that the input dataset has: + This class validates that the input dataset: + + - is an xarray Dataset, + - has ``image_id``, ``space``, ``id`` as dimensions, + - has ``position``, ``shape`` and ``confidence`` as data variables, + - ``position`` and ``shape`` span at least the dimensions ``image_id``, + ``space`` and ``id``, + - ``confidence`` spans at least the dimensions ``image_id`` and ``id``. - - ``image_id``, ``space``, ``id`` as dimensions - - ``position``, ``shape`` and ``confidence`` as data variables Attributes ---------- dataset : xarray.Dataset The xarray dataset to validate. required_dims : set - Set of required dimension names. + The set of required dimension names: ``image_id``, ``space`` and + ``id``. required_data_vars : dict[str, set] - A dictionary mapping data variable names to their required dimensions. + A dictionary mapping data variable names to their required minimum + dimensions: + + - ``position`` maps to ``image_id``, ``space`` and ``id``, + - ``shape`` maps to ``image_id``, ``space`` and ``id``, + - ``confidence`` maps to ``image_id`` and ``id``. Raises ------ TypeError If the input is not an xarray Dataset. 
ValueError - If the dataset is missing required data variables or dimensions. + If the dataset is missing required data variables or dimensions, + or if any required dimensions are missing for any data variable. Notes ----- diff --git a/ethology/validators/utils.py b/ethology/validators/utils.py index 4739f32d..ce74a289 100644 --- a/ethology/validators/utils.py +++ b/ethology/validators/utils.py @@ -12,10 +12,12 @@ class ValidDataset(ABC): """An abstract base class for valid ``ethology`` datasets. - It checks that the input dataset has: + This class validates that the input dataset: - - required dimensions - - required data variables + - is an xarray Dataset + - contains all required dimensions + - contains all required data variables + - has the correct dimensions for each data variable Subclasses must define ``required_dims`` and ``required_data_vars`` attributes. @@ -24,17 +26,21 @@ class ValidDataset(ABC): ---------- dataset : xarray.Dataset The xarray dataset to validate. - required_dims : set - Set of required dimension names (defined by subclasses). - required_data_vars : set - Set of required data variable names (defined by subclasses). + required_dims : set[str] + A set of required dimension names. This attribute should be + defined by any subclass inheriting from this class. + required_data_vars : dict[str, set] + A dictionary mapping data variable names to their required dimensions. + This attribute should be defined by any subclass inheriting from + this class. Raises ------ TypeError If the input is not an xarray Dataset. ValueError - If the dataset is missing required data variables or dimensions. + If the dataset is missing required data variables or dimensions, + or if any required dimensions are missing for any data variable. 
Notes ----- @@ -49,13 +55,13 @@ class ValidDataset(ABC): @property @abstractmethod def required_dims(self) -> set: - """Subclasses must provide a required_dims property.""" + """Subclasses must provide a ``required_dims`` property.""" pass # pragma: no cover @property @abstractmethod def required_data_vars(self) -> dict[str, set]: - """Subclasses must provide a required_data_vars property.""" + """Subclasses must provide a ``required_data_vars`` property.""" pass # pragma: no cover # Validators