diff --git a/MANIFEST.in b/MANIFEST.in index 6d0b1021..c6d258bd 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -12,5 +12,5 @@ recursive-exclude examples * recursive-include docs * # Include json schemas -recursive-include ethology/io/annotations/json_schemas/schemas *.json -recursive-include ethology/io/annotations/json_schemas/schemas *.md +recursive-include ethology/validators/json_schemas/schemas *.json +recursive-include ethology/validators/json_schemas/schemas *.md diff --git a/docs/source/_templates/autosummary/class.rst b/docs/source/_templates/autosummary/class.rst index 07889c22..4c075c91 100644 --- a/docs/source/_templates/autosummary/class.rst +++ b/docs/source/_templates/autosummary/class.rst @@ -3,11 +3,9 @@ .. currentmodule:: {{ module }} .. autoclass:: {{ objname }} - :members: - :show-inheritance: - :inherited-members: - :exclude-members: Config - + {% if objname != 'ValidDataset' %}:members:{% endif %} + {% if objname != 'ValidDataset' %}:inherited-members:{% endif %} + {% if objname == 'ValidBboxAnnotationsDataFrame' %}:exclude-members: Config{% endif %} {% block methods %} {% set ns = namespace(has_public_methods=false) %} diff --git a/docs/source/conf.py b/docs/source/conf.py index a06e7427..6db7d86d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -70,7 +70,7 @@ # Automatically generate stub pages for API autosummary_generate = True autosummary_generate_overwrite = False -autodoc_default_flags = ["members", "inherited-members"] +autodoc_default_options = {"show-inheritance": True} # applies to all classes # Prefix section labels with the document name autosectionlabel_prefix_document = True @@ -182,6 +182,10 @@ "pandera": ("https://pandera.readthedocs.io/en/stable/", None), "movement": ("https://movement.neuroinformatics.dev/latest/", None), "sklearn": ("https://scikit-learn.org/stable/", None), + "jsonschema": ( + "https://python-jsonschema.readthedocs.io/en/stable/", + None, + ), } diff --git 
a/ethology/io/annotations/load_bboxes.py b/ethology/io/annotations/load_bboxes.py index d59d0abc..47f01fe4 100644 --- a/ethology/io/annotations/load_bboxes.py +++ b/ethology/io/annotations/load_bboxes.py @@ -10,16 +10,16 @@ import xarray as xr from pandera.typing.pandas import DataFrame -from ethology.io.annotations.validate import ( - ValidBboxesDataFrame, - ValidBboxesDataset, +from ethology.validators.annotations import ( + ValidBboxAnnotationsDataFrame, + ValidBboxAnnotationsDataset, ValidCOCO, ValidVIA, - _check_output, ) +from ethology.validators.utils import _check_output -@_check_output(ValidBboxesDataset) +@_check_output(ValidBboxAnnotationsDataset) def from_files( file_paths: Path | str | list[Path | str], format: Literal["VIA", "COCO"], @@ -138,7 +138,7 @@ def from_files( def _get_map_attributes_from_df( - df: DataFrame[ValidBboxesDataFrame], + df: DataFrame[ValidBboxAnnotationsDataFrame], ) -> tuple[dict, dict]: """Get the map attributes from the dataframe. @@ -179,7 +179,7 @@ def _get_map_attributes_from_df( @pa.check_types def _df_from_multiple_files( list_filepaths: list[Path | str], format: Literal["VIA", "COCO"] -) -> DataFrame[ValidBboxesDataFrame]: +) -> DataFrame[ValidBboxAnnotationsDataFrame]: """Read annotations from multiple files as a valid intermediate dataframe. Parameters @@ -242,7 +242,7 @@ def _df_from_multiple_files( @pa.check_types def _df_from_single_file( file_path: Path | str, format: Literal["VIA", "COCO"] -) -> DataFrame[ValidBboxesDataFrame]: +) -> DataFrame[ValidBboxAnnotationsDataFrame]: """Read annotations from a single file as a valid intermediate dataframe. 
Parameters @@ -374,7 +374,7 @@ def _df_rows_from_valid_VIA_file(file_path: Path) -> list[dict]: else: supercategory, category, category_id = ( - ValidBboxesDataFrame.get_empty_values()[key] + ValidBboxAnnotationsDataFrame.get_empty_values()[key] for key in ["supercategory", "category", "category_id"] ) @@ -428,7 +428,7 @@ def _get_image_shape_attr_as_integer( ValidBboxesDataFrame.get_empty_values(). """ - default_value = ValidBboxesDataFrame.get_empty_values()[ + default_value = ValidBboxAnnotationsDataFrame.get_empty_values()[ f"image_{attr_name}" ] try: @@ -557,7 +557,9 @@ def _df_rows_from_valid_COCO_file(file_path: Path) -> list[dict]: @pa.check_types -def _df_to_xarray_ds(df: DataFrame[ValidBboxesDataFrame]) -> xr.Dataset: +def _df_to_xarray_ds( + df: DataFrame[ValidBboxAnnotationsDataFrame], +) -> xr.Dataset: """Convert a bounding box annotations dataframe to an xarray dataset. Parameters @@ -585,7 +587,7 @@ def _df_to_xarray_ds(df: DataFrame[ValidBboxesDataFrame]) -> xr.Dataset: """ # Drop columns if all values in that column are empty - default_values = ValidBboxesDataFrame.get_empty_values() + default_values = ValidBboxAnnotationsDataFrame.get_empty_values() list_empty_cols = [ col for col in default_values if all(df[col] == default_values[col]) ] diff --git a/ethology/io/annotations/save_bboxes.py b/ethology/io/annotations/save_bboxes.py index bf9e09ef..a21ecf93 100644 --- a/ethology/io/annotations/save_bboxes.py +++ b/ethology/io/annotations/save_bboxes.py @@ -11,17 +11,16 @@ import xarray as xr from pandera.typing.pandas import DataFrame -from ethology.io.annotations.validate import ( - ValidBboxesDataFrameCOCO, - ValidBboxesDataset, +from ethology.validators.annotations import ( + ValidBboxAnnotationsCOCO, + ValidBboxAnnotationsDataset, ValidCOCO, - _check_input, - _check_output, ) +from ethology.validators.utils import _check_input, _check_output -@_check_input(validator=ValidBboxesDataset) -@_check_output(validator=ValidCOCO) # check output is 
ethology importable +@_check_input(validator=ValidBboxAnnotationsDataset) +@_check_output(validator=ValidCOCO) # check output is ethology-importable def to_COCO_file(dataset: xr.Dataset, output_filepath: str | Path): """Save an ``ethology`` bounding box annotations dataset to a COCO file. @@ -56,11 +55,11 @@ def to_COCO_file(dataset: xr.Dataset, output_filepath: str | Path): return output_filepath -@_check_input(validator=ValidBboxesDataset) +@_check_input(validator=ValidBboxAnnotationsDataset) @pa.check_types def _to_COCO_exportable_df( ds: xr.Dataset, -) -> DataFrame[ValidBboxesDataFrameCOCO]: +) -> DataFrame[ValidBboxAnnotationsCOCO]: """Convert dataset of bounding boxes annotations to a COCO-exportable df. The returned dataframe is validated using ValidBBoxesDataFrameCOCO. @@ -98,7 +97,7 @@ def _to_COCO_exportable_df( return df[cols_to_select] -@_check_input(validator=ValidBboxesDataset) +@_check_input(validator=ValidBboxAnnotationsDataset) def _get_raw_df_from_ds(ds: xr.Dataset) -> pd.DataFrame: """Get preliminary dataframe from a dataset of bounding boxes annotations. @@ -164,7 +163,7 @@ def _get_raw_df_from_ds(ds: xr.Dataset) -> pd.DataFrame: @pa.check_types def _add_COCO_data_to_df( df: pd.DataFrame, ds_attrs: dict -) -> DataFrame[ValidBboxesDataFrameCOCO]: +) -> DataFrame[ValidBboxAnnotationsCOCO]: """Add COCO-required data to preliminary dataframe. The input dataframe is obtained from a dataset of bounding boxes @@ -266,7 +265,9 @@ def _add_COCO_data_to_df( @pa.check_types -def _create_COCO_dict(df: DataFrame[ValidBboxesDataFrameCOCO]) -> dict: +def _create_COCO_dict( + df: DataFrame[ValidBboxAnnotationsCOCO], +) -> dict: """Extract COCO dictionary from a COCO-exportable dataframe. 
Parameters @@ -282,7 +283,7 @@ def _create_COCO_dict(df: DataFrame[ValidBboxesDataFrameCOCO]) -> dict: """ COCO_dict: dict[str, Any] = {} map_columns_to_COCO_fields = ( - ValidBboxesDataFrameCOCO.map_df_columns_to_COCO_fields() + ValidBboxAnnotationsCOCO.map_df_columns_to_COCO_fields() ) for sections in ["images", "categories", "annotations"]: # Extract and rename required columns for this section diff --git a/ethology/io/annotations/validate.py b/ethology/validators/annotations.py similarity index 86% rename from ethology/io/annotations/validate.py rename to ethology/validators/annotations.py index 2e00ab92..0ecb886c 100644 --- a/ethology/io/annotations/validate.py +++ b/ethology/validators/annotations.py @@ -1,22 +1,20 @@ """Validators for annotation files and datasets.""" import json -from collections.abc import Callable -from functools import wraps from pathlib import Path import pandas as pd import pandera.pandas as pa -import xarray as xr from attrs import define, field from pandera.typing import Index -from ethology.io.annotations.json_schemas.utils import ( +from ethology.validators.json_schemas.utils import ( _check_file_is_json, _check_file_matches_schema, _check_required_keys_in_dict, _get_default_schema, ) +from ethology.validators.utils import ValidDataset @define @@ -227,25 +225,39 @@ def _file_contains_unique_image_IDs(self, attribute, value): @define -class ValidBboxesDataset: +class ValidBboxAnnotationsDataset(ValidDataset): """Class for valid ``ethology`` bounding box annotations datasets. - It checks that the input dataset has: + This class validates that the input dataset: + + - is an xarray Dataset, + - has ``image_id``, ``space``, ``id`` as dimensions, + - has ``position`` and ``shape`` as data variables, + - both data variables span at least the dimensions ``image_id``, + ``space`` and ``id``. 
- - ``image_id``, ``space``, ``id`` as dimensions - - ``position`` and ``shape`` as data variables Attributes ---------- dataset : xarray.Dataset The xarray dataset to validate. + required_dims : set[str] + The set of required dimension names: ``image_id``, ``space`` and + ``id``. + required_data_vars : dict[str, set] + A dictionary mapping data variable names to their required minimum + dimensions: + + - ``position`` maps to ``image_id``, ``space`` and ``id``, + - ``shape`` maps to ``image_id``, ``space`` and ``id``. Raises ------ TypeError If the input is not an xarray Dataset. ValueError - If the dataset is missing required data variables or dimensions. + If the dataset is missing required data variables or dimensions, + or if any required dimensions are missing for any data variable. Notes ----- @@ -254,46 +266,21 @@ class ValidBboxesDataset: """ - dataset: xr.Dataset = field() - - # Minimum requirements for annotations datasets holding bboxes + # Minimum requirements for a bbox dataset holding annotations required_dims: set = field( default={"image_id", "space", "id"}, init=False, ) - required_data_vars: set = field( - default={"position", "shape"}, + required_data_vars: dict = field( + default={ + "position": {"image_id", "space", "id"}, + "shape": {"image_id", "space", "id"}, + }, init=False, ) - @dataset.validator - def _check_dataset_type(self, attribute, value): - """Ensure the input is an xarray Dataset.""" - if not isinstance(value, xr.Dataset): - raise TypeError( - f"Expected an xarray Dataset, but got {type(value)}."
- ) - - @dataset.validator - def _check_required_data_variables(self, attribute, value): - """Ensure the dataset has all required data variables.""" - missing_vars = self.required_data_vars - set(value.data_vars) - if missing_vars: - raise ValueError( - f"Missing required data variables: {sorted(missing_vars)}" - ) - - @dataset.validator - def _check_required_dimensions(self, attribute, value): - """Ensure the dataset has all required dimensions.""" - missing_dims = self.required_dims - set(value.dims) - if missing_dims: - raise ValueError( - f"Missing required dimensions: {sorted(missing_dims)}" - ) - -class ValidBboxesDataFrame(pa.DataFrameModel): +class ValidBboxAnnotationsDataFrame(pa.DataFrameModel): """Class for valid bounding boxes intermediate dataframes. We use this dataframe internally as an intermediate step in the process of @@ -422,7 +409,7 @@ def get_empty_values() -> dict: } -class ValidBboxesDataFrameCOCO(pa.DataFrameModel): +class ValidBboxAnnotationsCOCO(pa.DataFrameModel): """Class for COCO-exportable bounding box annotations dataframes. The validation checks the required columns exist and their types are @@ -573,38 +560,3 @@ def check_idx_and_annotation_id(cls, df: pd.DataFrame) -> bool: """ return all(df.index == df["annotation_id"]) - - -def _check_output(validator: type): - """Return a decorator that validates the output of a function.""" - - def decorator(function: Callable) -> Callable: - @wraps(function) # to preserve function metadata - def wrapper(*args, **kwargs): - result = function(*args, **kwargs) - validator(result) - return result - - return wrapper - - return decorator - - -def _check_input(validator: type, input_index: int = 0): - """Return a decorator that validates a specific input of a function. - - By default, the first input is validated. If the input index is - larger than the number of inputs, no validation is performed. 
- """ - - def decorator(function: Callable) -> Callable: - @wraps(function) - def wrapper(*args, **kwargs): - if len(args) > input_index: - validator(args[input_index]) - result = function(*args, **kwargs) - return result - - return wrapper - - return decorator diff --git a/ethology/validators/detections.py b/ethology/validators/detections.py new file mode 100644 index 00000000..1f6d9df6 --- /dev/null +++ b/ethology/validators/detections.py @@ -0,0 +1,64 @@ +"""Validators for detection datasets.""" + +from attrs import define, field + +from ethology.validators.utils import ValidDataset + + +@define +class ValidBboxDetectionsDataset(ValidDataset): + """Class for valid ``ethology`` bounding box detections datasets. + + This class validates that the input dataset: + + - is an xarray Dataset, + - has ``image_id``, ``space``, ``id`` as dimensions, + - has ``position``, ``shape`` and ``confidence`` as data variables, + - ``position`` and ``shape`` span at least the dimensions ``image_id``, + ``space`` and ``id``, + - ``confidence`` spans at least the dimensions ``image_id`` and ``id``. + + + Attributes + ---------- + dataset : xarray.Dataset + The xarray dataset to validate. + required_dims : set + The set of required dimension names: ``image_id``, ``space`` and + ``id``. + required_data_vars : dict[str, set] + A dictionary mapping data variable names to their required minimum + dimensions: + + - ``position`` maps to ``image_id``, ``space`` and ``id``, + - ``shape`` maps to ``image_id``, ``space`` and ``id``, + - ``confidence`` maps to ``image_id`` and ``id``. + + Raises + ------ + TypeError + If the input is not an xarray Dataset. + ValueError + If the dataset is missing required data variables or dimensions, + or if any required dimensions are missing for any data variable. + + Notes + ----- + The dataset can have other data variables and dimensions, but only the + required ones are checked. 
+ + """ + + # Minimum requirements for a bbox dataset holding detections + required_dims: set = field( + default={"image_id", "space", "id"}, + init=False, + ) + required_data_vars: dict = field( + default={ + "position": {"image_id", "space", "id"}, + "shape": {"image_id", "space", "id"}, + "confidence": {"image_id", "id"}, + }, + init=False, + ) diff --git a/ethology/io/annotations/json_schemas/__init__.py b/ethology/validators/json_schemas/__init__.py similarity index 100% rename from ethology/io/annotations/json_schemas/__init__.py rename to ethology/validators/json_schemas/__init__.py diff --git a/ethology/io/annotations/json_schemas/schemas/COCO_schema.json b/ethology/validators/json_schemas/schemas/COCO_schema.json similarity index 100% rename from ethology/io/annotations/json_schemas/schemas/COCO_schema.json rename to ethology/validators/json_schemas/schemas/COCO_schema.json diff --git a/ethology/io/annotations/json_schemas/schemas/README.md b/ethology/validators/json_schemas/schemas/README.md similarity index 100% rename from ethology/io/annotations/json_schemas/schemas/README.md rename to ethology/validators/json_schemas/schemas/README.md diff --git a/ethology/io/annotations/json_schemas/schemas/VIA_schema.json b/ethology/validators/json_schemas/schemas/VIA_schema.json similarity index 100% rename from ethology/io/annotations/json_schemas/schemas/VIA_schema.json rename to ethology/validators/json_schemas/schemas/VIA_schema.json diff --git a/ethology/io/annotations/json_schemas/utils.py b/ethology/validators/json_schemas/utils.py similarity index 100% rename from ethology/io/annotations/json_schemas/utils.py rename to ethology/validators/json_schemas/utils.py diff --git a/ethology/validators/utils.py b/ethology/validators/utils.py new file mode 100644 index 00000000..ce74a289 --- /dev/null +++ b/ethology/validators/utils.py @@ -0,0 +1,147 @@ +"""Utils for validating `ethology` objects.""" + +from abc import ABC, abstractmethod +from collections.abc import 
Callable +from functools import wraps + +import xarray as xr +from attrs import define, field + + +@define +class ValidDataset(ABC): + """An abstract base class for valid ``ethology`` datasets. + + This class validates that the input dataset: + + - is an xarray Dataset + - contains all required dimensions + - contains all required data variables + - has the correct dimensions for each data variable + + Subclasses must define ``required_dims`` and ``required_data_vars`` + attributes. + + Attributes + ---------- + dataset : xarray.Dataset + The xarray dataset to validate. + required_dims : set[str] + A set of required dimension names. This attribute should be + defined by any subclass inheriting from this class. + required_data_vars : dict[str, set] + A dictionary mapping data variable names to their required dimensions. + This attribute should be defined by any subclass inheriting from + this class. + + Raises + ------ + TypeError + If the input is not an xarray Dataset. + ValueError + If the dataset is missing required data variables or dimensions, + or if any required dimensions are missing for any data variable. + + Notes + ----- + The dataset can have other data variables and dimensions, but only the + required ones are checked. + + """ + + dataset: xr.Dataset = field() + + # Subclasses should override these abstract properties + @property + @abstractmethod + def required_dims(self) -> set: + """Subclasses must provide a ``required_dims`` property.""" + pass # pragma: no cover + + @property + @abstractmethod + def required_data_vars(self) -> dict[str, set]: + """Subclasses must provide a ``required_data_vars`` property.""" + pass # pragma: no cover + + # Validators + @dataset.validator + def _check_dataset_type(self, attribute, value): + """Ensure the input is an xarray Dataset.""" + if not isinstance(value, xr.Dataset): + raise TypeError( + f"Expected an xarray Dataset, but got {type(value)}." 
+ ) + + @dataset.validator + def _check_required_data_variables(self, attribute, value): + """Ensure the dataset has all required data variables.""" + missing_vars = self.required_data_vars.keys() - set(value.data_vars) + if missing_vars: + raise ValueError( + f"Missing required data variables: {sorted(missing_vars)}" + ) + + @dataset.validator + def _check_required_dimensions(self, attribute, value): + """Ensure the dataset has all required dimensions.""" + missing_dims = self.required_dims - set(value.dims) + if missing_dims: + raise ValueError( + f"Missing required dimensions: {sorted(missing_dims)}" + ) + + @dataset.validator + def _check_dimensions_per_data_variable(self, attribute, value): + """Ensure each required data variable has its required dimensions.""" + error_messages = [] + for data_var, dims_per_data_var in self.required_data_vars.items(): + missing_dims = dims_per_data_var - set( + value.data_vars[data_var].dims + ) + if missing_dims: + error_messages.append( + f"data variable '{data_var}' is missing " + f"dimensions {sorted(missing_dims)}" + ) + + if error_messages: + raise ValueError( + "Some data variables are missing required dimensions:\n - " + + "\n - ".join(error_messages) + ) + + +def _check_output(validator: type): + """Return a decorator that validates the output of a function.""" + + def decorator(function: Callable) -> Callable: + @wraps(function) # to preserve function metadata + def wrapper(*args, **kwargs): + result = function(*args, **kwargs) + validator(result) + return result + + return wrapper + + return decorator + + +def _check_input(validator: type, input_index: int = 0): + """Return a decorator that validates a specific input of a function. + + By default, the first input is validated. Only positional arguments + are checked; if ``input_index`` is out of range, nothing is validated.
+ """ + + def decorator(function: Callable) -> Callable: + @wraps(function) + def wrapper(*args, **kwargs): + if len(args) > input_index: + validator(args[input_index]) + result = function(*args, **kwargs) + return result + + return wrapper + + return decorator diff --git a/tests/fixtures/annotations.py b/tests/fixtures/annotations.py index 5fd95a8f..eb3830fa 100644 --- a/tests/fixtures/annotations.py +++ b/tests/fixtures/annotations.py @@ -117,26 +117,10 @@ def small_schema() -> dict: } -@pytest.fixture() -def default_VIA_schema() -> dict: - """Get default VIA schema.""" - from ethology.io.annotations.json_schemas.utils import _get_default_schema - - return _get_default_schema("VIA") - - -@pytest.fixture() -def default_COCO_schema() -> dict: - """Get default COCO schema.""" - from ethology.io.annotations.json_schemas.utils import _get_default_schema - - return _get_default_schema("COCO") - - # ----------------- Bboxes dataset validation fixtures ----------------- @pytest.fixture -def valid_bboxes_dataset(): - """Create a valid xarray dataset for bboxes validation.""" +def valid_bbox_annotations_dataset(): + """Create a valid bbox annotations dataset for validation.""" image_ids = [1, 2, 3] annotation_ids = [0, 1, 2] # three per frame space_dims = ["x", "y"] @@ -145,13 +129,13 @@ def valid_bboxes_dataset(): position_data = np.zeros( (len(image_ids), len(space_dims), len(annotation_ids)) ) - shape_data = np.zeros((len(image_ids), len(annotation_ids))) + shape_data = np.copy(position_data) # Create the dataset ds = xr.Dataset( data_vars={ "position": (["image_id", "space", "id"], position_data), - "shape": (["image_id", "id"], shape_data), + "shape": (["image_id", "space", "id"], shape_data), }, coords={ "image_id": image_ids, @@ -164,10 +148,15 @@ def valid_bboxes_dataset(): @pytest.fixture -def valid_bboxes_dataset_extra_vars_and_dims( - valid_bboxes_dataset: xr.Dataset, +def valid_bbox_annotations_dataset_extra_vars_and_dims( + valid_bbox_annotations_dataset: 
xr.Dataset, ) -> xr.Dataset: - ds = valid_bboxes_dataset.copy(deep=True) + """Create a valid bbox annotations dataset for validation. + + The dataset is valid but contains more variables and dimensions than + the minimum required for a bbox annotations dataset. + """ + ds = valid_bbox_annotations_dataset.copy(deep=True) ds.coords["extra_dim"] = [10, 20, 30] ds["extra_var_1"] = (["image_id"], np.random.rand(len(ds.image_id))) ds["extra_var_2"] = (["id"], np.random.rand(len(ds.id))) diff --git a/tests/test_unit/test_datasets/test_split.py b/tests/test_unit/test_datasets/test_split.py index c9846262..23674cdf 100644 --- a/tests/test_unit/test_datasets/test_split.py +++ b/tests/test_unit/test_datasets/test_split.py @@ -23,12 +23,12 @@ def split_at_any_delimiter(text: str, delimiters: list[str]) -> list[str]: @pytest.fixture -def valid_bboxes_dataset_to_split_1(valid_bboxes_dataset): +def valid_bbox_annotations_ds_to_split_1(valid_bbox_annotations_dataset): # We add a `foo` variable to the dataset that is # one-dimensional along the `image_id` dimension to # use for grouping by. # Note: len(valid_bboxes_dataset.image_id) = 3 - ds = valid_bboxes_dataset.copy(deep=True) + ds = valid_bbox_annotations_dataset.copy(deep=True) ds["foo"] = ( ["image_id"], np.array([0, 1, 1]), @@ -37,14 +37,14 @@ def valid_bboxes_dataset_to_split_1(valid_bboxes_dataset): @pytest.fixture -def valid_bboxes_dataset_to_split_2(valid_bboxes_dataset): +def valid_bbox_annotations_ds_to_split_2(valid_bbox_annotations_dataset): # We add a `foo` variable to the dataset that is # one-dimensional along the `image_id` dimension to # use for grouping by. In this case we ensure we have # 3 groups to be able to split using 3 folds (with # GroupKFold we cannot have more folds than groups). 
# Note: len(valid_bboxes_dataset.image_id) = 3 - ds = valid_bboxes_dataset.copy(deep=True) + ds = valid_bbox_annotations_dataset.copy(deep=True) ds["foo"] = ( ["image_id"], np.array([0, 1, 2]), @@ -150,12 +150,12 @@ def test_approximate_subset_sum(inputs, expected_subset_dict): "inputs", [ { - "dataset": "valid_bboxes_dataset_to_split_1", + "dataset": "valid_bbox_annotations_ds_to_split_1", "list_fractions": [0.334, 0.666], "samples_coordinate": "image_id", }, # fractions in increasing order { - "dataset": "valid_bboxes_dataset_to_split_1", + "dataset": "valid_bbox_annotations_ds_to_split_1", "list_fractions": [0.666, 0.334], "samples_coordinate": "image_id", }, # fractions in decreasing order @@ -207,12 +207,12 @@ def test_split_dataset_group_by_apss(inputs, request): "inputs", [ { - "dataset": "valid_bboxes_dataset_to_split_2", + "dataset": "valid_bbox_annotations_ds_to_split_2", "list_fractions": [0.334, 0.666], "samples_coordinate": "image_id", }, # fractions in increasing order { - "dataset": "valid_bboxes_dataset_to_split_2", + "dataset": "valid_bbox_annotations_ds_to_split_2", "list_fractions": [0.666, 0.334], "samples_coordinate": "image_id", }, # fractions in decreasing order @@ -258,10 +258,12 @@ def test_split_dataset_group_by_kfold(inputs, request): ) -def test_split_dataset_group_by_kfold_seed(valid_bboxes_dataset_to_split_2): +def test_split_dataset_group_by_kfold_seed( + valid_bbox_annotations_ds_to_split_2, +): """Test the behaviour of the seed when using the `kfold` method.""" # prepare inputs - dataset = valid_bboxes_dataset_to_split_2 + dataset = valid_bbox_annotations_ds_to_split_2 list_fractions = [0.334, 0.666] samples_coordinate = "image_id" group_by_var = "foo" @@ -313,7 +315,7 @@ def test_split_dataset_group_by_kfold_seed(valid_bboxes_dataset_to_split_2): ], ) def test_split_dataset_group_by( - method, function_to_mock, valid_bboxes_dataset_to_split_1 + method, function_to_mock, valid_bbox_annotations_ds_to_split_1 ): """Test the wrapper 
function dispatches to the appropriate method.""" # Create mock return datasets @@ -322,7 +324,7 @@ def test_split_dataset_group_by( # Patch the internal function and call the wrapper with patch(function_to_mock, return_value=mock_return_value) as mock: _ds_subset_1, _ds_subset_2 = split_dataset_group_by( - dataset=valid_bboxes_dataset_to_split_1, + dataset=valid_bbox_annotations_ds_to_split_1, group_by_var="foo", list_fractions=[0.334, 0.666], samples_coordinate="image_id", @@ -336,8 +338,8 @@ def test_split_dataset_group_by( @pytest.mark.parametrize( "dataset, expected_method", [ - ("valid_bboxes_dataset_to_split_1", "apss"), - ("valid_bboxes_dataset_to_split_2", "kfold"), + ("valid_bbox_annotations_ds_to_split_1", "apss"), + ("valid_bbox_annotations_ds_to_split_2", "kfold"), ], ) def test_split_dataset_group_by_auto(dataset, expected_method, request): @@ -365,12 +367,12 @@ def test_split_dataset_group_by_auto(dataset, expected_method, request): def test_split_dataset_group_by_unknown_method( - valid_bboxes_dataset_to_split_1, + valid_bbox_annotations_ds_to_split_1, ): """Test that an unknown method raises a ValueError.""" with pytest.raises(ValueError, match="Unknown method"): split_dataset_group_by( - dataset=valid_bboxes_dataset_to_split_1, + dataset=valid_bbox_annotations_ds_to_split_1, group_by_var="foo", list_fractions=[0.5, 0.5], method="unknown_method", @@ -381,17 +383,17 @@ def test_split_dataset_group_by_unknown_method( "inputs", [ { - "dataset": "valid_bboxes_dataset_to_split_1", + "dataset": "valid_bbox_annotations_ds_to_split_1", "list_fractions": [0.334, 0.666], "samples_coordinate": "image_id", }, # fractions in increasing order { - "dataset": "valid_bboxes_dataset_to_split_1", + "dataset": "valid_bbox_annotations_ds_to_split_1", "list_fractions": [0.666, 0.334], "samples_coordinate": "image_id", }, # fractions in decreasing order { - "dataset": "valid_bboxes_dataset_to_split_1", + "dataset": "valid_bbox_annotations_ds_to_split_1", 
"list_fractions": [1 / 3, 1 / 3, 1 / 3], "samples_coordinate": "image_id", }, # more than two fractions @@ -444,26 +446,26 @@ def test_split_dataset_random(inputs, request): [ ( "auto", - "valid_bboxes_dataset_to_split_1", + "valid_bbox_annotations_ds_to_split_1", # dataset that will trigger auto-selection of apss # with the requested fractions 0.334 and 0.666 "Auto-selected approximate subset-sum method", ), ( "auto", - "valid_bboxes_dataset_to_split_2", + "valid_bbox_annotations_ds_to_split_2", # dataset with 3 groups so kfold method can be used "Using group k-fold method with", ), ( "kfold", - "valid_bboxes_dataset_to_split_2", + "valid_bbox_annotations_ds_to_split_2", # dataset with 3 groups so kfold method can be used "Using group k-fold method with", ), ( "apss", - "valid_bboxes_dataset_to_split_2", + "valid_bbox_annotations_ds_to_split_2", # dataset with 3 groups so apss method can be used "Using approximate subset-sum method with", ), @@ -639,7 +641,7 @@ def test_split_dataset_warning_empty_subset( ): """Test that a warning is thrown when at least one subset is empty.""" # Get dataset to split - ds = request.getfixturevalue("valid_bboxes_dataset_to_split_1") + ds = request.getfixturevalue("valid_bbox_annotations_ds_to_split_1") inputs["dataset"] = ds # We use fractions that will cause an empty subset diff --git a/tests/test_unit/test_io_annotations/test_save_bboxes.py b/tests/test_unit/test_io_annotations/test_save_bboxes.py index fb32e978..21a0c60f 100644 --- a/tests/test_unit/test_io_annotations/test_save_bboxes.py +++ b/tests/test_unit/test_io_annotations/test_save_bboxes.py @@ -16,7 +16,7 @@ _get_raw_df_from_ds, to_COCO_file, ) -from ethology.io.annotations.validate import ValidBboxesDataFrameCOCO +from ethology.validators.annotations import ValidBboxAnnotationsCOCO def read_JSON_as_dict(file_path: str | Path) -> dict: @@ -146,7 +146,7 @@ def _sample_bboxes_df_drop( ).set_index("annotation_id", drop=False) # Validate as COCO-exportable - df = 
ValidBboxesDataFrameCOCO.validate(df) + df = ValidBboxAnnotationsCOCO.validate(df) # Drop columns if specified if columns_to_drop: @@ -216,7 +216,7 @@ def test_validate_bboxes_df_COCO( df_factory = request.getfixturevalue(df) df = df_factory() with expected_exception as excinfo: - ValidBboxesDataFrameCOCO(df) + ValidBboxAnnotationsCOCO(df) if excinfo: assert expected_error_message in str(excinfo.value) @@ -366,7 +366,7 @@ def test_create_COCO_dict(sample_bboxes_df: Callable): # Check keys in each section map_df_columns_to_coco = copy.deepcopy( - ValidBboxesDataFrameCOCO.map_df_columns_to_COCO_fields() + ValidBboxAnnotationsCOCO.map_df_columns_to_COCO_fields() ) for section, section_mapping in map_df_columns_to_coco.items(): assert all( diff --git a/tests/test_unit/test_io_annotations/test_validators.py b/tests/test_unit/test_validators/test_annotations.py similarity index 52% rename from tests/test_unit/test_io_annotations/test_validators.py rename to tests/test_unit/test_validators/test_annotations.py index d054da27..fbcc77ee 100644 --- a/tests/test_unit/test_io_annotations/test_validators.py +++ b/tests/test_unit/test_validators/test_annotations.py @@ -5,13 +5,8 @@ import pytest import xarray as xr -from ethology.io.annotations.json_schemas.utils import ( - _check_required_keys_in_dict, - _check_required_properties_keys, - _extract_properties_keys, -) -from ethology.io.annotations.validate import ( - ValidBboxesDataset, +from ethology.validators.annotations import ( + ValidBboxAnnotationsDataset, ValidCOCO, ValidVIA, ) @@ -101,248 +96,6 @@ def test_validators_invalid_input_files( assert invalid_json_file.name in str(excinfo.value) -@pytest.mark.parametrize( - "schema, expected_properties_keys", - [ - ("small_schema", ["a", "b", "b/b1", "c", "c/c1", "c/c2"]), - ( - "default_VIA_schema", - [ - "_via_attributes", - "_via_attributes/file", - "_via_attributes/region", - "_via_attributes/region/default_options", - "_via_attributes/region/description", - 
"_via_attributes/region/options", - "_via_attributes/region/type", - "_via_data_format_version", - "_via_image_id_list", - "_via_img_metadata", - "_via_img_metadata/file_attributes", - "_via_img_metadata/filename", - "_via_img_metadata/regions", - "_via_img_metadata/regions/region_attributes", - "_via_img_metadata/regions/shape_attributes", - "_via_img_metadata/regions/shape_attributes/height", - "_via_img_metadata/regions/shape_attributes/name", - "_via_img_metadata/regions/shape_attributes/width", - "_via_img_metadata/regions/shape_attributes/x", - "_via_img_metadata/regions/shape_attributes/y", - "_via_img_metadata/size", - "_via_settings", - "_via_settings/core", - "_via_settings/project", - "_via_settings/ui", - ], - ), - ( - "default_COCO_schema", - [ - "annotations", - "annotations/area", - "annotations/bbox", - "annotations/category_id", - "annotations/id", - "annotations/image_id", - "annotations/iscrowd", - "categories", - "categories/id", - "categories/name", - "categories/supercategory", - "images", - "images/file_name", - "images/height", - "images/id", - "images/width", - "info", - "licenses", - ], - ), - ], -) -def test_extract_properties_keys( - schema: dict, - expected_properties_keys: list, - request: pytest.FixtureRequest, -): - """Test the _extract_properties_keys helper function.""" - schema = request.getfixturevalue(schema) - assert _extract_properties_keys(schema) == sorted(expected_properties_keys) - - -@pytest.mark.parametrize( - ( - "list_required_keys, input_dict, additional_message, " - "expected_exception, expected_message" - ), - [ - ( - ["images", "annotations", "categories"], - { - "images": [1, 2, 3], - "annotations": [1, 2, 3], - "categories": [1, 2, 3], - }, - "", - does_not_raise(), - "", - ), # zero missing keys, and all keys map to non-empty values - ( - ["images", "annotations", "categories"], - { - "images": [], - "annotations": [1, 2, 3], - "categories": [1, 2, 3], - }, - "", - pytest.raises(ValueError), - "Empty value(s) 
found for the required key(s) ['images'].", - ), # zero missing keys, but one ("images") maps to empty values - ( - ["images", "annotations", "categories"], - { - "images": [], - "annotations": {}, - "categories": [1, 2, 3], - }, - "", - pytest.raises(ValueError), - ( - "Empty value(s) found for the required key(s) " - "['annotations', 'images']." - ), - ), # zero missing keys, but two keys map to empty values - ( - ["images", "annotations", "categories"], - {"annotations": "", "categories": ""}, - "", - pytest.raises(ValueError), - "Required key(s) ['images'] not found.", - ), # one missing key - ( - ["images", "annotations", "categories"], - {"annotations": ""}, - "", - pytest.raises(ValueError), - "Required key(s) ['categories', 'images'] not found.", - ), # two missing keys - ( - ["images", "annotations", "categories"], - {"annotations": "", "categories": ""}, - "FOO", - pytest.raises(ValueError), - "Required key(s) ['images'] not foundFOO.", - ), # one missing key with additional message for missing keys - ], -) -def test_check_required_keys_in_dict( - list_required_keys: list, - input_dict: dict, - additional_message: str, - expected_exception: pytest.raises, - expected_message: str, -): - """Test the _check_required_keys_in_dict helper function. - - The check verifies that the required keys are defined in the input - dictionary and if they are, it checks that they do not map to empty - values. - """ - with expected_exception as excinfo: - _check_required_keys_in_dict( - list_required_keys, input_dict, additional_message - ) - - # Check error message - if excinfo: - assert expected_message in str(excinfo.value) - - -def test_check_required_properties_keys(small_schema: dict): - """Test the _check_required_keys helper function.""" - # Define a sample schema from "small_schema" - # with a "properties" key missing (e.g. 
"c/c2") - small_schema["properties"]["c"]["properties"].pop("c2") - - # Define required "properties" keys - required_keys = ["a", "b", "c/c2"] - - # Run check - with pytest.raises(ValueError) as excinfo: - _check_required_properties_keys(required_keys, small_schema) - - # Check error message - assert "Required key(s) ['c/c2'] not found in schema" in str(excinfo.value) - - -@pytest.mark.parametrize( - "input_file,", - [ - "VIA_JSON_sample_1.json", - "VIA_JSON_sample_2.json", - ], -) -def test_required_keys_in_provided_VIA_schema( - input_file: str, default_VIA_schema: dict, annotations_test_data: dict -): - """Check the provided VIA schema contains the ValidVIA required keys.""" - # Get required keys from a VIA valid file - filepath = annotations_test_data[input_file] - valid_VIA = ValidVIA(path=filepath) - required_VIA_keys = valid_VIA.required_keys - - # Map required keys to "properties" keys in schema - map_required_to_properties_keys = { - "main": "", - "images": "_via_img_metadata", - "regions": "_via_img_metadata/regions", - "shape_attributes": "_via_img_metadata/regions/shape_attributes", - } - - # Express required keys as required "properties" keys - required_property_keys = [ - val if ky == "main" else f"{map_required_to_properties_keys[ky]}/{val}" - for ky, values in required_VIA_keys.items() - for val in values - ] - - # Run check - _check_required_properties_keys( - required_property_keys, - default_VIA_schema, - ) - - -@pytest.mark.parametrize( - "input_file,", - [ - "COCO_JSON_sample_1.json", - "COCO_JSON_sample_2.json", - ], -) -def test_required_keys_in_provided_COCO_schema( - input_file: str, default_COCO_schema: dict, annotations_test_data: dict -): - """Check the provided COCO schema contains the ValidCOCO required keys.""" - # Get required keys from a COCO valid file - filepath = annotations_test_data[input_file] - valid_COCO = ValidCOCO(path=filepath) - required_COCO_keys = valid_COCO.required_keys - - # Prepare list of required "properties" 
keys with full paths - required_properties_keys = [ - f"{level}/{ky}" if level != "main" else ky - for level, required_keys in required_COCO_keys.items() - for ky in required_keys - ] - - # Run check - _check_required_properties_keys( - required_properties_keys, - default_COCO_schema, - ) - - @pytest.mark.parametrize( "validator, input_file, expected_exception", [ @@ -460,12 +213,33 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): "sample_dataset, expected_exception, expected_error_message", [ ( - "valid_bboxes_dataset", + "valid_bbox_annotations_dataset", does_not_raise(), "", ), ( - "valid_bboxes_dataset_extra_vars_and_dims", + "valid_bbox_annotations_dataset_extra_vars_and_dims", + does_not_raise(), + "", + ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": np.arange(2), + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["image_id", "space", "id", "foo"], + np.zeros((3, 2, 2, 1)), + ), + }, + ), does_not_raise(), "", ), @@ -540,24 +314,50 @@ def test_COCO_non_unique_image_IDs(annotations_test_data: dict): pytest.raises(ValueError), "Missing required dimensions: ['image_id', 'space']", ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": np.arange(2), + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["image_id", "id"], + np.zeros((3, 2)), + ), + }, + ), + pytest.raises(ValueError), + ( + "Some data variables are missing required dimensions:" + "\n - data variable 'shape' is missing dimensions ['space']" + ), + ), ], ids=[ - "valid_bboxes_dataset", - "valid_bboxes_dataset_extra_vars_and_dims", - "invalid_bboxes_dataset_type", - "invalid_bboxes_dataset_missing_data_var", - "invalid_bboxes_dataset_missing_multiple_data_vars", - "invalid_bboxes_dataset_missing_dimension", - "invalid_bboxes_dataset_missing_multiple_dimensions", + 
"valid_bbox_annotations", + "valid_bbox_annotations_extra_vars_and_dims", + "valid_bbox_detections_extra_dims_in_shape_var", + "invalid_bbox_annotations_type", + "invalid_bbox_annotations_missing_data_var", + "invalid_bbox_annotations_missing_multiple_data_vars", + "invalid_bbox_annotations_missing_dimension", + "invalid_bbox_annotations_missing_multiple_dimensions", + "invalid_bbox_annotations_missing_dimension_in_data_var", ], ) -def test_valid_bboxes_dataset_validation( +def test_validator_bbox_annotations_dataset( sample_dataset: str | dict, expected_exception: pytest.raises, expected_error_message: str, request: pytest.FixtureRequest, ): - """Test ValidBboxesDataset validation with various input scenarios.""" + """Test bbox annotations dataset validation in various input scenarios.""" # Get dataset to validate if isinstance(sample_dataset, str): dataset = request.getfixturevalue(sample_dataset) @@ -566,7 +366,7 @@ def test_valid_bboxes_dataset_validation( # Run validation and check exception with expected_exception as excinfo: - validator = ValidBboxesDataset(dataset=dataset) + validator = ValidBboxAnnotationsDataset(dataset=dataset) if excinfo: error_msg = str(excinfo.value) @@ -574,4 +374,7 @@ def test_valid_bboxes_dataset_validation( else: assert validator.dataset is dataset assert validator.required_dims == {"image_id", "space", "id"} - assert validator.required_data_vars == {"position", "shape"} + assert validator.required_data_vars == { + "position": {"id", "image_id", "space"}, + "shape": {"id", "image_id", "space"}, + } diff --git a/tests/test_unit/test_validators/test_detections.py b/tests/test_unit/test_validators/test_detections.py new file mode 100644 index 00000000..d053d6ef --- /dev/null +++ b/tests/test_unit/test_validators/test_detections.py @@ -0,0 +1,238 @@ +from contextlib import nullcontext as does_not_raise + +import numpy as np +import pytest +import xarray as xr + +from ethology.validators.detections import ValidBboxDetectionsDataset + + 
+@pytest.fixture +def valid_bbox_detections_dataset(): + """Create a valid bbox detections dataset for validation.""" + image_ids = [1, 2, 3] + annotation_ids = [0, 1, 2] # max 3 bboxes per frame + space_dims = ["x", "y"] + + # Create position, shape and confidence data all zeros + position_data = np.zeros( + (len(image_ids), len(space_dims), len(annotation_ids)) + ) + shape_data = np.copy(position_data) + confidence_data = np.zeros((len(image_ids), len(annotation_ids))) + + # Create the dataset + ds = xr.Dataset( + data_vars={ + "position": (["image_id", "space", "id"], position_data), + "shape": (["image_id", "space", "id"], shape_data), + "confidence": (["image_id", "id"], confidence_data), + }, + coords={ + "image_id": image_ids, + "space": ["x", "y"], + "id": annotation_ids, + }, + ) + + return ds + + +@pytest.fixture +def valid_bbox_detections_dataset_extra_vars_and_dims( + valid_bbox_detections_dataset: xr.Dataset, +) -> xr.Dataset: + ds = valid_bbox_detections_dataset.copy(deep=True) + ds.coords["extra_dim"] = [10, 20, 30] + ds["extra_var_1"] = (["image_id"], np.random.rand(len(ds.image_id))) + ds["extra_var_2"] = (["id"], np.random.rand(len(ds.id))) + return ds + + +@pytest.mark.parametrize( + "sample_dataset, expected_exception, expected_error_message", + [ + ( + "valid_bbox_detections_dataset", + does_not_raise(), + "", + ), + ( + "valid_bbox_detections_dataset_extra_vars_and_dims", + does_not_raise(), + "", + ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": np.arange(2), + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["image_id", "space", "id", "foo"], + np.zeros((3, 2, 2, 1)), + ), + "confidence": ( + ["image_id", "id"], + np.zeros((3, 2)), + ), + }, + ), + does_not_raise(), + "", + ), + ( + {"position": [1, 2, 3], "shape": [4, 5, 6]}, + pytest.raises(TypeError), + "Expected an xarray Dataset, but got <class 'dict'>.", + ), + ( + xr.Dataset( + coords={ +
"image_id": np.arange(3), + "space": ["x", "y"], + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + }, + ), + pytest.raises(ValueError), + "Missing required data variables: ['confidence']", + ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": ["x", "y"], + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + }, + ), + pytest.raises(ValueError), + "Missing required data variables: ['confidence', 'shape']", + ), + ( + xr.Dataset( + coords={"image_id": np.arange(3), "id": np.arange(2)}, + data_vars={ + "position": (["image_id", "id"], np.zeros((3, 2))), + "shape": (["image_id", "id"], np.zeros((3, 2))), + "confidence": (["image_id", "id"], np.zeros((3, 2))), + }, + ), + pytest.raises(ValueError), + "Missing required dimensions: ['space']", + ), + ( + xr.Dataset( + coords={ + "foo": np.arange(3), + "bar": ["x", "y"], + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["foo", "bar", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["foo", "bar", "id"], + np.zeros((3, 2, 2)), + ), + "confidence": ( + ["foo", "id"], + np.zeros((3, 2)), + ), + }, + ), + pytest.raises(ValueError), + "Missing required dimensions: ['image_id', 'space']", + ), + ( + xr.Dataset( + coords={ + "image_id": np.arange(3), + "space": np.arange(2), + "id": np.arange(2), + }, + data_vars={ + "position": ( + ["image_id", "space", "id"], + np.zeros((3, 2, 2)), + ), + "shape": ( + ["image_id", "id"], + np.zeros((3, 2)), + ), + "confidence": ( + ["image_id", "id"], + np.zeros((3, 2)), + ), + }, + ), + pytest.raises(ValueError), + ( + "Some data variables are missing required dimensions:" + "\n - data variable 'shape' is missing dimensions ['space']" + ), + ), + ], + ids=[ + "valid_bbox_detections", + "valid_bbox_detections_extra_vars_and_dims", + 
"valid_bbox_detections_extra_dims_in_shape_var", + "invalid_bbox_detections_type", + "invalid_bbox_detections_dataset_missing_data_var", + "invalid_bbox_detections_missing_multiple_data_vars", + "invalid_bbox_detections_missing_dimension", + "invalid_bbox_detections_missing_multiple_dimensions", + "invalid_bbox_detections_missing_dimension_in_data_var", + ], +) +def test_validator_bbox_detections_dataset( + sample_dataset: str | dict, + expected_exception: pytest.raises, + expected_error_message: str, + request: pytest.FixtureRequest, +): + """Test bbox annotations dataset validation in various input scenarios.""" + # Get dataset to validate + if isinstance(sample_dataset, str): + dataset = request.getfixturevalue(sample_dataset) + else: + dataset = sample_dataset + + # Run validation and check exception + with expected_exception as excinfo: + validator = ValidBboxDetectionsDataset(dataset=dataset) + + if excinfo: + error_msg = str(excinfo.value) + assert error_msg in expected_error_message + else: + assert validator.dataset is dataset + assert validator.required_dims == {"image_id", "space", "id"} + assert validator.required_data_vars == { + "position": {"image_id", "space", "id"}, + "shape": {"image_id", "space", "id"}, + "confidence": {"image_id", "id"}, + } diff --git a/tests/test_unit/test_validators/test_json_schemas.py b/tests/test_unit/test_validators/test_json_schemas.py new file mode 100644 index 00000000..f496043f --- /dev/null +++ b/tests/test_unit/test_validators/test_json_schemas.py @@ -0,0 +1,268 @@ +from contextlib import nullcontext as does_not_raise + +import pytest + +from ethology.validators.annotations import ValidCOCO, ValidVIA +from ethology.validators.json_schemas.utils import ( + _check_required_keys_in_dict, + _check_required_properties_keys, + _extract_properties_keys, +) + + +@pytest.fixture() +def default_VIA_schema() -> dict: + """Get default VIA schema.""" + from ethology.validators.json_schemas.utils import _get_default_schema + + 
return _get_default_schema("VIA") + + +@pytest.fixture() +def default_COCO_schema() -> dict: + """Get default COCO schema.""" + from ethology.validators.json_schemas.utils import _get_default_schema + + return _get_default_schema("COCO") + + +@pytest.mark.parametrize( + "schema, expected_properties_keys", + [ + ("small_schema", ["a", "b", "b/b1", "c", "c/c1", "c/c2"]), + ( + "default_VIA_schema", + [ + "_via_attributes", + "_via_attributes/file", + "_via_attributes/region", + "_via_attributes/region/default_options", + "_via_attributes/region/description", + "_via_attributes/region/options", + "_via_attributes/region/type", + "_via_data_format_version", + "_via_image_id_list", + "_via_img_metadata", + "_via_img_metadata/file_attributes", + "_via_img_metadata/filename", + "_via_img_metadata/regions", + "_via_img_metadata/regions/region_attributes", + "_via_img_metadata/regions/shape_attributes", + "_via_img_metadata/regions/shape_attributes/height", + "_via_img_metadata/regions/shape_attributes/name", + "_via_img_metadata/regions/shape_attributes/width", + "_via_img_metadata/regions/shape_attributes/x", + "_via_img_metadata/regions/shape_attributes/y", + "_via_img_metadata/size", + "_via_settings", + "_via_settings/core", + "_via_settings/project", + "_via_settings/ui", + ], + ), + ( + "default_COCO_schema", + [ + "annotations", + "annotations/area", + "annotations/bbox", + "annotations/category_id", + "annotations/id", + "annotations/image_id", + "annotations/iscrowd", + "categories", + "categories/id", + "categories/name", + "categories/supercategory", + "images", + "images/file_name", + "images/height", + "images/id", + "images/width", + "info", + "licenses", + ], + ), + ], +) +def test_extract_properties_keys( + schema: dict, + expected_properties_keys: list, + request: pytest.FixtureRequest, +): + """Test the _extract_properties_keys helper function.""" + schema = request.getfixturevalue(schema) + assert _extract_properties_keys(schema) == 
sorted(expected_properties_keys) + + +@pytest.mark.parametrize( + ( + "list_required_keys, input_dict, additional_message, " + "expected_exception, expected_message" + ), + [ + ( + ["images", "annotations", "categories"], + { + "images": [1, 2, 3], + "annotations": [1, 2, 3], + "categories": [1, 2, 3], + }, + "", + does_not_raise(), + "", + ), # zero missing keys, and all keys map to non-empty values + ( + ["images", "annotations", "categories"], + { + "images": [], + "annotations": [1, 2, 3], + "categories": [1, 2, 3], + }, + "", + pytest.raises(ValueError), + "Empty value(s) found for the required key(s) ['images'].", + ), # zero missing keys, but one ("images") maps to empty values + ( + ["images", "annotations", "categories"], + { + "images": [], + "annotations": {}, + "categories": [1, 2, 3], + }, + "", + pytest.raises(ValueError), + ( + "Empty value(s) found for the required key(s) " + "['annotations', 'images']." + ), + ), # zero missing keys, but two keys map to empty values + ( + ["images", "annotations", "categories"], + {"annotations": "", "categories": ""}, + "", + pytest.raises(ValueError), + "Required key(s) ['images'] not found.", + ), # one missing key + ( + ["images", "annotations", "categories"], + {"annotations": ""}, + "", + pytest.raises(ValueError), + "Required key(s) ['categories', 'images'] not found.", + ), # two missing keys + ( + ["images", "annotations", "categories"], + {"annotations": "", "categories": ""}, + "FOO", + pytest.raises(ValueError), + "Required key(s) ['images'] not foundFOO.", + ), # one missing key with additional message for missing keys + ], +) +def test_check_required_keys_in_dict( + list_required_keys: list, + input_dict: dict, + additional_message: str, + expected_exception: pytest.raises, + expected_message: str, +): + """Test the _check_required_keys_in_dict helper function. 
+ + The check verifies that the required keys are defined in the input + dictionary and if they are, it checks that they do not map to empty + values. + """ + with expected_exception as excinfo: + _check_required_keys_in_dict( + list_required_keys, input_dict, additional_message + ) + + # Check error message + if excinfo: + assert expected_message in str(excinfo.value) + + +def test_check_required_properties_keys(small_schema: dict): + """Test the _check_required_properties_keys helper function.""" + # Define a sample schema from "small_schema" + # with a "properties" key missing (e.g. "c/c2") + small_schema["properties"]["c"]["properties"].pop("c2") + + # Define required "properties" keys + required_keys = ["a", "b", "c/c2"] + + # Run check + with pytest.raises(ValueError) as excinfo: + _check_required_properties_keys(required_keys, small_schema) + + # Check error message + assert "Required key(s) ['c/c2'] not found in schema" in str(excinfo.value) + + +@pytest.mark.parametrize( + "input_file,", + [ + "VIA_JSON_sample_1.json", + "VIA_JSON_sample_2.json", + ], +) +def test_required_keys_in_provided_VIA_schema( + input_file: str, default_VIA_schema: dict, annotations_test_data: dict +): + """Check the provided VIA schema contains the ValidVIA required keys.""" + # Get required keys from a VIA valid file + filepath = annotations_test_data[input_file] + valid_VIA = ValidVIA(path=filepath) + required_VIA_keys = valid_VIA.required_keys + + # Map required keys to "properties" keys in schema + map_required_to_properties_keys = { + "main": "", + "images": "_via_img_metadata", + "regions": "_via_img_metadata/regions", + "shape_attributes": "_via_img_metadata/regions/shape_attributes", + } + + # Express required keys as required "properties" keys + required_property_keys = [ + val if ky == "main" else f"{map_required_to_properties_keys[ky]}/{val}" + for ky, values in required_VIA_keys.items() + for val in values + ] + + # Run check + _check_required_properties_keys( +
required_property_keys, + default_VIA_schema, + ) + + +@pytest.mark.parametrize( + "input_file,", + [ + "COCO_JSON_sample_1.json", + "COCO_JSON_sample_2.json", + ], +) +def test_required_keys_in_provided_COCO_schema( + input_file: str, default_COCO_schema: dict, annotations_test_data: dict +): + """Check the provided COCO schema contains the ValidCOCO required keys.""" + # Get required keys from a COCO valid file + filepath = annotations_test_data[input_file] + valid_COCO = ValidCOCO(path=filepath) + required_COCO_keys = valid_COCO.required_keys + + # Prepare list of required "properties" keys with full paths + required_properties_keys = [ + f"{level}/{ky}" if level != "main" else ky + for level, required_keys in required_COCO_keys.items() + for ky in required_keys + ] + + # Run check + _check_required_properties_keys( + required_properties_keys, + default_COCO_schema, + )