From fbdc3cd07b282c6b00ad04b940752026508d2d99 Mon Sep 17 00:00:00 2001 From: Peyton Lee Date: Wed, 19 Nov 2025 14:41:48 -0800 Subject: [PATCH 1/4] feat: Added warnings when validating datasets for potentially unreachable files --- colorizer_data/utils.py | 29 +++++++++++++++++++++++++++++ colorizer_data/writer.py | 14 ++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/colorizer_data/utils.py b/colorizer_data/utils.py index 18fc6f1..97fb99d 100644 --- a/colorizer_data/utils.py +++ b/colorizer_data/utils.py @@ -776,3 +776,32 @@ def _get_frame_count_from_3d_source(source: str) -> int: # Attempt to read the image to get info (such as length) img = BioImage(source) return int(img.dims.T) + + +def is_url(source: str) -> bool: + """ + Checks if a source string is an HTTP(S) URL. + """ + return source.startswith("http://") or source.startswith("https://") + + +def check_file_source(name: str, source: str | None, outpath: pathlib.Path): + """ + Logs warnings for missing or unreachable file sources. + """ + if source is None: + logging.error(f"{name} is undefined.") + elif not is_url(source): + # Check for absolute paths, parent paths, or missing files/folders. + if os.path.isabs(source): + logging.error( + f"{name} must be a relative path inside the dataset directory or an HTTP(S) URL. Received: '{source}'" + ) + elif ".." in pathlib.Path(source).parts: + logging.warning( + f"{name} should not contain parent directory references ('..'), as it may fail to load in certain deploy environments. Received: '{source}'" + ) + elif not os.path.exists(outpath / source): + logging.warning( + f"{name} path could not be found. Please check that it exists. Received: '{source}'" + ) diff --git a/colorizer_data/writer.py b/colorizer_data/writer.py index 086610f..87d1084 100644 --- a/colorizer_data/writer.py +++ b/colorizer_data/writer.py @@ -22,6 +22,7 @@ DEFAULT_FRAME_PREFIX, DEFAULT_FRAME_SUFFIX, _get_frame_count_from_3d_source, + check_file_source, cast_feature_to_info_type, copy_remote_or_local_file, generate_frame_paths, @@ -711,3 +712,16 @@ def validate_dataset( + " or add an offset if your frame numbers do not start at 0." + " You may also need to generate the list of frames yourself if your dataset is skipping frames." ) + + # Check that frames3d sources are reachable + if "frames3d" in self.manifest: + frames3d_metadata = Frames3dMetadata.from_dict(self.manifest["frames3d"]) + source = frames3d_metadata.source + check_file_source("3D frames source", source, self.outpath) + # Validate backdrops + if frames3d_metadata["backdrops"] is not None: + for i in range(len(frames3d_metadata["backdrops"])): + backdrop_source = frames3d_metadata["backdrops"][i]["source"] + check_file_source( + f"3D frames backdrop {i} source", backdrop_source, self.outpath + ) From 4c78381697e7971dc77cffce2bb6d9c245dbaaa1 Mon Sep 17 00:00:00 2001 From: Peyton Lee Date: Wed, 19 Nov 2025 14:58:25 -0800 Subject: [PATCH 2/4] fix: Unit tests --- colorizer_data/writer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/colorizer_data/writer.py b/colorizer_data/writer.py index 87d1084..3a62929 100644 --- a/colorizer_data/writer.py +++ b/colorizer_data/writer.py @@ -719,9 +719,9 @@ def validate_dataset( source = frames3d_metadata.source check_file_source("3D frames source", source, self.outpath) # Validate backdrops - if frames3d_metadata["backdrops"] is not None: - for i in range(len(frames3d_metadata["backdrops"])): - backdrop_source = frames3d_metadata["backdrops"][i]["source"] + if frames3d_metadata.backdrops is not None: + for i in range(len(frames3d_metadata.backdrops)): + backdrop_source = frames3d_metadata.backdrops[i].source check_file_source( f"3D frames backdrop {i} source", backdrop_source, self.outpath ) From 7611aab42d238dc003d438606837b7d4a1954c49 Mon Sep 17 00:00:00 2001 From: Peyton Lee Date: Thu, 20 Nov 2025 15:28:40 -0800 Subject: [PATCH 3/4] refactor: Update comments, warning messages --- colorizer_data/converter.py | 3 ++- colorizer_data/types.py | 4 ++-- colorizer_data/utils.py | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/colorizer_data/converter.py b/colorizer_data/converter.py index 9d3005c..df44b01 100644 --- a/colorizer_data/converter.py +++ b/colorizer_data/converter.py @@ -443,7 +443,8 @@ def convert_colorizer_data( be flattened along the Z-axis using a max projection. If `None`, 2D frame generation will be skipped. frames_3d (Frames3dMetadata | None): A `Frames3dMetadata` object containing the 3D image source - ("source") and channel ("segmentation_channel") to use for the 3D image source. + ("source") and channel ("segmentation_channel") to use for the 3D image source. The source + should be the path to or the URL of an OME-Zarr array (preferred) or OME-TIFF file. centroid_x_column (str): The name of the column containing x-coordinates of object centroids, in pixels relative to the frame image, where 0 is the left edge of the image. Defaults to "Centroid X." diff --git a/colorizer_data/types.py b/colorizer_data/types.py index 2c3e893..b651d60 100644 --- a/colorizer_data/types.py +++ b/colorizer_data/types.py @@ -155,8 +155,8 @@ class Frames3dMetadata(DataClassJsonMixin): source: str """ - HTTPS or local path to 3D data, ideally in OME-Zarr format (e.g. ends with - `.ome.zarr`). + HTTPS or relative path from the dataset directory to 3D data, ideally in + OME-Zarr format (e.g. ends with `.ome.zarr`). """ segmentation_channel: int = 0 """The channel of segmentation data. `0` by default.""" diff --git a/colorizer_data/utils.py b/colorizer_data/utils.py index 97fb99d..ad9c224 100644 --- a/colorizer_data/utils.py +++ b/colorizer_data/utils.py @@ -790,12 +790,14 @@ def check_file_source(name: str, source: str | None, outpath: pathlib.Path): Logs warnings for missing or unreachable file sources. """ if source is None: - logging.error(f"{name} is undefined.") + logging.error( + f"{name} is undefined and will fail to load. Please provide a relative path inside the dataset directory or an HTTP(S) URL to an OME-Zarr (preferred) or OME-TIFF file." + ) elif not is_url(source): # Check for absolute paths, parent paths, or missing files/folders. if os.path.isabs(source): logging.error( - f"{name} must be a relative path inside the dataset directory or an HTTP(S) URL. Received: '{source}'" + f"{name} cannot be an absolute path and will fail to load. Please provide a relative path inside the dataset directory or an HTTP(S) URL. Received: '{source}'" ) elif ".." in pathlib.Path(source).parts: logging.warning( From 15af659871aaf4a68a83ec3b2b486e5030d9da9a Mon Sep 17 00:00:00 2001 From: Peyton Lee Date: Thu, 20 Nov 2025 15:31:15 -0800 Subject: [PATCH 4/4] refactor: Update type comments on source paths --- colorizer_data/types.py | 16 ++++++++++++---- colorizer_data/utils.py | 4 ++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/colorizer_data/types.py b/colorizer_data/types.py index b651d60..151f52f 100644 --- a/colorizer_data/types.py +++ b/colorizer_data/types.py @@ -128,9 +128,13 @@ class Backdrop3dMetadata(DataClassJsonMixin): name: str source: str """ - HTTPS or local path to an OME-Zarr source volume (e.g. ends with - `.ome.zarr`). Can be the same source as the segmentations defined in - `Frames3dMetadata`. + HTTPS URL or relative path from the dataset directory to an OME-Zarr source + volume (e.g. ends with `.ome.zarr`). Can be the same source as the + segmentations defined in `Frames3dMetadata`. + + Example: + - `volumes/sample_data.ome.zarr` + - `https://example.com/data/sample_data.ome.zarr` """ channel_index: str """ @@ -155,8 +159,12 @@ class Frames3dMetadata(DataClassJsonMixin): source: str """ - HTTPS or relative path from the dataset directory to 3D data, ideally in + HTTPS URL or relative path from the dataset directory to 3D data, ideally in OME-Zarr format (e.g. ends with `.ome.zarr`). + + Example: + - `volumes/sample_data.ome.zarr` + - `https://example.com/data/sample_data.ome.zarr` """ segmentation_channel: int = 0 """The channel of segmentation data. `0` by default.""" diff --git a/colorizer_data/utils.py b/colorizer_data/utils.py index ad9c224..aba30dc 100644 --- a/colorizer_data/utils.py +++ b/colorizer_data/utils.py @@ -791,13 +791,13 @@ def check_file_source(name: str, source: str | None, outpath: pathlib.Path): """ if source is None: logging.error( - f"{name} is undefined and will fail to load. Please provide a relative path inside the dataset directory or an HTTP(S) URL to an OME-Zarr (preferred) or OME-TIFF file." + f"{name} is undefined and will fail to load. Please provide a relative path inside the dataset directory or an HTTPS URL to an OME-Zarr (preferred) or OME-TIFF file." ) elif not is_url(source): # Check for absolute paths, parent paths, or missing files/folders. if os.path.isabs(source): logging.error( - f"{name} cannot be an absolute path and will fail to load. Please provide a relative path inside the dataset directory or an HTTP(S) URL. Received: '{source}'" + f"{name} cannot be an absolute path and will fail to load. Please provide a relative path inside the dataset directory or an HTTPS URL. Received: '{source}'" ) elif ".." in pathlib.Path(source).parts: logging.warning(