Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tkakar/CAT-1102-add-scaling-factor-to-seg-config #107

Merged
merged 8 commits into from
Feb 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.8
0.3.9
67 changes: 53 additions & 14 deletions src/portal_visualization/builders/imaging_builders.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
from .base_builders import ViewConfBuilder
from ..paths import (IMAGE_PYRAMID_DIR, OFFSETS_DIR, SEQFISH_HYB_CYCLE_REGEX,
SEQFISH_FILE_REGEX, SEGMENTATION_SUPPORT_IMAGE_SUBDIR,
SEGMENTATION_SUBDIR, IMAGE_METADATA_DIR)
from pathlib import Path
import re

Expand All @@ -12,11 +16,10 @@
Component as cm,
)

from ..utils import get_matches, group_by_file_name, get_conf_cells, get_found_images
from ..paths import (IMAGE_PYRAMID_DIR, OFFSETS_DIR, SEQFISH_HYB_CYCLE_REGEX,
SEQFISH_FILE_REGEX, SEGMENTATION_SUPPORT_IMAGE_SUBDIR,
SEGMENTATION_SUBDIR)
from .base_builders import ViewConfBuilder
from ..utils import get_matches, group_by_file_name, get_conf_cells, get_found_images, \
get_found_images_all, get_image_scale, get_image_metadata

from ..constants import base_image_dirs

BASE_IMAGE_VIEW_TYPE = 'image'
SEG_IMAGE_VIEW_TYPE = 'seg'
Expand All @@ -30,6 +33,7 @@ def __init__(self, entity, groups_token, assets_endpoint, **kwargs):
self.use_full_resolution = []
self.use_physical_size_scaling = False
self.view_type = BASE_IMAGE_VIEW_TYPE
self.base_image_metadata = None
super().__init__(entity, groups_token, assets_endpoint, **kwargs)

def _get_img_and_offset_url(self, img_path, img_dir):
Expand All @@ -49,7 +53,8 @@ def _get_img_and_offset_url(self, img_path, img_dir):
... assets_endpoint='https://example.com')
>>> pprint(builder._get_img_and_offset_url("rel_path/to/clusters.ome.tiff", "rel_path/to"))
('https://example.com/uuid/rel_path/to/clusters.ome.tiff?token=groups_token',\n\
'https://example.com/uuid/output_offsets/clusters.offsets.json?token=groups_token')
'https://example.com/uuid/output_offsets/clusters.offsets.json?token=groups_token',\n\
'https://example.com/uuid/image_metadata/clusters.metadata.json?token=groups_token')

"""
img_url = self._build_assets_url(img_path)
Expand All @@ -62,6 +67,13 @@ def _get_img_and_offset_url(self, img_path, img_dir):
re.sub(img_dir, OFFSETS_DIR, img_url),
)
),
str(
re.sub(
r"ome\.tiff?",
"metadata.json",
re.sub(img_dir, IMAGE_METADATA_DIR, img_url),
)
),
)

def _get_img_and_offset_url_seg(self, img_path, img_dir):
Expand All @@ -74,6 +86,7 @@ def _get_img_and_offset_url_seg(self, img_path, img_dir):
"""
img_url = self._build_assets_url(img_path)
offsets_path = re.sub(IMAGE_PYRAMID_DIR, OFFSETS_DIR, img_dir)
metadata_path = re.sub(IMAGE_PYRAMID_DIR, IMAGE_METADATA_DIR, img_dir)
return (
img_url,
str(
Expand All @@ -83,6 +96,13 @@ def _get_img_and_offset_url_seg(self, img_path, img_dir):
re.sub(img_dir, offsets_path, img_url),
)
),
str(
re.sub(
r"ome\.tiff?",
"metadata.json",
re.sub(img_dir, metadata_path, img_url),
)
),
)

def _add_segmentation_image(self, dataset):
Expand All @@ -95,18 +115,24 @@ def _add_segmentation_image(self, dataset):
except Exception as e:
raise RuntimeError(f"Error while searching for segmentation images: {e}")

filtered_images = [img for img in found_images if SEGMENTATION_SUPPORT_IMAGE_SUBDIR not in img]
filtered_images = [
img for img in found_images
if not any(subdir in img for subdir in base_image_dirs)
]

if not filtered_images:
raise FileNotFoundError(f"Segmentation assay with uuid {self._uuid} has no matching files")

img_url, offsets_url = self._get_img_and_offset_url(filtered_images[0], self.seg_image_pyramid_regex)
img_url, offsets_url, metadata_url = self._get_img_and_offset_url(
filtered_images[0], self.seg_image_pyramid_regex)
seg_meta_data = get_image_metadata(self, metadata_url)

scale = get_image_scale(self.base_image_metadata, seg_meta_data)
if dataset is not None:
dataset.add_object(
ObsSegmentationsOmeTiffWrapper(img_url=img_url, offsets_url=offsets_url,
obs_types_from_channel_names=True,
# coordinate_transformations=[{"type": "scale", "scale":
# [0.377.,0.377,1,1,1]}] # need to read from a file
coordinate_transformations=[{"type": "scale", "scale": scale}]
)
)

Expand Down Expand Up @@ -148,7 +174,9 @@ def get_conf_cells_common(self, get_img_and_offset_url_func, **kwargs):
dataset = vc.add_dataset(name="Visualization Files")

if 'seg' in self.view_type:
img_url, offsets_url = get_img_and_offset_url_func(found_images[0], self.image_pyramid_regex)
img_url, offsets_url, metadata_url = get_img_and_offset_url_func(found_images[0], self.image_pyramid_regex)
meta_data = get_image_metadata(self, metadata_url)
self.base_image_metadata = meta_data
dataset = dataset.add_object(
ImageOmeTiffWrapper(img_url=img_url, offsets_url=offsets_url, name=Path(found_images[0]).name)
)
Expand All @@ -161,7 +189,7 @@ def get_conf_cells_common(self, get_img_and_offset_url_func, **kwargs):
img_url=img_url, offsets_url=offsets_url, name=Path(img_path).name
)
for img_path in found_images
for img_url, offsets_url in [get_img_and_offset_url_func(img_path, self.image_pyramid_regex)]
for img_url, offsets_url, _ in [get_img_and_offset_url_func(img_path, self.image_pyramid_regex)]
]
dataset.add_object(
MultiImageWrapper(images, use_physical_size_scaling=self.use_physical_size_scaling)
Expand Down Expand Up @@ -216,10 +244,21 @@ class KaggleSegImagePyramidViewConfBuilder(AbstractImagingViewConfBuilder):

def __init__(self, entity, groups_token, assets_endpoint, **kwargs):
super().__init__(entity, groups_token, assets_endpoint, **kwargs)
self.image_pyramid_regex = f"{IMAGE_PYRAMID_DIR}/{SEGMENTATION_SUPPORT_IMAGE_SUBDIR}"
self.seg_image_pyramid_regex = IMAGE_PYRAMID_DIR
self.view_type = KAGGLE_IMAGE_VIEW_TYPE

# Needed to adjust to various directory structures. For older datasets, the image pyramids will be present in
# 'processed_microscopy' or 'processedMicroscopy' while newer datasets are listed under lab_processed.

image_dir = SEGMENTATION_SUPPORT_IMAGE_SUBDIR
file_paths_found = self._get_file_paths()
paths = get_found_images_all(file_paths_found)
matched_dirs = {dir for dir in base_image_dirs if any(dir in img for img in paths)}

image_dir = next(iter(matched_dirs), image_dir)

self.image_pyramid_regex = f"{IMAGE_PYRAMID_DIR}/{image_dir}"

def get_conf_cells(self, **kwargs):
return self.get_conf_cells_common(self._get_img_and_offset_url_seg, **kwargs)

Expand Down Expand Up @@ -278,7 +317,7 @@ def get_conf_cells(self, **kwargs):
dataset = vc.add_dataset(name=pos_name)
sorted_images = sorted(images, key=self._get_hybcycle)
for img_path in sorted_images:
img_url, offsets_url = self._get_img_and_offset_url(
img_url, offsets_url, _ = self._get_img_and_offset_url(
img_path, IMAGE_PYRAMID_DIR
)
image_wrappers.append(
Expand Down
4 changes: 2 additions & 2 deletions src/portal_visualization/builders/sprm_builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def _get_ometiff_image_wrapper(self, found_image_file, found_image_path):
:param str found_image_file: The path to look for the image itself
:param str found_image_path: The folder to be replaced with the offsets path
"""
img_url, offsets_url = self._get_img_and_offset_url(
img_url, offsets_url, _ = self._get_img_and_offset_url(
found_image_file, re.escape(found_image_path),
)
return OmeTiffWrapper(
Expand Down Expand Up @@ -172,7 +172,7 @@ def _get_bitmask_image_path(self):
return f"{self._mask_path_regex}/{self._mask_name}" + r"\.ome\.tiff?"

def _get_ometiff_mask_wrapper(self, found_bitmask_file):
bitmask_img_url, bitmask_offsets_url = self._get_img_and_offset_url(
bitmask_img_url, bitmask_offsets_url, _ = self._get_img_and_offset_url(
found_bitmask_file, self.image_pyramid_regex,
)
return OmeTiffWrapper(
Expand Down
13 changes: 13 additions & 0 deletions src/portal_visualization/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Conversion factors from metres to each physical-size unit that may appear
# in image metadata (e.g. 1 m == 1e9 nm).  Keys must match the NFKC-normalized
# unit strings read from the metadata JSON.
image_units = dict(
    nm=1e9,
    μm=1e6,
    mm=1e3,
    cm=1e2,
    dm=10,
)

# Directory names that may hold the base image pyramids for kaggle-1 and
# kaggle-2.  Older datasets were processed into 'processed_microscopy' or
# 'processedMicroscopy'; newer datasets live under 'lab_processed'.
base_image_dirs = [
    'lab_processed',
    'processed_microscopy',
    'processedMicroscopy',
]
1 change: 1 addition & 0 deletions src/portal_visualization/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
# Subdirectory holding transformation files for segmentation data.
SEGMENTATION_SUBDIR = "extras/transformations"
# Zarr store produced by the seg-to-mudata pipeline.
SEGMENTATION_ZARR_STORES = "hubmap_ui/seg-to-mudata-zarr/objects.zarr"
# Default directory for the base (support) images backing segmentation masks.
SEGMENTATION_SUPPORT_IMAGE_SUBDIR = "lab_processed/images"
# Directory holding per-image sidecar metadata JSON (physical sizes/units).
IMAGE_METADATA_DIR = "image_metadata"
168 changes: 168 additions & 0 deletions src/portal_visualization/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
from pathlib import Path
import re
from itertools import groupby
from requests import get
from unicodedata import normalize

import nbformat
from vitessce import VitessceConfig

from .builders.base_builders import ConfCells
from .constants import image_units


def get_matches(files, regex):
Expand Down Expand Up @@ -82,6 +85,171 @@ def get_found_images(image_pyramid_regex, file_paths_found):
return found_images


def get_found_images_all(file_paths_found):
    """
    Return every OME-TIFF path among *file_paths_found*, regardless of the
    directory it lives in, excluding any path under a 'separate/' directory.
    """
    ome_tiff_paths = get_matches(file_paths_found, r".*\.ome\.tiff?$",)
    return [p for p in ome_tiff_paths if 'separate/' not in p]


def get_image_metadata(self, img_url):
    """
    Fetch the sidecar metadata JSON for an image URL and return it as a dict.

    Returns the parsed JSON only when it is a dict containing at least
    'PhysicalSizeX' and 'PhysicalSizeUnitX'; otherwise returns None and
    prints a diagnostic.

    :param self: builder instance providing ``_get_request_init()``
        (request kwargs such as auth headers; may return None).
    :param str img_url: URL of the image's ``*.metadata.json`` asset.
    :rtype: dict or None

    >>> import builtins
    >>> from unittest.mock import Mock, patch
    >>> mock_instance = Mock()
    >>> mock_instance._get_request_init.return_value = {}
    >>> mock_response = Mock()
    >>> mock_response.status_code = 404
    >>> mock_response.reason = 'Not Found'
    >>> with patch('portal_visualization.utils.get', return_value=mock_response):
    ...     with patch.object(builtins, 'print') as mock_print:
    ...         result = get_image_metadata(mock_instance, 'https://example.com/image')
    ...         mock_print.assert_called_with(f"Failed to retrieve https://example.com/image: 404 - Not Found")
    ...         assert result is None
    """

    meta_data = None
    request_init = self._get_request_init() or {}
    response = get(img_url, **request_init)
    # Success path is excluded from coverage; note the pragma must be spelled
    # "# pragma: no cover" (with the colon) for coverage.py to honor it.
    if response.status_code == 200:  # pragma: no cover
        data = response.json()
        if isinstance(data, dict) and "PhysicalSizeX" in data and 'PhysicalSizeUnitX' in data:
            meta_data = data
        else:
            print("Image does not have metadata")
    else:
        print(f"Failed to retrieve {img_url}: {response.status_code} - {response.reason}")
    return meta_data


def get_image_scale(base_metadata, seg_metadata):
    """
    Compute the x/y scale factors mapping a segmentation image onto its base image.

    Args:
        base_metadata (dict): Metadata for the base image (may be None).
        seg_metadata (dict): Metadata for the segmented image (may be None).

    Returns:
        list: [scale_x, scale_y, 1, 1, 1]; falls back to all ones when either
        metadata is missing or any physical-size unit is absent/unrecognized.

    Doctest:
    >>> from unittest.mock import patch
    >>> base = {'PhysicalSizeX': 50, 'PhysicalSizeY': 100, \
                'PhysicalSizeUnitX': 'mm', 'PhysicalSizeUnitY': 'mm'}
    >>> seg = {'PhysicalSizeX': 25, 'PhysicalSizeY': 50, \
               'PhysicalSizeUnitX': 'mm', 'PhysicalSizeUnitY': 'mm'}
    >>> with patch('builtins.print'):
    ...     get_image_scale(base, seg)
    [2.0, 2.0, 1, 1, 1]
    >>> with patch('builtins.print'):
    ...     get_image_scale(base, None)
    [1, 1, 1, 1, 1]
    """

    scale = [1, 1, 1, 1, 1]

    # Unpack (size_x, size_y, unit_x, unit_y); all None when metadata is absent.
    missing = (None, None, None, None)
    seg_x, seg_y, seg_x_unit, seg_y_unit = (
        get_physical_size_units(seg_metadata) if seg_metadata is not None else missing
    )
    base_x, base_y, base_x_unit, base_y_unit = (
        get_physical_size_units(base_metadata) if base_metadata is not None else missing
    )

    units = [base_x_unit, base_y_unit, seg_x_unit, seg_y_unit]
    if all(units) and all(unit in image_units for unit in units):
        # Ratio of physical sizes, corrected for differing units via the
        # metres-to-unit factors in image_units.
        scale_x = (base_x / seg_x) * (image_units[seg_x_unit] / image_units[base_x_unit])
        scale_y = (base_y / seg_y) * (image_units[seg_y_unit] / image_units[base_y_unit])
        scale = [scale_x, scale_y, 1, 1, 1]
    else:
        print("PhysicalSize units are not correct")
    print("Scaling factor: ", scale)
    return scale


def get_physical_size_units(metadata):
    """
    Extract the physical sizes (X, Y) and their units from image metadata.

    Args:
        metadata (dict): The metadata dictionary for the image.

    Returns:
        tuple: (size_x, size_y, size_x_unit, size_y_unit).  Sizes default to 1
        when the key is missing or None; units are None when missing or not
        strings.

    Doctest:

    >>> metadata = { \
        'PhysicalSizeX': 50, 'PhysicalSizeY': 100, 'PhysicalSizeUnitX': 'mm', 'PhysicalSizeUnitY': 'mm' \
    }
    >>> get_physical_size_units(metadata)
    (50, 100, 'mm', 'mm')

    >>> metadata = { \
        'PhysicalSizeX': None, 'PhysicalSizeY': 100, 'PhysicalSizeUnitX': 'mm', 'PhysicalSizeUnitY': 'mm' \
    }
    >>> get_physical_size_units(metadata)
    (1, 100, 'mm', 'mm')

    >>> get_physical_size_units({'PhysicalSizeX': 50, 'PhysicalSizeUnitX': 'mm'})
    (50, 1, 'mm', None)
    """

    # get_image_metadata only guarantees the X keys are present, so use .get()
    # rather than subscripting to avoid a KeyError when the Y keys are absent.
    # size_x and size_y default to 1 if nothing is provided.
    size_x = metadata.get('PhysicalSizeX')
    size_y = metadata.get('PhysicalSizeY')
    size_x = 1 if size_x is None else size_x
    size_y = 1 if size_y is None else size_y
    size_x_unit = convert_unicode_unit(metadata, 'PhysicalSizeUnitX')
    size_y_unit = convert_unicode_unit(metadata, 'PhysicalSizeUnitY')

    return size_x, size_y, size_x_unit, size_y_unit


def convert_unicode_unit(metadata, key):
    """
    Return the NFKC-normalized unit string stored at *key*, or None.

    NFKC normalization collapses unicode variants of a unit (e.g. MICRO SIGN
    '\u00b5' vs GREEK SMALL LETTER MU '\u03bc') into a single canonical form
    so units can be compared consistently.

    Args:
        metadata (dict): The metadata dictionary containing the key.
        key (str): The key for the unit (e.g. 'PhysicalSizeUnitX').

    Returns:
        str or None: The normalized unit, or None when the key is missing or
        its value is not a string.

    Doctest:

    >>> convert_unicode_unit({'PhysicalSizeUnitX': 'mm'}, 'PhysicalSizeUnitX')
    'mm'

    >>> convert_unicode_unit({'PhysicalSizeUnitY': '\u00b5m'}, 'PhysicalSizeUnitY')
    'μm'

    >>> convert_unicode_unit({'PhysicalSizeUnitY': None}, 'PhysicalSizeUnitY')
    """
    value = metadata.get(key)
    # Anything that is not a string (missing key, None, numeric) yields None.
    if not isinstance(value, str):
        return None
    return normalize('NFKC', value)


def files_from_response(response_json):
'''
>>> response_json = {'hits': {'hits': [
Expand Down
Loading