Merge pull request #85 from astronomy-commons/sean/catalog-refactor

Refactor Catalogs to inherit from base Dataset and not initialize from files
astronomy-commons · Apr 24, 2023 · bdde9aa · bdde9aa
2 parents 18c389d + e8581e4
commit bdde9aa
Show file tree

Hide file tree

Showing 21 changed files with 543 additions and 81 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,7 +22,8 @@ dependencies = [
     "setuptools_scm",
     "pyarrow",
     "astropy",
-    "regions"
+    "regions",
+    "typing-extensions"
 ]
 
 # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)

diff --git a/src/hipscat/catalog/__init__.py b/src/hipscat/catalog/__init__.py
@@ -2,4 +2,5 @@
 
 from .catalog import Catalog
 from .catalog_parameters import CatalogParameters
+from .catalog_type import CatalogType
 from .partition_info import PartitionInfo
diff --git a/src/hipscat/catalog/catalog.py b/src/hipscat/catalog/catalog.py
@@ -1,43 +1,71 @@
 """Container class to hold catalog metadata and partition iteration"""
+from __future__ import annotations
 
+from typing import Tuple, Union
 
-from hipscat.catalog.catalog_parameters import read_from_metadata_file
-from hipscat.catalog.partition_info import PartitionInfo
-from hipscat.io import file_io, paths
+import pandas as pd
 
+from hipscat.catalog.catalog_info import CatalogInfo
+from hipscat.catalog.catalog_type import CatalogType
+from hipscat.catalog.dataset.dataset import Dataset
+from hipscat.catalog.partition_info import PartitionInfo
+from hipscat.io import FilePointer, file_io, paths
+from hipscat.pixel_tree.pixel_tree import PixelTree
+from hipscat.pixel_tree.pixel_tree_builder import PixelTreeBuilder
 
-class Catalog:
-    """Container class for catalog metadata"""
 
-    def __init__(self, catalog_path: str = None) -> None:
-        self.catalog_path = catalog_path
-        self.catalog_base_dir = file_io.get_file_pointer_from_path(catalog_path)
-        self.metadata_keywords = None
+class Catalog(Dataset):
+    """A HiPSCat Catalog with data stored in a HEALPix Hive partitioned structure
 
-        self.partition_info = None
-        self.catalog_info = None
+    Catalogs of this type are partitioned spatially, contain `partition_info` metadata specifying
+    the pixels in the Catalog, and on disk conform to the parquet partitioning structure
+    `Norder=/Dir=/Npix=.parquet`
+    """
 
-        self.catalog_name = None
-        self.catalog_type = None
+    CatalogInfoClass = CatalogInfo
+    PixelInputTypes = Union[pd.DataFrame, PartitionInfo]
+    HIPS_CATALOG_TYPES = [CatalogType.OBJECT, CatalogType.SOURCE, CatalogType.MARGIN]
 
-        self._initialize_metadata()
+    def __init__(
+        self,
+        catalog_info: CatalogInfoClass,
+        pixels: PixelInputTypes,
+        catalog_path=None,
+    ) -> None:
+        """Initializes a Catalog
 
-    def _initialize_metadata(self):
-        if not file_io.does_file_or_directory_exist(self.catalog_base_dir):
-            raise FileNotFoundError(
-                f"No directory exists at {str(self.catalog_base_dir)}"
-            )
-        catalog_info_file = paths.get_catalog_info_pointer(self.catalog_base_dir)
-        if not file_io.does_file_or_directory_exist(catalog_info_file):
-            raise FileNotFoundError(
-                f"No catalog info found where expected: {str(catalog_info_file)}"
+        Args:
+            catalog_info: CatalogInfo object with catalog metadata
+            pixels: Specifies the pixels contained in the catalog. Can be either a DataFrame with
+                columns `Norder`, `Dir`, and `Npix` matching a `partition_info.csv` file, or a
+                PartitionInfo object
+            catalog_path: If the catalog is stored on disk, specify the location of the catalog.
+                Does not load the catalog from this path, only stores it as metadata
+        """
+        if catalog_info.catalog_type not in self.HIPS_CATALOG_TYPES:
+            raise ValueError(
+                f"Catalog info `catalog_type` must be one of "
+                f"{', '.join([t.value for t in self.HIPS_CATALOG_TYPES])}"
             )
-        self.catalog_info = read_from_metadata_file(catalog_info_file)
-        self.catalog_name = self.catalog_info.catalog_name
-        self.catalog_type = self.catalog_info.catalog_type
+        super().__init__(catalog_info, catalog_path)
+        self.partition_info = self._get_partition_info_from_pixels(pixels)
+        self.pixel_tree = self._get_pixel_tree_from_pixels(pixels)
 
-        if self.catalog_type in ("object", "source"):
-            self.partition_info = PartitionInfo(self.catalog_base_dir)
+    @staticmethod
+    def _get_partition_info_from_pixels(pixels: PixelInputTypes) -> PartitionInfo:
+        if isinstance(pixels, PartitionInfo):
+            return pixels
+        if isinstance(pixels, pd.DataFrame):
+            return PartitionInfo(pixels)
+        raise TypeError("Pixels must be of type PartitionInfo or Dataframe")
+
+    @staticmethod
+    def _get_pixel_tree_from_pixels(pixels: PixelInputTypes) -> PixelTree:
+        if isinstance(pixels, PartitionInfo):
+            return PixelTreeBuilder.from_partition_info_df(pixels.data_frame)
+        if isinstance(pixels, pd.DataFrame):
+            return PixelTreeBuilder.from_partition_info_df(pixels)
+        raise TypeError("Pixels must be of type PartitionInfo or Dataframe")
 
     def get_pixels(self):
         """Get all healpix pixels that are contained in the catalog
@@ -52,3 +80,19 @@ def get_pixels(self):
             - num_objects: the number of rows in the pixel's partition
         """
         return self.partition_info.data_frame
+
+    @classmethod
+    def _read_args(cls, catalog_base_dir: FilePointer) -> Tuple[CatalogInfoClass, PartitionInfo]:
+        args = super()._read_args(catalog_base_dir)
+        partition_info_file = paths.get_partition_info_pointer(catalog_base_dir)
+        partition_info = PartitionInfo.read_from_file(partition_info_file)
+        return args + (partition_info,)
+
+    @classmethod
+    def _check_files_exist(cls, catalog_base_dir: FilePointer):
+        super()._check_files_exist(catalog_base_dir)
+        partition_info_file = paths.get_partition_info_pointer(catalog_base_dir)
+        if not file_io.does_file_or_directory_exist(partition_info_file):
+            raise FileNotFoundError(
+                f"No partition info found where expected: {str(partition_info_file)}"
+            )
diff --git a/src/hipscat/catalog/catalog_info.py b/src/hipscat/catalog/catalog_info.py
@@ -0,0 +1,13 @@
+from dataclasses import dataclass
+
+from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo
+
+
+@dataclass
+class CatalogInfo(BaseCatalogInfo):
+    """Catalog Info for a HEALPix Hive partitioned Catalog"""
+    epoch: str = "J2000"
+    ra_column: str = "ra"
+    dec_column: str = "dec"
+
+    required_fields = BaseCatalogInfo.required_fields + ["epoch", "ra_column", "dec_column"]
diff --git a/src/hipscat/catalog/catalog_type.py b/src/hipscat/catalog/catalog_type.py
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+class CatalogType(str, Enum):
+    """Enum for possible types of catalog"""
+
+    OBJECT = "object"
+    SOURCE = "source"
+    ASSOCIATION = "association"
+    INDEX = "index"
+    MARGIN = "margin"
diff --git a/src/hipscat/catalog/dataset/base_catalog_info.py b/src/hipscat/catalog/dataset/base_catalog_info.py
@@ -0,0 +1,59 @@
+import dataclasses
+from dataclasses import dataclass
+
+from typing_extensions import Self
+
+from hipscat.catalog.catalog_type import CatalogType
+from hipscat.io import FilePointer, file_io
+
+
+@dataclass
+class BaseCatalogInfo:
+    """Container class for catalog metadata"""
+
+    catalog_name: str = ""
+    catalog_type: CatalogType = None
+    total_rows: int = None
+
+    CATALOG_TYPES = [t.value for t in CatalogType]
+
+    required_fields = ["catalog_type"]
+
+    def __post_init__(
+        self,
+    ):
+        self._check_required_fields()
+        if self.catalog_type not in self.CATALOG_TYPES:
+            raise ValueError(f"Unknown catalog type: {self.catalog_type}")
+
+    def __str__(self):
+        parameters = dataclasses.asdict(self)
+        formatted_string = ""
+        for name, value in parameters.items():
+            formatted_string += f"  {name} {value}\n"
+        return formatted_string
+
+    @classmethod
+    def read_from_metadata_file(cls, catalog_info_file: FilePointer) -> Self:
+        """Read catalog info from the `catalog_info.json` metadata file
+
+        Args:
+            catalog_info_file: FilePointer pointing to the `catalog_info.json` file
+
+        Returns:
+            A CatalogInfo object with the data from the `catalog_info.json` file
+        """
+        metadata_keywords = file_io.load_json_file(catalog_info_file)
+        catalog_info_keywords = {}
+        for field in dataclasses.fields(cls):
+            if field.name in metadata_keywords:
+                catalog_info_keywords[field.name] = metadata_keywords[field.name]
+        return cls(**catalog_info_keywords)
+
+    def _check_required_fields(self):
+        fields_dict = dataclasses.asdict(self)
+        for field_name in self.required_fields:
+            if field_name not in fields_dict or fields_dict[field_name] is None:
+                raise ValueError(
+                    f"{field_name} is required in the Catalog Info and a value must be provided"
+                )
diff --git a/src/hipscat/catalog/dataset/dataset.py b/src/hipscat/catalog/dataset/dataset.py
@@ -0,0 +1,72 @@
+from typing import Tuple, Type
+
+from typing_extensions import Self
+
+from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo
+from hipscat.io import FilePointer, file_io, paths
+
+
+class Dataset:
+    """A base HiPSCat dataset
+
+    A base dataset contains a catalog_info metadata file and the data contained in parquet files
+    """
+    CatalogInfoClass: Type[BaseCatalogInfo] = BaseCatalogInfo
+
+    def __init__(
+        self,
+        catalog_info: CatalogInfoClass,
+        catalog_path=None,
+    ) -> None:
+        """Initializes a Dataset
+
+        Args:
+            catalog_info: A catalog_info object with the catalog metadata
+            catalog_path: If the catalog is stored on disk, specify the location of the catalog.
+                Does not load the catalog from this path, only stores it as metadata
+        """
+        if not isinstance(catalog_info, self.CatalogInfoClass):
+            raise TypeError(f"catalog_info type must be {self.CatalogInfoClass}")
+
+        self.catalog_info = catalog_info
+        self.catalog_name = self.catalog_info.catalog_name
+
+        self.catalog_path = catalog_path
+        self.on_disk = catalog_path is not None
+        self.catalog_base_dir = file_io.get_file_pointer_from_path(self.catalog_path)
+
+    @classmethod
+    def read_from_hipscat(cls, catalog_path: str) -> Self:
+        """Reads a HiPSCat Catalog from a HiPSCat directory
+
+        Args:
+            catalog_path: path to the root directory of the catalog
+
+        Returns:
+            The initialized catalog object
+        """
+        catalog_base_dir = file_io.get_file_pointer_from_path(catalog_path)
+        cls._check_files_exist(catalog_base_dir)
+        args = cls._read_args(catalog_base_dir)
+        kwargs = cls._read_kwargs(catalog_base_dir)
+        return cls(*args, **kwargs)
+
+    @classmethod
+    def _read_args(cls, catalog_base_dir: FilePointer) -> Tuple[CatalogInfoClass]:
+        catalog_info_file = paths.get_catalog_info_pointer(catalog_base_dir)
+        catalog_info = cls.CatalogInfoClass.read_from_metadata_file(catalog_info_file)
+        return (catalog_info,)
+
+    @classmethod
+    def _read_kwargs(cls, catalog_base_dir: FilePointer) -> dict:
+        return {"catalog_path": str(catalog_base_dir)}
+
+    @classmethod
+    def _check_files_exist(cls, catalog_base_dir: FilePointer):
+        if not file_io.does_file_or_directory_exist(catalog_base_dir):
+            raise FileNotFoundError(f"No directory exists at {str(catalog_base_dir)}")
+        catalog_info_file = paths.get_catalog_info_pointer(catalog_base_dir)
+        if not file_io.does_file_or_directory_exist(catalog_info_file):
+            raise FileNotFoundError(
+                f"No catalog info found where expected: {str(catalog_info_file)}"
+            )
diff --git a/src/hipscat/catalog/partition_info.py b/src/hipscat/catalog/partition_info.py
@@ -2,7 +2,9 @@
 
 from typing import List
 
-from hipscat.io import file_io, paths
+import pandas as pd
+
+from hipscat.io import FilePointer, file_io
 from hipscat.pixel_math import HealpixPixel
 
 
@@ -13,16 +15,8 @@ class PartitionInfo:
     METADATA_DIR_COLUMN_NAME = "Dir"
     METADATA_PIXEL_COLUMN_NAME = "Npix"
 
-    def __init__(self, catalog_base_dir: file_io.FilePointer) -> None:
-        self.catalog_base_dir = catalog_base_dir
-
-        partition_info_pointer = paths.get_partition_info_pointer(self.catalog_base_dir)
-        if not file_io.does_file_or_directory_exist(partition_info_pointer):
-            raise FileNotFoundError(
-                f"No partition info found where expected: {str(partition_info_pointer)}"
-            )
-
-        self.data_frame = file_io.load_csv_to_pandas(partition_info_pointer)
+    def __init__(self, pixels: pd.DataFrame) -> None:
+        self.data_frame = pixels
 
     def get_healpix_pixels(self) -> List[HealpixPixel]:
         """Get healpix pixel objects for all pixels represented as partitions.
@@ -38,20 +32,20 @@ def get_healpix_pixels(self) -> List[HealpixPixel]:
             )
         ]
 
-    def get_file_names(self):
-        """Get file handles for all partition files in the catalog
+    @classmethod
+    def read_from_file(cls, partition_info_file: FilePointer):
+        """Read partition info from a `partition_info.csv` file to create an object
+
+        Args:
+            partition_info_file: FilePointer to the `partition_info.csv` file
 
         Returns:
-            one-dimensional array of strings, where each string is a partition file
+            A `PartitionInfo` object with the data from the file
         """
-        file_names = []
-        for _, partition in self.data_frame.iterrows():
-            file_names.append(
-                paths.pixel_catalog_file(
-                    self.catalog_base_dir,
-                    partition[self.METADATA_ORDER_COLUMN_NAME],
-                    partition[self.METADATA_PIXEL_COLUMN_NAME],
-                )
+        if not file_io.does_file_or_directory_exist(partition_info_file):
+            raise FileNotFoundError(
+                f"No partition info found where expected: {str(partition_info_file)}"
             )
 
-        return file_names
+        data_frame = file_io.load_csv_to_pandas(partition_info_file)
+        return cls(data_frame)
diff --git a/src/hipscat/pixel_tree/pixel_tree_builder.py b/src/hipscat/pixel_tree/pixel_tree_builder.py
@@ -4,7 +4,7 @@
 
 import pandas as pd
 
-from hipscat.catalog import PartitionInfo
+from hipscat.catalog.partition_info import PartitionInfo
 from hipscat.pixel_tree.pixel_node import PixelNode
 from hipscat.pixel_tree.pixel_node_type import PixelNodeType
 from hipscat.pixel_tree.pixel_tree import PixelTree

diff --git a/tests/data/catalog/catalog_info.json b/tests/data/catalog/catalog_info.json
@@ -0,0 +1,8 @@
+{
+  "catalog_name": "test_name",
+  "catalog_type": "object",
+  "epoch": "J2000",
+  "ra_column": "ra",
+  "dec_column": "dec",
+  "total_rows": 10
+}
diff --git a/tests/data/dataset/catalog_info.json b/tests/data/dataset/catalog_info.json
@@ -0,0 +1,5 @@
+{
+  "catalog_name": "test_name",
+  "catalog_type": "object",
+  "total_rows": 10
+}
diff --git a/tests/data/small_sky/catalog_info.json b/tests/data/small_sky/catalog_info.json
@@ -1,5 +1,6 @@
 {
     "catalog_name": "small_sky",
+    "catalog_type": "source",
     "version": "0.0.1",
     "generation_date": "2022.12.20",
     "epoch": "J2000",

diff --git a/tests/data/small_sky_order1/catalog_info.json b/tests/data/small_sky_order1/catalog_info.json
@@ -1,5 +1,6 @@
 {
     "catalog_name": "small_sky_order1",
+    "catalog_type": "source",
     "version": "0.0.0",
     "generation_date": "2022.12.21",
     "epoch": "J2000",