Skip to content

Commit

Permalink
Merge pull request #85 from astronomy-commons/sean/catalog-refactor
Browse files Browse the repository at this point in the history
Refactor Catalogs to inherit from base Dataset and not initialize from files
  • Loading branch information
smcguire-cmu authored Apr 24, 2023
2 parents 18c389d + e8581e4 commit bdde9aa
Show file tree
Hide file tree
Showing 21 changed files with 543 additions and 81 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ dependencies = [
"setuptools_scm",
"pyarrow",
"astropy",
"regions"
"regions",
"typing-extensions"
]

# On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)
Expand Down
1 change: 1 addition & 0 deletions src/hipscat/catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@

from .catalog import Catalog
from .catalog_parameters import CatalogParameters
from .catalog_type import CatalogType
from .partition_info import PartitionInfo
100 changes: 72 additions & 28 deletions src/hipscat/catalog/catalog.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,71 @@
"""Container class to hold catalog metadata and partition iteration"""
from __future__ import annotations

from typing import Tuple, Union

from hipscat.catalog.catalog_parameters import read_from_metadata_file
from hipscat.catalog.partition_info import PartitionInfo
from hipscat.io import file_io, paths
import pandas as pd

from hipscat.catalog.catalog_info import CatalogInfo
from hipscat.catalog.catalog_type import CatalogType
from hipscat.catalog.dataset.dataset import Dataset
from hipscat.catalog.partition_info import PartitionInfo
from hipscat.io import FilePointer, file_io, paths
from hipscat.pixel_tree.pixel_tree import PixelTree
from hipscat.pixel_tree.pixel_tree_builder import PixelTreeBuilder

class Catalog:
"""Container class for catalog metadata"""

def __init__(self, catalog_path: str = None) -> None:
self.catalog_path = catalog_path
self.catalog_base_dir = file_io.get_file_pointer_from_path(catalog_path)
self.metadata_keywords = None
class Catalog(Dataset):
"""A HiPSCat Catalog with data stored in a HEALPix Hive partitioned structure
self.partition_info = None
self.catalog_info = None
Catalogs of this type are partitioned spatially, contain `partition_info` metadata specifying
the pixels in Catalog, and on disk conform to the parquet partitioning structure
`Norder=/Dir=/Npix=.parquet`
"""

self.catalog_name = None
self.catalog_type = None
CatalogInfoClass = CatalogInfo
PixelInputTypes = Union[pd.DataFrame, PartitionInfo]
HIPS_CATALOG_TYPES = [CatalogType.OBJECT, CatalogType.SOURCE, CatalogType.MARGIN]

self._initialize_metadata()
def __init__(
self,
catalog_info: CatalogInfoClass,
pixels: PixelInputTypes,
catalog_path=None,
) -> None:
"""Initializes a Catalog
def _initialize_metadata(self):
if not file_io.does_file_or_directory_exist(self.catalog_base_dir):
raise FileNotFoundError(
f"No directory exists at {str(self.catalog_base_dir)}"
)
catalog_info_file = paths.get_catalog_info_pointer(self.catalog_base_dir)
if not file_io.does_file_or_directory_exist(catalog_info_file):
raise FileNotFoundError(
f"No catalog info found where expected: {str(catalog_info_file)}"
Args:
catalog_info: CatalogInfo object with catalog metadata
pixels: Specifies the pixels contained in the catalog. Can be either a Dataframe with
columns `Norder`, `Dir`, and `Npix` matching a `partition_info.csv` file, or a
PartitionInfo object
catalog_path: If the catalog is stored on disk, specify the location of the catalog
Does not load the catalog from this path, only store as metadata
"""
if catalog_info.catalog_type not in self.HIPS_CATALOG_TYPES:
raise ValueError(
f"Catalog info `catalog_type` must be one of "
f"{', '.join([t.value for t in self.HIPS_CATALOG_TYPES])}"
)
self.catalog_info = read_from_metadata_file(catalog_info_file)
self.catalog_name = self.catalog_info.catalog_name
self.catalog_type = self.catalog_info.catalog_type
super().__init__(catalog_info, catalog_path)
self.partition_info = self._get_partition_info_from_pixels(pixels)
self.pixel_tree = self._get_pixel_tree_from_pixels(pixels)

if self.catalog_type in ("object", "source"):
self.partition_info = PartitionInfo(self.catalog_base_dir)
@staticmethod
def _get_partition_info_from_pixels(pixels: PixelInputTypes) -> PartitionInfo:
    """Normalize the `pixels` constructor argument into a PartitionInfo

    Args:
        pixels: either a PartitionInfo, which is returned unchanged, or a pandas
            DataFrame, which is wrapped in a new PartitionInfo
    Returns:
        A PartitionInfo describing the given pixels
    Raises:
        TypeError: if `pixels` is neither a PartitionInfo nor a DataFrame
    """
    # Reject unsupported inputs up front so the happy path is a single expression.
    if not isinstance(pixels, (PartitionInfo, pd.DataFrame)):
        raise TypeError("Pixels must be of type PartitionInfo or Dataframe")
    return pixels if isinstance(pixels, PartitionInfo) else PartitionInfo(pixels)

@staticmethod
def _get_pixel_tree_from_pixels(pixels: PixelInputTypes) -> PixelTree:
    """Build the catalog's PixelTree from the `pixels` constructor argument

    Args:
        pixels: either a PartitionInfo, whose `data_frame` is used, or a pandas
            DataFrame, which is used directly
    Returns:
        The PixelTree produced by `PixelTreeBuilder.from_partition_info_df`
    Raises:
        TypeError: if `pixels` is neither a PartitionInfo nor a DataFrame
    """
    # Resolve both accepted input types to a DataFrame, then build once.
    if isinstance(pixels, PartitionInfo):
        partition_frame = pixels.data_frame
    elif isinstance(pixels, pd.DataFrame):
        partition_frame = pixels
    else:
        raise TypeError("Pixels must be of type PartitionInfo or Dataframe")
    return PixelTreeBuilder.from_partition_info_df(partition_frame)

def get_pixels(self):
"""Get all healpix pixels that are contained in the catalog
Expand All @@ -52,3 +80,19 @@ def get_pixels(self):
- num_objects: the number of rows in the pixel's partition
"""
return self.partition_info.data_frame

@classmethod
def _read_args(cls, catalog_base_dir: FilePointer) -> Tuple[CatalogInfoClass, PartitionInfo]:
    """Read the positional constructor arguments for a catalog stored on disk

    Extends the parent implementation's tuple with the partition info read from
    the catalog's partition info file.

    Args:
        catalog_base_dir: FilePointer to the root directory of the catalog
    Returns:
        The parent's arguments followed by the PartitionInfo read from file
    """
    inherited_args = super()._read_args(catalog_base_dir)
    partition_info = PartitionInfo.read_from_file(
        paths.get_partition_info_pointer(catalog_base_dir)
    )
    return inherited_args + (partition_info,)

@classmethod
def _check_files_exist(cls, catalog_base_dir: FilePointer):
    """Verify that the files needed to read this catalog are present

    Runs the parent class checks first, then checks for the partition info file.

    Args:
        catalog_base_dir: FilePointer to the root directory of the catalog
    Raises:
        FileNotFoundError: if the partition info file does not exist (the parent
            check may raise for its own required files)
    """
    super()._check_files_exist(catalog_base_dir)
    partition_info_file = paths.get_partition_info_pointer(catalog_base_dir)
    if file_io.does_file_or_directory_exist(partition_info_file):
        return
    raise FileNotFoundError(
        f"No partition info found where expected: {str(partition_info_file)}"
    )
13 changes: 13 additions & 0 deletions src/hipscat/catalog/catalog_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from dataclasses import dataclass

from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo


@dataclass
class CatalogInfo(BaseCatalogInfo):
    """Catalog Info for a HEALPix Hive partitioned Catalog

    Adds the `epoch`, `ra_column` and `dec_column` metadata fields to the base
    catalog info and marks all three as required.
    """

    epoch: str = "J2000"
    ra_column: str = "ra"
    dec_column: str = "dec"

    # Un-annotated on purpose: a plain class attribute, not a dataclass field.
    # A new list is built so the parent's `required_fields` is left untouched.
    required_fields = [
        *BaseCatalogInfo.required_fields,
        "epoch",
        "ra_column",
        "dec_column",
    ]
11 changes: 11 additions & 0 deletions src/hipscat/catalog/catalog_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from enum import Enum


class CatalogType(str, Enum):
    """Enum for possible types of catalog

    The `str` mix-in makes every member compare equal to its plain string value,
    so a `catalog_type` given as a bare string (e.g. "object") can be compared
    against, or looked up among, the members directly.
    """

    # Member values are the lowercase strings used for `catalog_type`.
    OBJECT = "object"
    SOURCE = "source"
    ASSOCIATION = "association"
    INDEX = "index"
    MARGIN = "margin"
59 changes: 59 additions & 0 deletions src/hipscat/catalog/dataset/base_catalog_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import dataclasses
from dataclasses import dataclass

from typing_extensions import Self

from hipscat.catalog.catalog_type import CatalogType
from hipscat.io import FilePointer, file_io


@dataclass
class BaseCatalogInfo:
    """Container class for catalog metadata

    Instances are validated on construction: every name in `required_fields`
    must have a non-None value, and `catalog_type` must be one of the values
    of `CatalogType`.
    """

    catalog_name: str = ""
    catalog_type: CatalogType = None
    total_rows: int = None

    # The two attributes below carry no annotation, so they are ordinary class
    # attributes rather than dataclass fields.
    CATALOG_TYPES = [t.value for t in CatalogType]

    required_fields = ["catalog_type"]

    def __post_init__(self):
        """Validate the freshly initialized fields

        Raises:
            ValueError: if a required field is missing or `catalog_type` is not
                a known catalog type
        """
        self._check_required_fields()
        if self.catalog_type not in self.CATALOG_TYPES:
            raise ValueError(f"Unknown catalog type: {self.catalog_type}")

    def __str__(self):
        """Format every field as one ` name value` line"""
        return "".join(
            f" {name} {value}\n" for name, value in dataclasses.asdict(self).items()
        )

    @classmethod
    def read_from_metadata_file(cls, catalog_info_file: FilePointer) -> Self:
        """Read catalog info from the `catalog_info.json` metadata file

        Keys in the file that do not match a dataclass field of `cls` are
        ignored; fields absent from the file keep their default values.

        Args:
            catalog_info_file: FilePointer pointing to the `catalog_info.json` file
        Returns:
            A CatalogInfo object with the data from the `catalog_info.json` file
        """
        metadata_keywords = file_io.load_json_file(catalog_info_file)
        recognized_keywords = {
            field.name: metadata_keywords[field.name]
            for field in dataclasses.fields(cls)
            if field.name in metadata_keywords
        }
        return cls(**recognized_keywords)

    def _check_required_fields(self):
        """Ensure each name in `required_fields` has a non-None value

        Raises:
            ValueError: for the first required field that is absent or None
        """
        current_values = dataclasses.asdict(self)
        for field_name in self.required_fields:
            # `.get` covers both cases: name not a field at all, or field set to None.
            if current_values.get(field_name) is None:
                raise ValueError(
                    f"{field_name} is required in the Catalog Info and a value must be provided"
                )
72 changes: 72 additions & 0 deletions src/hipscat/catalog/dataset/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from typing import Tuple, Type

from typing_extensions import Self

from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo
from hipscat.io import FilePointer, file_io, paths


class Dataset:
    """A base HiPSCat dataset

    A base dataset contains a catalog_info metadata file and the data contained in parquet files
    """

    # Subclasses can point this at a more specific catalog info class; it drives
    # both the type check in `__init__` and the reader used by `_read_args`.
    CatalogInfoClass: Type[BaseCatalogInfo] = BaseCatalogInfo

    def __init__(self, catalog_info: CatalogInfoClass, catalog_path=None) -> None:
        """Initializes a Dataset

        Args:
            catalog_info: A catalog_info object with the catalog metadata
            catalog_path: If the catalog is stored on disk, specify the location of the catalog
                Does not load the catalog from this path, only store as metadata
        Raises:
            TypeError: if `catalog_info` is not an instance of `CatalogInfoClass`
        """
        expected_type = self.CatalogInfoClass
        if not isinstance(catalog_info, expected_type):
            raise TypeError(f"catalog_info type must be {expected_type}")

        self.catalog_info = catalog_info
        self.catalog_name = catalog_info.catalog_name

        self.catalog_path = catalog_path
        # A dataset counts as "on disk" whenever any path was supplied.
        self.on_disk = catalog_path is not None
        self.catalog_base_dir = file_io.get_file_pointer_from_path(catalog_path)

    @classmethod
    def read_from_hipscat(cls, catalog_path: str) -> Self:
        """Reads a HiPSCat Catalog from a HiPSCat directory

        Checks that the expected files exist, then builds the object from the
        positional and keyword arguments gathered by `_read_args` and `_read_kwargs`.

        Args:
            catalog_path: path to the root directory of the catalog
        Returns:
            The initialized catalog object
        """
        base_dir = file_io.get_file_pointer_from_path(catalog_path)
        cls._check_files_exist(base_dir)
        positional = cls._read_args(base_dir)
        keywords = cls._read_kwargs(base_dir)
        return cls(*positional, **keywords)

    @classmethod
    def _read_args(cls, catalog_base_dir: FilePointer) -> Tuple[CatalogInfoClass]:
        """Read the positional constructor arguments from the catalog directory

        Args:
            catalog_base_dir: FilePointer to the root directory of the catalog
        Returns:
            A one-element tuple holding the catalog info read by `CatalogInfoClass`
        """
        info_pointer = paths.get_catalog_info_pointer(catalog_base_dir)
        return (cls.CatalogInfoClass.read_from_metadata_file(info_pointer),)

    @classmethod
    def _read_kwargs(cls, catalog_base_dir: FilePointer) -> dict:
        """Build the keyword constructor arguments for a dataset read from disk

        Args:
            catalog_base_dir: FilePointer to the root directory of the catalog
        Returns:
            A dict with `catalog_path` set to the string form of the base directory
        """
        return dict(catalog_path=str(catalog_base_dir))

    @classmethod
    def _check_files_exist(cls, catalog_base_dir: FilePointer):
        """Verify that the catalog directory and its catalog info file exist

        Args:
            catalog_base_dir: FilePointer to the root directory of the catalog
        Raises:
            FileNotFoundError: if the directory or the catalog info file is missing
        """
        exists = file_io.does_file_or_directory_exist
        if not exists(catalog_base_dir):
            raise FileNotFoundError(f"No directory exists at {str(catalog_base_dir)}")
        # Only look for the info file once the directory itself is known to exist.
        catalog_info_file = paths.get_catalog_info_pointer(catalog_base_dir)
        if not exists(catalog_info_file):
            raise FileNotFoundError(
                f"No catalog info found where expected: {str(catalog_info_file)}"
            )
40 changes: 17 additions & 23 deletions src/hipscat/catalog/partition_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

from typing import List

from hipscat.io import file_io, paths
import pandas as pd

from hipscat.io import FilePointer, file_io
from hipscat.pixel_math import HealpixPixel


Expand All @@ -13,16 +15,8 @@ class PartitionInfo:
METADATA_DIR_COLUMN_NAME = "Dir"
METADATA_PIXEL_COLUMN_NAME = "Npix"

def __init__(self, catalog_base_dir: file_io.FilePointer) -> None:
self.catalog_base_dir = catalog_base_dir

partition_info_pointer = paths.get_partition_info_pointer(self.catalog_base_dir)
if not file_io.does_file_or_directory_exist(partition_info_pointer):
raise FileNotFoundError(
f"No partition info found where expected: {str(partition_info_pointer)}"
)

self.data_frame = file_io.load_csv_to_pandas(partition_info_pointer)
def __init__(self, pixels: pd.DataFrame) -> None:
    """Wrap an in-memory table of partition pixels

    Args:
        pixels: DataFrame describing the partitions; it is stored by reference
            as `data_frame`, not copied.
            NOTE(review): presumably one row per partition with the `Norder`,
            `Dir` and `Npix` metadata columns - confirm against callers.
    """
    self.data_frame = pixels

def get_healpix_pixels(self) -> List[HealpixPixel]:
"""Get healpix pixel objects for all pixels represented as partitions.
Expand All @@ -38,20 +32,20 @@ def get_healpix_pixels(self) -> List[HealpixPixel]:
)
]

def get_file_names(self):
"""Get file handles for all partition files in the catalog
@classmethod
def read_from_file(cls, partition_info_file: FilePointer):
"""Read partition info from a `partition_info.csv` file to create an object
Args:
partition_info_file: FilePointer to the `partition_info.csv` file
Returns:
one-dimensional array of strings, where each string is a partition file
A `PartitionInfo` object with the data from the file
"""
file_names = []
for _, partition in self.data_frame.iterrows():
file_names.append(
paths.pixel_catalog_file(
self.catalog_base_dir,
partition[self.METADATA_ORDER_COLUMN_NAME],
partition[self.METADATA_PIXEL_COLUMN_NAME],
)
if not file_io.does_file_or_directory_exist(partition_info_file):
raise FileNotFoundError(
f"No partition info found where expected: {str(partition_info_file)}"
)

return file_names
data_frame = file_io.load_csv_to_pandas(partition_info_file)
return cls(data_frame)
2 changes: 1 addition & 1 deletion src/hipscat/pixel_tree/pixel_tree_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pandas as pd

from hipscat.catalog import PartitionInfo
from hipscat.catalog.partition_info import PartitionInfo
from hipscat.pixel_tree.pixel_node import PixelNode
from hipscat.pixel_tree.pixel_node_type import PixelNodeType
from hipscat.pixel_tree.pixel_tree import PixelTree
Expand Down
8 changes: 8 additions & 0 deletions tests/data/catalog/catalog_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"catalog_name": "test_name",
"catalog_type": "object",
"epoch": "J2000",
"ra_column": "ra",
"dec_column": "dec",
"total_rows": 10
}
5 changes: 5 additions & 0 deletions tests/data/dataset/catalog_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"catalog_name": "test_name",
"catalog_type": "object",
"total_rows": 10
}
1 change: 1 addition & 0 deletions tests/data/small_sky/catalog_info.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"catalog_name": "small_sky",
"catalog_type": "source",
"version": "0.0.1",
"generation_date": "2022.12.20",
"epoch": "J2000",
Expand Down
1 change: 1 addition & 0 deletions tests/data/small_sky_order1/catalog_info.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"catalog_name": "small_sky_order1",
"catalog_type": "source",
"version": "0.0.0",
"generation_date": "2022.12.21",
"epoch": "J2000",
Expand Down
Loading

0 comments on commit bdde9aa

Please sign in to comment.