Skip to content

Commit

Permalink
Merge pull request #101 from astronomy-commons/delucchi/catalog_info
Browse files Browse the repository at this point in the history
Delucchi/catalog info
  • Loading branch information
delucchi-cmu authored Jun 7, 2023
2 parents acbbbae + 6e61047 commit 4bfbfda
Show file tree
Hide file tree
Showing 17 changed files with 491 additions and 12 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from dataclasses import dataclass

from hipscat.catalog.catalog_type import CatalogType
from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo


Expand All @@ -8,11 +9,21 @@ class AssociationCatalogInfo(BaseCatalogInfo):
"""Catalog Info for a HiPSCat Association Catalog"""

primary_catalog: str = None
"""Catalog name for the primary (left) side of association"""

primary_column: str = None
"""Column name in the primary (left) side of join"""

join_catalog: str = None
"""Catalog name for the joining (right) side of association"""

join_column: str = None
"""Column name in the joining (right) side of join"""

required_fields = BaseCatalogInfo.required_fields + [
"primary_catalog",
"join_catalog",
]

DEFAULT_TYPE = CatalogType.ASSOCIATION
REQUIRED_TYPE = CatalogType.ASSOCIATION
10 changes: 9 additions & 1 deletion src/hipscat/catalog/catalog_info.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
from dataclasses import dataclass

from hipscat.catalog.catalog_type import CatalogType
from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo


@dataclass
class CatalogInfo(BaseCatalogInfo):
    """Catalog Info for a HEALPix Hive partitioned Catalog.

    Adds three string fields on top of ``BaseCatalogInfo`` and lists all three
    in ``required_fields``.
    """

    # Each field carries a default, so none of them has to be passed explicitly.
    epoch: str = "J2000"
    ra_column: str = "ra"
    dec_column: str = "dec"

    # Assigned exactly once: a second, identical assignment would only be a
    # dead store. Un-annotated, so this is a class attribute and not a
    # dataclass field.
    required_fields = BaseCatalogInfo.required_fields + [
        "epoch",
        "ra_column",
        "dec_column",
    ]

    # Only a default type is declared in this class; REQUIRED_TYPE is not
    # overridden here.
    DEFAULT_TYPE = CatalogType.OBJECT
5 changes: 5 additions & 0 deletions src/hipscat/catalog/catalog_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,8 @@ class CatalogType(str, Enum):
ASSOCIATION = "association"
INDEX = "index"
MARGIN = "margin"

@classmethod
def all_types(cls):
    """Return the ``value`` of every member of this enum as a list."""
    values = [member.value for member in cls]
    return values
18 changes: 13 additions & 5 deletions src/hipscat/catalog/dataset/base_catalog_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,23 @@ class BaseCatalogInfo:
catalog_type: CatalogType = None
total_rows: int = None

CATALOG_TYPES = [t.value for t in CatalogType]
DEFAULT_TYPE = None
"""The default catalog type for this catalog info type. To be overridden by subclasses.
If specified, we will use this value when no catalog_type is provided."""

REQUIRED_TYPE = None
"""The required catalog type for this catalog info type. To be overridden by subclasses.
If specified, the catalog MUST have this type."""

required_fields = ["catalog_type"]

def __post_init__(
self,
):
def __post_init__(self):
if not self.catalog_type and self.DEFAULT_TYPE:
self.catalog_type = self.DEFAULT_TYPE
elif self.REQUIRED_TYPE and self.catalog_type != self.REQUIRED_TYPE:
raise ValueError(f"Catalog must have type {self.REQUIRED_TYPE}")
self._check_required_fields()
if self.catalog_type not in self.CATALOG_TYPES:
if self.catalog_type not in CatalogType.all_types():
raise ValueError(f"Unknown catalog type: {self.catalog_type}")

def __str__(self):
Expand Down
1 change: 1 addition & 0 deletions src/hipscat/catalog/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class Dataset:
A base dataset contains a catalog_info metadata file and the data contained in parquet files
"""

CatalogInfoClass: Type[BaseCatalogInfo] = BaseCatalogInfo

def __init__(
Expand Down
29 changes: 29 additions & 0 deletions src/hipscat/catalog/index/index_catalog_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Catalog Info for a HiPSCat Index table"""

from dataclasses import dataclass, field
from typing import List

from hipscat.catalog.catalog_type import CatalogType
from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo


@dataclass
class IndexCatalogInfo(BaseCatalogInfo):
    """Catalog Info describing a HiPSCat Index table."""

    # Un-annotated names are plain class attributes, not dataclass fields.
    REQUIRED_TYPE = CatalogType.INDEX
    DEFAULT_TYPE = CatalogType.INDEX

    # Dataclass fields: declaration order is kept as-is because it determines
    # the positional order of the generated ``__init__``.
    primary_catalog: str = None
    """Reference to the object or source catalog"""

    indexing_column: str = None
    """The column this table provides an index over"""

    extra_columns: List[str] = field(default_factory=list)
    """Additional payload columns included in the index, if any"""

    # The base class's required names, extended with the two index-specific ones.
    required_fields = [
        *BaseCatalogInfo.required_fields,
        "primary_catalog",
        "indexing_column",
    ]
25 changes: 25 additions & 0 deletions src/hipscat/catalog/margin_cache/margin_cache_catalog_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Catalog Info for a HiPSCat Margin cache table"""

from dataclasses import dataclass

from hipscat.catalog.catalog_type import CatalogType
from hipscat.catalog.dataset.base_catalog_info import BaseCatalogInfo


@dataclass
class MarginCacheCatalogInfo(BaseCatalogInfo):
    """Catalog Info describing a HiPSCat Margin Cache table."""

    # Un-annotated names are plain class attributes, not dataclass fields.
    REQUIRED_TYPE = CatalogType.MARGIN
    DEFAULT_TYPE = CatalogType.MARGIN

    # Dataclass fields: declaration order is kept as-is because it determines
    # the positional order of the generated ``__init__``.
    primary_catalog: str = None
    """Reference to the object or source catalog"""

    margin_threshold: float = None
    """Pixel-boundary threshold, expressed in arcseconds."""

    # The base class's required names, extended with the two margin-specific ones.
    required_fields = [
        *BaseCatalogInfo.required_fields,
        "primary_catalog",
        "margin_threshold",
    ]
34 changes: 34 additions & 0 deletions src/hipscat/catalog/source_catalog/source_catalog_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Catalog Info for a HiPSCat Source (detection/timeseries) table"""

from dataclasses import dataclass

from hipscat.catalog.catalog_info import CatalogInfo
from hipscat.catalog.catalog_type import CatalogType


@dataclass
class SourceCatalogInfo(CatalogInfo):
    """Catalog Info describing a HiPSCat Source (detection/timeseries) table.

    Extends ``CatalogInfo`` with optional timeseries-level column names.
    """

    # Un-annotated names are plain class attributes, not dataclass fields.
    REQUIRED_TYPE = CatalogType.SOURCE
    DEFAULT_TYPE = CatalogType.SOURCE

    # Dataclass fields: declaration order is kept as-is because it determines
    # the positional order of the generated ``__init__``.
    primary_catalog: str = None
    """Reference to the object catalog"""

    mjd_column: str = ""
    """Name of the column holding the time of observation"""

    band_column: str = ""
    """Name of the column holding the photometric band"""

    mag_column: str = ""
    """Name of the column holding the magnitude measurement"""

    mag_err_column: str = ""
    """Name of the column holding the error in the magnitude measurement"""

    ## NB: No additional required columns.
18 changes: 17 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,11 @@ def small_sky_to_small_sky_order1_dir(test_data_dir):
@pytest.fixture
def assert_catalog_info_matches_dict():
def assert_match(catalog_info: BaseCatalogInfo, dictionary: dict):
"""Check that all members of the catalog_info object match dictionary
elements, where specified."""
catalog_info_dict = dataclasses.asdict(catalog_info)
assert catalog_info_dict == dictionary
for key, value in dictionary.items():
assert catalog_info_dict[key] == value

return assert_match

Expand Down Expand Up @@ -135,6 +138,19 @@ def association_catalog_path(test_data_dir) -> str:
def association_catalog_info_file(association_catalog_path) -> str:
return os.path.join(association_catalog_path, "catalog_info.json")

@pytest.fixture
def index_catalog_info_file(test_data_dir) -> str:
    """Path of ``catalog_info.json`` inside the ``index_catalog`` test data directory."""
    catalog_dir = os.path.join(test_data_dir, "index_catalog")
    return os.path.join(catalog_dir, "catalog_info.json")

@pytest.fixture
def margin_cache_catalog_info_file(test_data_dir) -> str:
    """Path of ``catalog_info.json`` inside the ``margin_cache`` test data directory."""
    catalog_dir = os.path.join(test_data_dir, "margin_cache")
    return os.path.join(catalog_dir, "catalog_info.json")


@pytest.fixture
def source_catalog_info_file(test_data_dir) -> str:
    """Path of ``catalog_info.json`` inside the ``small_sky_source`` test data directory."""
    catalog_dir = os.path.join(test_data_dir, "small_sky_source")
    return os.path.join(catalog_dir, "catalog_info.json")


@pytest.fixture
def association_catalog_info(association_catalog_info_data) -> AssociationCatalogInfo:
Expand Down
7 changes: 7 additions & 0 deletions tests/data/index_catalog/catalog_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"catalog_name": "test_index",
"catalog_type": "index",
"total_rows": 100,
"primary_catalog": "test_name",
"indexing_column": "id"
}
7 changes: 7 additions & 0 deletions tests/data/margin_cache/catalog_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"catalog_name": "test_margin",
"catalog_type": "margin",
"total_rows": 100,
"primary_catalog": "test_name",
"margin_threshold": 0.5
}
13 changes: 13 additions & 0 deletions tests/data/small_sky_source/catalog_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"catalog_name": "small_sky_source_catalog",
"catalog_type": "source",
"total_rows": 17161,
"epoch": "J2000",
"ra_column": "source_ra",
"dec_column": "source_dec",
"primary_catalog": "small_sky",
"mjd_column": "mjd",
"band_column": "band",
"mag_column": "mag",
"mag_err_column": ""
}
15 changes: 15 additions & 0 deletions tests/data/small_sky_source/partition_info.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Norder,Dir,Npix,num_rows
0,0,4,50
1,0,47,2395
2,0,176,385
2,0,177,1510
2,0,178,1634
2,0,179,1773
2,0,180,655
2,0,181,903
2,0,182,1246
2,0,183,1143
2,0,184,1390
2,0,185,2942
2,0,186,452
2,0,187,683
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

import pytest

from hipscat.catalog.association_catalog.association_catalog_info import (
AssociationCatalogInfo,
)
from hipscat.catalog.association_catalog.association_catalog_info import \
AssociationCatalogInfo
from hipscat.catalog.catalog_type import CatalogType
from hipscat.io import file_io


Expand Down Expand Up @@ -46,10 +46,28 @@ def test_read_from_file(


def test_required_fields_missing(association_catalog_info_data):
for required_field in ["primary_catalog", "join_catalog"]:
required_fields = ["primary_catalog", "join_catalog"]
for required_field in required_fields:
assert required_field in AssociationCatalogInfo.required_fields
for field in AssociationCatalogInfo.required_fields:
for field in required_fields:
init_data = association_catalog_info_data.copy()
init_data[field] = None
with pytest.raises(ValueError, match=field):
AssociationCatalogInfo(**init_data)


def test_type_missing(association_catalog_info_data):
    """With ``catalog_type`` set to None, the built object reports ASSOCIATION."""
    # Work on a fresh mapping so the fixture data itself is left untouched.
    kwargs = {**association_catalog_info_data, "catalog_type": None}
    info = AssociationCatalogInfo(**kwargs)
    assert info.catalog_type == CatalogType.ASSOCIATION


def test_wrong_type(association_catalog_info_data, catalog_info_data):
    """Check that mismatched input is rejected by ``AssociationCatalogInfo``.

    Two cases are covered:

    * constructing from ``catalog_info_data`` must raise ``TypeError`` with a
      message matching "unexpected" (presumably because that fixture carries
      keyword arguments this class does not accept -- verify against the
      fixture definition);
    * constructing from otherwise valid association data whose
      ``catalog_type`` is ``CatalogType.OBJECT`` must raise ``ValueError``
      with a message matching "type association".
    """
    with pytest.raises(TypeError, match="unexpected"):
        AssociationCatalogInfo(**catalog_info_data)

    # Prepare the input outside the ``raises`` block so that only the
    # constructor call is guarded; a ValueError raised during setup must not
    # be able to satisfy the expectation.
    init_data = association_catalog_info_data.copy()
    init_data["catalog_type"] = CatalogType.OBJECT
    with pytest.raises(ValueError, match="type association"):
        AssociationCatalogInfo(**init_data)
Loading

0 comments on commit 4bfbfda

Please sign in to comment.