Skip to content

Commit

Permalink
Merge pull request #44 from astronomy-commons/delucchi/issue/39
Browse files Browse the repository at this point in the history
Use hive formatting for directory/catalog path.
  • Loading branch information
delucchi-cmu authored Mar 13, 2023
2 parents 7476c6f + a3f1f40 commit 7695246
Show file tree
Hide file tree
Showing 16 changed files with 158 additions and 59 deletions.
1 change: 1 addition & 0 deletions src/hipscat/catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@

from .catalog import Catalog
from .catalog_parameters import CatalogParameters
from .partition_info import PartitionInfo
from .pixel_node import PixelNode
from .pixel_node_type import PixelNodeType
29 changes: 3 additions & 26 deletions src/hipscat/catalog/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
import json
import os

import pandas as pd

from hipscat.io import paths
from hipscat.catalog.partition_info import PartitionInfo


class Catalog:
Expand All @@ -29,16 +27,11 @@ def _initialize_metadata(self):
raise FileNotFoundError(
f"No catalog info found where expected: {metadata_filename}"
)
partition_info_filename = os.path.join(self.catalog_path, "partition_info.csv")
if not os.path.exists(partition_info_filename):
raise FileNotFoundError(
f"No partition info found where expected: {partition_info_filename}"
)

with open(metadata_filename, "r", encoding="utf-8") as metadata_info:
self.metadata_keywords = json.load(metadata_info)
self.catalog_name = self.metadata_keywords["catalog_name"]
self.partition_info = pd.read_csv(partition_info_filename).copy()
self.partition_info = PartitionInfo(self.catalog_path)

def get_pixels(self):
"""Get all healpix pixels that are contained in the catalog
Expand All @@ -52,20 +45,4 @@ def get_pixels(self):
- pixel: pixel number *at the above order*
- num_objects: the number of rows in the pixel's partition
"""
return self.partition_info

def get_partitions(self):
"""Get file handles for all partition files in the catalog
Returns:
one-dimensional array of strings, where each string is a partition file
"""
file_names = []
for _, partition in self.partition_info.iterrows():
file_names.append(
paths.pixel_catalog_file(
self.catalog_path, partition["order"], partition["pixel"]
)
)

return file_names
return self.partition_info.data_frame
44 changes: 44 additions & 0 deletions src/hipscat/catalog/partition_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Container class to hold per-partition metadata"""

import os

import pandas as pd

from hipscat.io import paths


class PartitionInfo:
    """Container class for per-partition info.

    Reads ``partition_info.csv`` from a catalog directory into a pandas
    DataFrame (``data_frame``) and builds partition file paths from its rows.
    """

    # Column names used to look up values in the partition info table.
    METADATA_ORDER_COLUMN_NAME = "Norder"
    METADATA_DIR_COLUMN_NAME = "Dir"
    METADATA_PIXEL_COLUMN_NAME = "Npix"

    def __init__(self, catalog_path=None):
        """Load the partition info file found directly under ``catalog_path``.

        Args:
            catalog_path (str): base directory of the catalog; it must contain
                a ``partition_info.csv`` file.
        Raises:
            TypeError: if ``catalog_path`` is not provided.
            FileNotFoundError: if ``partition_info.csv`` is not present.
        """
        if catalog_path is None:
            # Same exception type os.path.join would raise for None, but with
            # a message that names the actual problem.
            raise TypeError("catalog_path is required to load partition info")
        self.catalog_path = catalog_path

        partition_info_filename = os.path.join(self.catalog_path, "partition_info.csv")
        if not os.path.exists(partition_info_filename):
            raise FileNotFoundError(
                f"No partition info found where expected: {partition_info_filename}"
            )

        self.data_frame = pd.read_csv(partition_info_filename)

    def get_file_names(self):
        """Get the path of every partition file in the catalog.

        Paths are built with ``paths.pixel_catalog_file`` from the order and
        pixel columns of ``data_frame``; this method itself does not open the
        files or check that they exist.

        Returns:
            list of strings, one partition file path per row, in row order
        """
        # Only two columns are needed, so iterate over them directly instead
        # of materialising a Series per row with iterrows().
        orders = self.data_frame[self.METADATA_ORDER_COLUMN_NAME]
        pixels = self.data_frame[self.METADATA_PIXEL_COLUMN_NAME]
        return [
            paths.pixel_catalog_file(self.catalog_path, order, pixel)
            for order, pixel in zip(orders, pixels)
        ]
9 changes: 3 additions & 6 deletions src/hipscat/catalog/pixel_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pandas as pd

from hipscat.catalog import PixelNode, PixelNodeType
from hipscat.catalog import PartitionInfo, PixelNode, PixelNodeType


class PixelTree:
Expand All @@ -20,9 +20,6 @@ class PixelTree:
12 base HEALPix pixels
"""

METADATA_ORDER_COLUMN_NAME = "order"
METADATA_PIXEL_COLUMN_NAME = "pixel"

def __init__(self, partition_info_df: pd.DataFrame):
"""Initialises the Tree from the partition info metadata
Expand Down Expand Up @@ -85,8 +82,8 @@ def _create_tree(self, partition_info_df: pd.DataFrame):
"""
for _, row in partition_info_df.iterrows():
self._create_node_and_parent_if_not_exist(
row[self.METADATA_ORDER_COLUMN_NAME],
row[self.METADATA_PIXEL_COLUMN_NAME],
row[PartitionInfo.METADATA_ORDER_COLUMN_NAME],
row[PartitionInfo.METADATA_PIXEL_COLUMN_NAME],
PixelNodeType.LEAF,
)

Expand Down
48 changes: 40 additions & 8 deletions src/hipscat/io/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,60 @@

import os

ORDER_DIRECTORY_PREFIX = "Norder"
DIR_DIRECTORY_PREFIX = "Dir"
PIXEL_DIRECTORY_PREFIX = "Npix"

# Number of consecutive pixel numbers grouped under a single "Dir=" directory.
_PIXELS_PER_DIRECTORY = 10_000


def _directory_number(pixel_number):
    """Return the directory number for an integer ``pixel_number``.

    This is the pixel number rounded toward zero to a multiple of 10000.
    """
    # Exact integer arithmetic: float true division (npix / 10_000) rounds the
    # quotient once it gets large, which can bump the result up to the *next*
    # directory for big pixel numbers at high healpix orders.
    # abs() keeps the previous truncate-toward-zero result for negative input.
    magnitude = abs(pixel_number) // _PIXELS_PER_DIRECTORY * _PIXELS_PER_DIRECTORY
    return -magnitude if pixel_number < 0 else magnitude


def pixel_directory(
    catalog_path="", pixel_order=0, pixel_number=None, directory_number=None
):
    """Create path *string* for a pixel directory. This will not create the directory.

    One of pixel_number or directory_number is required. The directory name will
    take the HiPS standard form of:

        <catalog_path>/Norder=<pixel_order>/Dir=<directory number>

    Where the directory number is calculated using integer division as:

        (pixel_number/10000)*10000

    If both are given, directory_number is used and pixel_number is ignored.

    Args:
        catalog_path (str): base directory of the catalog (includes catalog name)
        pixel_order (int): the healpix order of the pixel
        pixel_number (int): the healpix pixel
        directory_number (int): directory number
    Returns:
        string directory name
    Raises:
        ValueError: if neither pixel_number nor directory_number is given, or
            if a provided value cannot be converted with ``int()``.
    """
    norder = int(pixel_order)
    if pixel_number is None and directory_number is None:
        raise ValueError(
            "One of pixel_number or directory_number is required to create pixel directory"
        )
    if directory_number is not None:
        # Coerce like the other inputs so e.g. 20000.0 renders as "Dir=20000".
        ndir = int(directory_number)
    else:
        ndir = _directory_number(int(pixel_number))
    return os.path.join(
        catalog_path,
        f"{ORDER_DIRECTORY_PREFIX}={norder}",
        f"{DIR_DIRECTORY_PREFIX}={ndir}",
    )


def pixel_catalog_file(catalog_path="", pixel_order=0, pixel_number=0):
    """Create path *string* for a pixel catalog file. This will not create the directory or file.

    The catalog file name will take the HiPS standard form of:

        <catalog_path>/Norder=<pixel_order>/Dir=<directory number>/Npix=<pixel_number>.parquet

    Where the directory number is calculated using integer division as:

        (pixel_number/10000)*10000

    Args:
        catalog_path (str): base directory of the catalog (includes catalog name)
        pixel_order (int): the healpix order of the pixel
        pixel_number (int): the healpix pixel
    Returns:
        string catalog file name
    Raises:
        ValueError: if pixel_order or pixel_number cannot be converted with ``int()``.
    """
    norder = int(pixel_order)
    npix = int(pixel_number)
    ndir = _directory_number(npix)
    return os.path.join(
        catalog_path,
        f"{ORDER_DIRECTORY_PREFIX}={norder}",
        f"{DIR_DIRECTORY_PREFIX}={ndir}",
        f"{PIXEL_DIRECTORY_PREFIX}={npix}.parquet",
    )
4 changes: 2 additions & 2 deletions tests/data/small_sky/partition_info.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
order,pixel,num_objects
0,11,131
Norder,Dir,Npix,num_rows
0,0,11,131
10 changes: 5 additions & 5 deletions tests/data/small_sky_order1/partition_info.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
order,pixel,num_objects
1,44,42
1,45,29
1,46,42
1,47,18
Norder,Dir,Npix,num_rows
1,0,44,42
1,0,45,29
1,0,46,42
1,0,47,18
2 changes: 0 additions & 2 deletions tests/hipscat/catalog/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ def test_load_catalog_small_sky(small_sky_dir):

assert cat.catalog_name == "small_sky"
assert len(cat.get_pixels()) == 1
assert len(cat.get_partitions()) == 1


def test_load_catalog_small_sky_order1(small_sky_order1_dir):
Expand All @@ -23,7 +22,6 @@ def test_load_catalog_small_sky_order1(small_sky_order1_dir):

assert cat.catalog_name == "small_sky_order1"
assert len(cat.get_pixels()) == 4
assert len(cat.get_partitions()) == 4


def test_empty_directory():
Expand Down
27 changes: 27 additions & 0 deletions tests/hipscat/catalog/test_partition_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Tests of partition info functionality"""

import os

from hipscat.catalog import PartitionInfo


def test_load_partition_info_small_sky(small_sky_dir):
    """Partition info for the 1-pixel catalog yields one file path, and it exists"""
    file_names = PartitionInfo(small_sky_dir).get_file_names()

    assert len(file_names) == 1
    missing = [name for name in file_names if not os.path.exists(name)]
    assert not missing


def test_load_partition_info_small_sky_order1(small_sky_order1_dir):
    """Partition info for the 4-pixel catalog yields four file paths, all existing"""
    file_names = PartitionInfo(small_sky_order1_dir).get_file_names()

    assert len(file_names) == 4
    missing = [name for name in file_names if not os.path.exists(name)]
    assert not missing
21 changes: 13 additions & 8 deletions tests/hipscat/catalog/test_pixel_tree.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
import pandas as pd
import pytest

from hipscat.catalog import PixelNodeType
from hipscat.catalog import PartitionInfo, PixelNodeType
from hipscat.catalog.pixel_tree import PixelTree


def assert_pixel_tree_has_nodes_in_catalog(tree, catalog):
"""assert tree contains the same nodes as the catalog"""
assert tree.contains(-1, -1)
for _, pixel in catalog.get_pixels().iterrows():
assert tree.contains(pixel["order"], pixel["pixel"])
assert tree.contains(
pixel[PartitionInfo.METADATA_ORDER_COLUMN_NAME],
pixel[PartitionInfo.METADATA_PIXEL_COLUMN_NAME],
)


def test_pixel_tree_small_sky(small_sky_catalog, small_sky_pixels):
"""test pixel tree on small sky"""
pixel_tree = PixelTree(small_sky_catalog.partition_info)
pixel_tree = PixelTree(small_sky_catalog.get_pixels())
assert len(pixel_tree) == len(small_sky_catalog.get_pixels()) + 1
assert_pixel_tree_has_nodes_in_catalog(pixel_tree, small_sky_catalog)
small_sky_pixel = pixel_tree.get_node(**small_sky_pixels[0])
Expand All @@ -24,7 +27,7 @@ def test_pixel_tree_small_sky(small_sky_catalog, small_sky_pixels):

def test_pixel_tree_small_sky_order1(small_sky_order1_catalog, small_sky_order1_pixels):
"""test pixel tree on small sky order1"""
pixel_tree = PixelTree(small_sky_order1_catalog.partition_info)
pixel_tree = PixelTree(small_sky_order1_catalog.get_pixels())
assert_pixel_tree_has_nodes_in_catalog(pixel_tree, small_sky_order1_catalog)
first_pixel = pixel_tree.get_node(**small_sky_order1_pixels[0])
second_pixel = pixel_tree.get_node(**small_sky_order1_pixels[1])
Expand All @@ -36,7 +39,7 @@ def test_pixel_tree_small_sky_order1(small_sky_order1_catalog, small_sky_order1_

def test_duplicate_pixel_raises_error(small_sky_catalog):
"""test pixel tree raises error with duplicate pixels"""
partition_info = small_sky_catalog.partition_info
partition_info = small_sky_catalog.get_pixels()
pixel_row = partition_info.iloc[0]
info_with_duplicate = pd.concat([partition_info, pixel_row.to_frame().T])
with pytest.raises(ValueError):
Expand All @@ -45,10 +48,12 @@ def test_duplicate_pixel_raises_error(small_sky_catalog):

def test_pixel_duplicated_at_different_order_raises_error(small_sky_catalog):
"""test pixel tree raises error with duplicate pixels at different orders"""
partition_info = small_sky_catalog.partition_info
partition_info = small_sky_catalog.get_pixels()
pixel_row = partition_info.iloc[0].copy()
pixel_row["order"] += 1
pixel_row["pixel"] = pixel_row["pixel"] << 2
pixel_row[PartitionInfo.METADATA_ORDER_COLUMN_NAME] += 1
pixel_row[PartitionInfo.METADATA_PIXEL_COLUMN_NAME] = (
pixel_row[PartitionInfo.METADATA_PIXEL_COLUMN_NAME] << 2
)
info_with_duplicate = pd.concat([partition_info, pixel_row.to_frame().T])
with pytest.raises(ValueError):
PixelTree(info_with_duplicate)
22 changes: 20 additions & 2 deletions tests/hipscat/io/test_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,29 @@

def test_pixel_directory():
"""Simple case with sensical inputs"""
expected = "/foo/Norder0/Npix5"
expected = "/foo/Norder=0/Dir=0"
result = paths.pixel_directory("/foo", 0, 5)
assert result == expected


def test_pixel_directory_number():
    """Directory number supplied explicitly, with and without a pixel number"""
    expected = "/foo/Norder=0/Dir=0"

    with_pixel = paths.pixel_directory(
        "/foo", pixel_order=0, pixel_number=5, directory_number=0
    )
    without_pixel = paths.pixel_directory("/foo", pixel_order=0, directory_number=0)

    assert with_pixel == expected
    assert without_pixel == expected


def test_pixel_directory_missing():
    """Neither a pixel number nor a directory number is given"""
    with pytest.raises(ValueError):
        paths.pixel_directory(catalog_path="/foo", pixel_order=0)


def test_pixel_directory_nonint():
"""Simple case with non-integer inputs"""
with pytest.raises(ValueError):
Expand All @@ -20,7 +38,7 @@ def test_pixel_directory_nonint():

def test_pixel_catalog_file():
"""Simple case with sensical inputs"""
expected = "/foo/Norder0/Npix5/catalog.parquet"
expected = "/foo/Norder=0/Dir=0/Npix=5.parquet"
result = paths.pixel_catalog_file("/foo", 0, 5)
assert result == expected

Expand Down

0 comments on commit 7695246

Please sign in to comment.