diff --git a/src/hipscat/catalog/__init__.py b/src/hipscat/catalog/__init__.py index c4a77f49..c11cda1e 100644 --- a/src/hipscat/catalog/__init__.py +++ b/src/hipscat/catalog/__init__.py @@ -2,5 +2,6 @@ from .catalog import Catalog from .catalog_parameters import CatalogParameters +from .partition_info import PartitionInfo from .pixel_node import PixelNode from .pixel_node_type import PixelNodeType diff --git a/src/hipscat/catalog/catalog.py b/src/hipscat/catalog/catalog.py index c20242b9..48071fb6 100644 --- a/src/hipscat/catalog/catalog.py +++ b/src/hipscat/catalog/catalog.py @@ -3,9 +3,7 @@ import json import os -import pandas as pd - -from hipscat.io import paths +from hipscat.catalog.partition_info import PartitionInfo class Catalog: @@ -29,16 +27,11 @@ def _initialize_metadata(self): raise FileNotFoundError( f"No catalog info found where expected: {metadata_filename}" ) - partition_info_filename = os.path.join(self.catalog_path, "partition_info.csv") - if not os.path.exists(partition_info_filename): - raise FileNotFoundError( - f"No partition info found where expected: {partition_info_filename}" - ) with open(metadata_filename, "r", encoding="utf-8") as metadata_info: self.metadata_keywords = json.load(metadata_info) self.catalog_name = self.metadata_keywords["catalog_name"] - self.partition_info = pd.read_csv(partition_info_filename).copy() + self.partition_info = PartitionInfo(self.catalog_path) def get_pixels(self): """Get all healpix pixels that are contained in the catalog @@ -52,20 +45,4 @@ def get_pixels(self): - pixel: pixel number *at the above order* - num_objects: the number of rows in the pixel's partition """ - return self.partition_info - - def get_partitions(self): - """Get file handles for all partition files in the catalog - - Returns: - one-dimensional array of strings, where each string is a partition file - """ - file_names = [] - for _, partition in self.partition_info.iterrows(): - file_names.append( - paths.pixel_catalog_file( - 
class PartitionInfo:
    """Container class for per-partition info.

    Reads ``partition_info.csv`` from a catalog directory into a pandas data
    frame and derives the partition file paths from its order/pixel columns.

    Attributes:
        catalog_path: base directory of the catalog, as given to the constructor.
        data_frame: contents of ``partition_info.csv`` as returned by
            ``pd.read_csv``.
    """

    # Names of the columns looked up in partition_info.csv.
    METADATA_ORDER_COLUMN_NAME = "Norder"
    METADATA_DIR_COLUMN_NAME = "Dir"
    METADATA_PIXEL_COLUMN_NAME = "Npix"

    def __init__(self, catalog_path=None):
        """Load the partition info found directly under ``catalog_path``.

        Args:
            catalog_path (str): base directory of the catalog; must contain a
                ``partition_info.csv`` file.

        Raises:
            TypeError: if ``catalog_path`` is None.
            FileNotFoundError: if ``partition_info.csv`` does not exist under
                ``catalog_path``.
        """
        # ``os.path.join(None, ...)`` would also raise TypeError, but with a
        # message that says nothing about the missing argument.
        if catalog_path is None:
            raise TypeError("catalog_path is required to load partition info")
        self.catalog_path = catalog_path

        partition_info_filename = os.path.join(self.catalog_path, "partition_info.csv")
        if not os.path.exists(partition_info_filename):
            raise FileNotFoundError(
                f"No partition info found where expected: {partition_info_filename}"
            )

        self.data_frame = pd.read_csv(partition_info_filename)

    def get_file_names(self):
        """Get file handles for all partition files in the catalog

        Returns:
            one-dimensional array of strings, where each string is a partition file
        """
        # Walk the two columns directly instead of ``iterrows()``: a row-wise
        # Series has a single dtype, so one float column in the file would
        # silently turn the integer order/pixel values into floats.
        orders = self.data_frame[self.METADATA_ORDER_COLUMN_NAME]
        pixels = self.data_frame[self.METADATA_PIXEL_COLUMN_NAME]
        return [
            paths.pixel_catalog_file(self.catalog_path, order, pixel)
            for order, pixel in zip(orders, pixels)
        ]
ORDER_DIRECTORY_PREFIX = "Norder"
DIR_DIRECTORY_PREFIX = "Dir"
PIXEL_DIRECTORY_PREFIX = "Npix"

# Number of consecutive pixel numbers grouped under one ``Dir=`` directory.
_PIXELS_PER_DIRECTORY = 10_000


def _directory_number(pixel_number):
    """Return the directory number that groups ``pixel_number``.

    Uses exact integer arithmetic. True division goes through a float, and for
    very large pixel numbers the quotient can round up to the next integer,
    producing a directory number greater than the pixel itself.
    """
    npix = int(pixel_number)
    magnitude = abs(npix) // _PIXELS_PER_DIRECTORY * _PIXELS_PER_DIRECTORY
    # Round toward zero, as ``int(npix / 10_000)`` did, so any negative input
    # keeps mapping to the same directory as before.
    return magnitude if npix >= 0 else -magnitude


def pixel_directory(
    catalog_path="", pixel_order=0, pixel_number=None, directory_number=None
):
    """Create path *string* for a pixel directory. This will not create the directory.

    One of pixel_number or directory_number is required. The directory name will
    take the HiPS standard form of:

        <catalog_path>/Norder=<pixel_order>/Dir=<directory_number>

    Where the directory number is calculated using integer division as:

        (pixel_number/10000)*10000

    Args:
        catalog_path (str): base directory of the catalog (includes catalog name)
        pixel_order (int): the healpix order of the pixel
        directory_number (int): directory number; when given it is used as-is
            (after conversion with ``int``) and pixel_number is ignored
        pixel_number (int): the healpix pixel
    Returns:
        string directory name
    Raises:
        ValueError: if neither pixel_number nor directory_number is given, or
            if a supplied value (e.g. a non-numeric string) cannot be
            converted with ``int``.
    """
    norder = int(pixel_order)
    if pixel_number is None and directory_number is None:
        raise ValueError(
            "One of pixel_number or directory_number is required to create pixel directory"
        )
    if directory_number is not None:
        # Normalise like the other numeric arguments so that e.g. 10000.0
        # renders as "Dir=10000" rather than "Dir=10000.0".
        ndir = int(directory_number)
    else:
        ndir = _directory_number(pixel_number)
    return os.path.join(
        catalog_path,
        f"{ORDER_DIRECTORY_PREFIX}={norder}",
        f"{DIR_DIRECTORY_PREFIX}={ndir}",
    )


def pixel_catalog_file(catalog_path="", pixel_order=0, pixel_number=0):
    """Create path *string* for a pixel catalog file. This will not create the directory or file.

    The catalog file name will take the HiPS standard form of:

        <catalog_path>/Norder=<pixel_order>/Dir=<directory_number>/Npix=<pixel_number>.parquet

    Where the directory number is calculated using integer division as:

        (pixel_number/10000)*10000

    Args:
        catalog_path (str): base directory of the catalog (includes catalog name)
        pixel_order (int): the healpix order of the pixel
        pixel_number (int): the healpix pixel
    Returns:
        string catalog file name
    Raises:
        ValueError: if pixel_order or pixel_number is a value (e.g. a
            non-numeric string) that cannot be converted with ``int``.
    """
    norder = int(pixel_order)
    npix = int(pixel_number)
    ndir = _directory_number(npix)
    return os.path.join(
        catalog_path,
        f"{ORDER_DIRECTORY_PREFIX}={norder}",
        f"{DIR_DIRECTORY_PREFIX}={ndir}",
        f"{PIXEL_DIRECTORY_PREFIX}={npix}.parquet",
    )
def test_load_partition_info_small_sky(small_sky_dir):
    """Partition info for the one-pixel catalog lists one file, and it exists on disk"""
    file_names = PartitionInfo(small_sky_dir).get_file_names()

    assert len(file_names) == 1
    # Collect any listed partition file that is absent from the test data.
    not_found = [name for name in file_names if not os.path.exists(name)]
    assert not not_found


def test_load_partition_info_small_sky_order1(small_sky_order1_dir):
    """Partition info for the four-pixel catalog lists four files, all existing on disk"""
    file_names = PartitionInfo(small_sky_order1_dir).get_file_names()

    assert len(file_names) == 4
    # Collect any listed partition file that is absent from the test data.
    not_found = [name for name in file_names if not os.path.exists(name)]
    assert not not_found
def test_pixel_directory():
    """Directory path is derived from the pixel order and pixel number"""
    assert paths.pixel_directory("/foo", 0, 5) == "/foo/Norder=0/Dir=0"


def test_pixel_directory_number():
    """Directory path when the directory number is passed explicitly"""
    expected = "/foo/Norder=0/Dir=0"

    # Directory number supplied together with a pixel number.
    with_pixel = paths.pixel_directory(
        "/foo", pixel_order=0, pixel_number=5, directory_number=0
    )
    assert with_pixel == expected

    # Directory number supplied on its own.
    without_pixel = paths.pixel_directory("/foo", pixel_order=0, directory_number=0)
    assert without_pixel == expected


def test_pixel_directory_missing():
    """Omitting both pixel number and directory number raises ValueError"""
    with pytest.raises(ValueError):
        paths.pixel_directory("/foo", 0)


def test_pixel_catalog_file():
    """Catalog file path is derived from the pixel order and pixel number"""
    assert paths.pixel_catalog_file("/foo", 0, 5) == "/foo/Norder=0/Dir=0/Npix=5.parquet"