From b262db25716a4c3ff0fec851e6087925b4a54f33 Mon Sep 17 00:00:00 2001 From: Sandro Campos Date: Tue, 18 Jun 2024 10:31:47 -0400 Subject: [PATCH] Review PR comments --- src/lsdb/catalog/dataset/healpix_dataset.py | 4 +++- .../loaders/hipscat/abstract_catalog_loader.py | 16 ++++++++++------ src/lsdb/types.py | 12 ++---------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/lsdb/catalog/dataset/healpix_dataset.py b/src/lsdb/catalog/dataset/healpix_dataset.py index 9112925a..c688232d 100644 --- a/src/lsdb/catalog/dataset/healpix_dataset.py +++ b/src/lsdb/catalog/dataset/healpix_dataset.py @@ -136,7 +136,9 @@ def _perform_search( """Performs a search on the catalog from a list of pixels to search in Args: - metadata (hc.catalog.Catalog | hc.catalog.MarginCatalog): The metadata of the hipscat catalog. + metadata (hc.catalog.Catalog | hc.catalog.MarginCatalog): The metadata of + the hipscat catalog after the coarse filtering is applied. The partitions + it contains are only those that overlap with the spatial region. search (AbstractSearch): Instance of AbstractSearch. Returns: diff --git a/src/lsdb/loaders/hipscat/abstract_catalog_loader.py b/src/lsdb/loaders/hipscat/abstract_catalog_loader.py index 52f74320..1666c22c 100644 --- a/src/lsdb/loaders/hipscat/abstract_catalog_loader.py +++ b/src/lsdb/loaders/hipscat/abstract_catalog_loader.py @@ -1,20 +1,24 @@ from __future__ import annotations from abc import abstractmethod -from typing import Generic, List, Tuple, Type +from typing import Generic, List, Tuple, Type, TypeVar import dask.dataframe as dd import hipscat as hc import numpy as np import pandas as pd +from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset as HCHealpixDataset from hipscat.io.file_io import file_io from hipscat.pixel_math import HealpixPixel from hipscat.pixel_math.healpix_pixel_function import get_pixel_argsort from lsdb.catalog.catalog import DaskDFPixelMap +from lsdb.catalog.dataset.dataset import Dataset from lsdb.dask.divisions import get_pixels_divisions from lsdb.loaders.hipscat.hipscat_loading_config import HipscatLoadingConfig -from lsdb.types import CatalogTypeVar, HCCatalogTypeVar + +CatalogTypeVar = TypeVar("CatalogTypeVar", bound=Dataset) +HCCatalogTypeVar = TypeVar("HCCatalogTypeVar", bound=HCHealpixDataset) class AbstractCatalogLoader(Generic[CatalogTypeVar]): @@ -46,7 +50,7 @@ def _load_hipscat_catalog(self, catalog_type: Type[HCCatalogTypeVar]) -> HCCatal """Load `hipscat` library catalog object with catalog metadata and partition data""" return catalog_type.read_from_hipscat(self.path, storage_options=self.storage_options) - def _load_dask_df_and_map(self, catalog: HCCatalogTypeVar) -> Tuple[dd.core.DataFrame, DaskDFPixelMap]: + def _load_dask_df_and_map(self, catalog: HCHealpixDataset) -> Tuple[dd.core.DataFrame, DaskDFPixelMap]: """Load Dask DF from parquet files and make dict of HEALPix pixel to partition index""" pixels = catalog.get_healpix_pixels() ordered_pixels = np.array(pixels)[get_pixel_argsort(pixels)] @@ -57,7 +61,7 @@ def _load_dask_df_and_map(self, catalog: HCCatalogTypeVar) -> Tuple[dd.core.Data return ddf, pixel_to_index_map def _get_paths_from_pixels( - self, catalog: HCCatalogTypeVar, ordered_pixels: List[HealpixPixel] + self, catalog: HCHealpixDataset, ordered_pixels: List[HealpixPixel] ) -> List[hc.io.FilePointer]: paths = hc.io.paths.pixel_catalog_files( catalog.catalog_base_dir, ordered_pixels, self.storage_options @@ -65,7 +69,7 @@ def _get_paths_from_pixels( return paths def _load_df_from_paths( - self, catalog: HCCatalogTypeVar, paths: List[hc.io.FilePointer], divisions: Tuple[int, ...] | None + self, catalog: HCHealpixDataset, paths: List[hc.io.FilePointer], divisions: Tuple[int, ...] | None ) -> dd.core.DataFrame: dask_meta_schema = self._load_metadata_schema(catalog) if self.config.columns: @@ -85,7 +89,7 @@ def _load_df_from_paths( ) return dd.io.from_pandas(dask_meta_schema, npartitions=1) - def _load_metadata_schema(self, catalog: HCCatalogTypeVar) -> pd.DataFrame: + def _load_metadata_schema(self, catalog: HCHealpixDataset) -> pd.DataFrame: metadata_pointer = hc.io.paths.get_common_metadata_pointer(catalog.catalog_base_dir) metadata = file_io.read_parquet_metadata(metadata_pointer, storage_options=self.storage_options) return ( diff --git a/src/lsdb/types.py b/src/lsdb/types.py index 8573f061..65a544f6 100644 --- a/src/lsdb/types.py +++ b/src/lsdb/types.py @@ -1,19 +1,11 @@ -from __future__ import annotations +from typing import Dict, List, Tuple -from typing import Dict, List, Tuple, TypeVar - -from hipscat.catalog.healpix_dataset.healpix_dataset import HealpixDataset as HCHealpixDataset from hipscat.pixel_math import HealpixPixel from typing_extensions import TypeAlias -from lsdb.catalog.dataset.dataset import Dataset - # Compute pixel map returns a tuple. The first element is # the number of data points within the HEALPix pixel, the # second element is the list of pixels it contains. HealpixInfo: TypeAlias = Tuple[int, List[int]] -DaskDFPixelMap = Dict[HealpixPixel, int] -# Generic lsdb and hipscat catalog types -CatalogTypeVar = TypeVar("CatalogTypeVar", bound=Dataset) -HCCatalogTypeVar = TypeVar("HCCatalogTypeVar", bound=HCHealpixDataset) +DaskDFPixelMap = Dict[HealpixPixel, int]