Skip to content

Commit

Permalink
Merge pull request #357 from astronomy-commons/sean/fix-map-parts
Browse files Browse the repository at this point in the history
Support map_partitions ufunc with non data frame output
  • Loading branch information
smcguire-cmu authored Jun 14, 2024
2 parents c56b6a4 + b329cd4 commit a621069
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 3 deletions.
14 changes: 11 additions & 3 deletions src/lsdb/catalog/dataset/healpix_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def map_partitions(
meta: pd.DataFrame | pd.Series | Dict | Iterable | Tuple | None = None,
include_pixel: bool = False,
**kwargs,
) -> Self:
) -> Self | dd.core.Series:
"""Applies a function to each partition in the catalog.
The ra and dec of each row is assumed to remain unchanged.
Expand Down Expand Up @@ -210,7 +210,7 @@ def map_partitions(
Returns:
A new catalog with each partition replaced with the output of the function applied to the original
partition.
partition. If the function returns a non dataframe output, a dask Series will be returned.
"""
if meta is None:
if include_pixel:
Expand All @@ -234,7 +234,15 @@ def apply_func(df, *args, partition_info=None, **kwargs):
output_ddf = self._ddf.map_partitions(apply_func, *args, meta=meta, **kwargs)
else:
output_ddf = self._ddf.map_partitions(func, *args, meta=meta, **kwargs)
return self.__class__(output_ddf, self._ddf_pixel_map, self.hc_structure)

if isinstance(output_ddf, dd.core.DataFrame):
return self.__class__(output_ddf, self._ddf_pixel_map, self.hc_structure)
warnings.warn(
"output of the function must be a DataFrame to generate an LSDB `Catalog`. `map_partitions` "
"will return a dask object instead of a Catalog.",
RuntimeWarning,
)
return output_ddf

def prune_empty_partitions(self, persist: bool = False) -> Self:
"""Prunes the catalog of its empty partitions
Expand Down
13 changes: 13 additions & 0 deletions tests/lsdb/catalog/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,19 @@ def add_col(df):
assert np.all(mapcomp["a"] == mapcomp["ra"] + 1)


def test_map_partitions_non_df(small_sky_order1_catalog):
    """Check `map_partitions` with a function that returns a Series.

    When the mapped function does not produce a DataFrame, the call is
    expected to emit a RuntimeWarning mentioning "DataFrame" and to hand
    back a dask Series rather than a Catalog. The computed values must
    match the catalog's "ra" column shifted by one.
    """

    def shifted_ra(partition):
        # Selecting a single column makes the output a Series, not a DataFrame.
        return partition["ra"] + 1

    with pytest.warns(RuntimeWarning, match="DataFrame"):
        result = small_sky_order1_catalog.map_partitions(shifted_ra)

    # The result must be a plain dask Series and not be wrapped in a Catalog.
    assert not isinstance(result, Catalog)
    assert isinstance(result, dd.core.Series)

    computed = result.compute()
    expected = small_sky_order1_catalog.compute()["ra"] + 1
    assert np.all(computed == expected)


def test_non_working_empty_raises(small_sky_order1_catalog):
def add_col(df):
if len(df) == 0:
Expand Down

0 comments on commit a621069

Please sign in to comment.