Merge pull request MultimodalUniverse#32 from AstroPile/mastercat
[Infrastructure] Add master catalogue
lhparker1 authored Apr 11, 2024
2 parents 1877c79 + 3d4f8e4 commit f14beab
Showing 2 changed files with 107 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -164,6 +164,9 @@ scripts/desi/*.fits
scripts/sdss/*.fits
notebooks/*.jpg

__pycache__
lightning_logs

# Excluding data files
scripts/hsc/**/*.hdf5
scripts/hsc/*.hdf
104 changes: 104 additions & 0 deletions astropile/utils.py
@@ -8,12 +8,15 @@
from multiprocessing import Pool
import numpy as np
import h5py
import pandas as pd
from astropy import units


def _file_to_catalog(filename: str, keys: List[str]):
    with h5py.File(filename, 'r') as data:
        return Table({k: data[k] for k in keys})


def get_catalog(dset: DatasetBuilder,
                keys: List[str] = ['object_id', 'ra', 'dec', 'healpix'],
                split: str = 'train',
@@ -158,3 +161,104 @@ def _generate_examples(groups):
description=description)


def extract_cat_params(cat: DatasetBuilder):
"""This just grabs the ra, dec, and healpix columns from a catalogue."""
cat = get_catalog(cat)
subcat = pd.DataFrame(data=dict((col, cat[col].data) for col in ["ra", "dec", "healpix"]))
return subcat

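# Illustrative note (not part of this commit): for any AstroPile DatasetBuilder,
# extract_cat_params returns a plain pandas DataFrame whose only columns are
# "ra", "dec", and "healpix", e.g.
#
#     params = extract_cat_params(some_builder)   # some_builder is hypothetical
#     assert list(params.columns) == ["ra", "dec", "healpix"]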

def build_master_catalog(cats: list[DatasetBuilder], names: list[str], matching_radius: float = 1.0):
"""
Build a master catalogue from a list of AstroPile catalogues. This extracts
minimal information from each catalogue and collates it into a single table.
The table is formatted as: ra, dec, healpix, name1, name2, ..., nameN,
name1_idx, name2_idx, ..., nameN_idx. where ra and dec are in arcsec,
healpix is a healpix index, name1, name2, ..., nameN are boolean flags
indicating whether a source is present in the corresponding catalogue, and
name1_idx, name2_idx, ..., nameN_idx are the indices of the sources in the
corresponding catalogue.
Parameters
----------
cats : list[DatasetBuilder]
List of AstroPile catalogues to be combined.
names : list[str]
List of names for the catalogues. This will appear as the column header
in the master catalogue for that dataset.
matching_radius : float, optional
The maximum separation between two sources in the catalogues to be
considered a match, by default 1.0 [arcsec].
Returns
-------
master_cat : pd.DataFrame
The master catalogue containing the combined information from all the
input catalogues.
"""

    if len(cats) != len(names):
        raise ValueError("The number of catalogues and names must be the same.")

    # Set the columns for the master catalogue
    master_cat = pd.DataFrame(
        columns=["ra", "dec", "healpix"] + names + [f"{name}_idx" for name in names]
    )

    for cat, name in zip(cats, names):
        # Extract the relevant columns
        cat = extract_cat_params(cat)

        # Match the catalogues
        master_coords = SkyCoord(master_cat.loc[:, "ra"], master_cat.loc[:, "dec"], unit="deg")
        cat_coords = SkyCoord(cat.loc[:, "ra"], cat.loc[:, "dec"], unit="deg")
        idx, sep2d, _ = master_coords.match_to_catalog_sky(cat_coords)
        mask = sep2d < matching_radius * units.arcsec

        # Update the matching columns
        master_cat.loc[mask, name] = True
        master_cat.loc[mask, name + "_idx"] = idx[mask]

        # Add new rows to the master catalogue
        if len(master_cat) == 0:
            # Keep everything for the first catalogue
            mask = np.zeros(len(cat), dtype=bool)
        else:
            # Match to the master catalogue built so far
            idx, sep2d, _ = cat_coords.match_to_catalog_sky(master_coords)
            mask = sep2d < matching_radius * units.arcsec
        # Row indices of the new catalogue's own sources (used for its *_idx column)
        idx = np.arange(len(cat), dtype=int)
        name_data = []
        name_idx_data = []
        for subname in names:
            if subname != name:
                # Columns for each other catalogue. These are False because they didn't match
                name_data.append(np.zeros(np.sum(~mask), dtype=bool))
                name_idx_data.append(-np.ones(np.sum(~mask), dtype=int))
            else:
                # Columns for the current catalogue. These are True because they are the current catalogue
                name_data.append(np.ones(np.sum(~mask), dtype=bool))
                name_idx_data.append(idx[~mask])
        # Collect the new rows into a DataFrame
        append_cat = pd.DataFrame(
            columns=["ra", "dec", "healpix"] + names + [f"{name}_idx" for name in names],
            data=np.stack(
                [cat.loc[~mask, col] for col in ["ra", "dec", "healpix"]]
                + name_data
                + name_idx_data
            ).T,
        )

        # Append the new rows to the master catalogue
        master_cat = pd.concat([master_cat, append_cat], ignore_index=True)

    # Convert the columns to the correct data types
    master_cat["ra"] = master_cat["ra"].astype(float)
    master_cat["dec"] = master_cat["dec"].astype(float)
    master_cat["healpix"] = master_cat["healpix"].astype(int)
    for name in names:
        master_cat[name] = master_cat[name].astype(bool)
        master_cat[f"{name}_idx"] = master_cat[f"{name}_idx"].astype(int)

    return master_cat

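For orientation, a minimal usage sketch of the new helper (not part of this commit; the builder objects, survey names, and matching radius below are placeholder assumptions):

from astropile.utils import build_master_catalog

# sdss_builder and desi_builder stand in for any two AstroPile DatasetBuilder
# instances whose catalogues expose object_id, ra, dec, and healpix.
master = build_master_catalog(
    cats=[sdss_builder, desi_builder],
    names=["sdss", "desi"],
    matching_radius=1.0,  # arcsec
)

# Sources matched in both surveys carry sdss == True and desi == True, with
# sdss_idx / desi_idx giving their row indices in the respective catalogues.
in_both = master[master["sdss"] & master["desi"]]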