diff --git a/light_curves/code_src/gaia_functions.py b/light_curves/code_src/gaia_functions.py
index a2689fac..464334c6 100644
--- a/light_curves/code_src/gaia_functions.py
+++ b/light_curves/code_src/gaia_functions.py
@@ -1,14 +1,17 @@
 import time
 import numpy as np
 import pandas as pd
+
+from astropy.table import Table
 from astroquery.gaia import Gaia
 from data_structures import MultiIndexDFObject

+
 def gaia_get_lightcurves(sample_table, *, search_radius=1/3600, verbose=0):
     '''
     Creates a lightcurve Pandas MultiIndex object from Gaia data for a list of coordinates.
-    This is the MAIN function.
-
+    This is the MAIN function.
+
     Parameters
     ----------
     sample_table : Astropy Table
@@ -17,24 +20,24 @@ def gaia_get_lightcurves(sample_table, *, search_radius=1/3600, verbose=0):
         How far from a sources is ok for a match
     verbose : int
         How much to talk. 0 = None, 1 = a little bit , 2 = more, 3 = full
-
+
     Returns
     --------
     MultiIndexDFObject with Gaia light curve photometry
-
+
     '''
-    # This code is broken into two steps. The first step, `Gaia_retrieve_catalog` retrieves the
+    # This code is broken into two steps. The first step, `gaia_retrieve_catalog` retrieves the
     # Gaia source ids for the positions of our sample. These come from the "Gaia DR3 source lite catalog".
     # However, that catalog only has a single photometry point per object. To get the light curve
-    # information, we use the function `gaia_retrieve_epoch_photometry` to use the source ids to
-    # access the "EPOCH_PHOTOMETRY" catalog.
-
+    # information, we use the function `gaia_retrieve_epoch_photometry` to use the source ids to
+    # access the "EPOCH_PHOTOMETRY" catalog.
+
     # Retrieve Gaia table with Source IDs ==============
-    gaia_table = Gaia_retrieve_catalog(sample_table ,
-                                       search_radius = search_radius,
-                                       verbose = verbose
-                                       )
-    # if none of the objects were found, there's nothing to load and the Gaia_retrieve_EPOCH_PHOTOMETRY fnc
+    gaia_table = gaia_retrieve_catalog(sample_table,
+                                       search_radius=search_radius,
+                                       verbose=verbose
+                                       )
+    # if none of the objects were found, there's nothing to load and the gaia_retrieve_epoch_photometry fnc
     # will raise an HTTPError. just return an empty dataframe instead of proceeding
     if len(gaia_table) == 0:
         return MultiIndexDFObject()
@@ -42,94 +45,99 @@ def gaia_get_lightcurves(sample_table, *, search_radius=1/3600, verbose=0):

     # Extract Light curves ===============
     # request the EPOCH_PHOTOMETRY from the Gaia DataLink Service
-    gaia_df = Gaia_retrieve_epoch_photometry(gaia_table)
-
-    #if the epochal photometry is empty, return an empty dataframe
+    gaia_df = gaia_retrieve_epoch_photometry(gaia_table)
+
+    # if the epochal photometry is empty, return an empty dataframe
     if len(gaia_df) == 0:
         return MultiIndexDFObject()
-
-    ## Create light curves =================
-    df_lc = Gaia_clean_dataframe(gaia_df)
+
+    # Create light curves =================
+    df_lc = gaia_clean_dataframe(gaia_df)

     return df_lc

-def Gaia_retrieve_catalog(sample_table , search_radius, verbose):
+
+def gaia_retrieve_catalog(sample_table, search_radius, verbose):
     '''
     Retrieves the photometry table for a list of sources.
-
+
     Parameter
-    ----------
+    ----------
     sample_table : Astropy Table
         main source catalog with coordinates, labels, and objectids
-
-    search_radius : float
+
+    search_radius : float
         Search radius in degrees, e.g., 1/3600.
         suggested search radius is 1 arcsecond or 1/3600.
-
+
     verbose : int
         How much to talk. 0 = None, 1 = a little bit , 2 = more, 3 = full
-
+
     Returns
     --------
     Astropy table with the Gaia photometry for each source.
-
+
     '''
     t1 = time.time()
-
-    #first make an astropy table from our master list of coordinates
-    # as input to the pyvo TAP query
+
+    # first make an astropy table from our master list of coordinates
+    # as input to the pyvo TAP query
     upload_table = sample_table['objectid', 'label']
     upload_table['ra'] = sample_table['coord'].ra.deg
     upload_table['dec'] = sample_table['coord'].dec.deg

-    #this query is too slow without gaia.random_index.
-    #Gaia helpdesk is aware of this bug somewhere on their end
+    # this query is too slow without gaia.random_index.
+    # Gaia helpdesk is aware of this bug somewhere on their end
     querystr = f"""
         SELECT gaia.ra, gaia.dec, gaia.random_index, gaia.source_id, mt.ra, mt.dec, mt.objectid, mt.label
         FROM tap_upload.table_test AS mt
         JOIN gaiadr3.gaia_source_lite AS gaia
        ON 1=CONTAINS(POINT('ICRS',mt.ra,mt.dec),CIRCLE('ICRS',gaia.ra,gaia.dec,{search_radius}))
        """
-    #use an asynchronous query of the Gaia database
-    #cross match with our uploaded table
+    # use an asynchronous query of the Gaia database
+    # cross match with our uploaded table
     j = Gaia.launch_job_async(query=querystr, upload_resource=upload_table, upload_table_name="table_test")
-
+
     results = j.get_results()
-
-    if verbose : print("\nSearch completed in {:.2f} seconds".format((time.time()-t1) ) )
-    if verbose : print("Number of objects matched: {} out of {}.".format(len(results),len(sample_table) ) )
+
+    if verbose:
+        print(f"\nSearch completed in {time.time() - t1:.2f} seconds \n"
+              f"Number of objects matched: {len(results)} out of {len(sample_table)}.")

     return results

-def Gaia_chunks(lst, n):
+
+def gaia_chunks(lst, n):
     """
     "Split an input list into multiple chunks of size =< n"
-
+
     Parameters
     ----------
     lst: list of gaia Ids
     n: int = maximum size of the desired chunk
-
+
     """
     for i in range(0, len(lst), n):
         yield lst[i:i + n]
-
-def Gaia_retrieve_epoch_photometry(gaia_table):
+
+
+def gaia_retrieve_epoch_photometry(gaia_table):
     """
     Function to retrieve EPOCH_PHOTOMETRY catalog product for Gaia entries using the DataLink.
     Note that the IDs need to be DR3 source_id and needs to be a list.
-
-    Code fragments taken from: https://www.cosmos.esa.int/web/gaia-users/archive/datalink-products#datalink_jntb_get_above_lim
-
+
+    Code fragments taken from:
+    https://www.cosmos.esa.int/web/gaia-users/archive/datalink-products#datalink_jntb_get_above_lim
+
     Parameters
     ----------
     gaia_table: Astropy Table
         catalog of gaia source ids as well as the coords, objectid, and labels of our targets
-
+
     Returns
     --------
     Returns a dictionary (key = source_id) with a table of photometry as a function of time.
-
+
     """

     # gaia datalink server has a threshold of max 5000 requests,
@@ -137,21 +145,22 @@ def Gaia_retrieve_epoch_photometry(gaia_table):
     # and then send each chunk into the datalink server
     ids = list(gaia_table["source_id"])
     dl_threshold = 5000  # Datalink server threshold
-    ids_chunks = list(Gaia_chunks(ids, dl_threshold))
+    ids_chunks = list(gaia_chunks(ids, dl_threshold))
     datalink_all = []

-    #setup to request the epochal photometry
-    retrieval_type = "EPOCH_PHOTOMETRY"  # Options are: 'EPOCH_PHOTOMETRY', 'MCMC_GSPPHOT', 'MCMC_MSC', 'XP_SAMPLED', 'XP_CONTINUOUS', 'RVS', 'ALL'
-    data_structure = "COMBINED"  # Options are: 'INDIVIDUAL', 'COMBINED', 'RAW'
-    data_release = "Gaia DR3"  # Options are: 'Gaia DR3' (default), 'Gaia DR2'
+    # setup to request the epochal photometry
+    # See astroquery.gaia.Gaia.load_data docs for options for parameter options as they may change
+    retrieval_type = "EPOCH_PHOTOMETRY"
+    data_structure = "RAW"
+    data_release = "Gaia DR3"

     for chunk in ids_chunks:
         datalink = Gaia.load_data(ids=chunk,
-                                  data_release = data_release,
+                                  data_release=data_release,
                                   retrieval_type=retrieval_type,
-                                  data_structure = data_structure,
-                                  verbose = False,
-                                  output_file = None ,
+                                  data_structure=data_structure,
+                                  verbose=False,
+                                  valid_data=True,
                                   overwrite_output_file=True,
                                   format="votable")

@@ -160,12 +169,33 @@ def Gaia_retrieve_epoch_photometry(gaia_table):
         # we want to extract the VO table, turn it into a pandas dataframe, and add it to the datalink_all list
         for list_of_tables in datalink.values():
             for votable in list_of_tables:
-                datalink_all.append(votable.to_table().to_pandas())
+                votable = votable.to_table()
+                # Filter out masked cells from the multidim rows, so we can convert them later to a
+                # MultiIndexDFObject avoiding the ``TypeError: unhashable type: 'MaskedConstant'``.
+                # We do it here before heading to pandas land as then there are
+                # way more complications with dealing with np.ma.MaskedArrays and pandas indexing and
+                # issues about view vs index and size of arrays.
+                # This is knowingly a terrible hack, btw
+
+                flux = []
+                flux_err = []
+                mag = []
+                obs_time = []
+                for r_flux, r_flux_err, r_mag, r_obs_time in votable.iterrows(
+                        'g_transit_flux', 'g_transit_flux_error', 'g_transit_mag', 'g_transit_time'):
+                    obs_time.append(r_obs_time[~r_flux.mask].data)
+                    mag.append(r_mag[~r_flux.mask].data)
+                    flux_err.append(r_flux_err[~r_flux.mask].data)
+                    flux.append(r_flux[~r_flux.mask].data)
+
+                votable.update(Table({'g_transit_time': obs_time, 'g_transit_mag': mag,
+                                      'g_transit_flux_error': flux_err, 'g_transit_flux': flux}))
+                datalink_all.append(votable.to_pandas())

-    #if there is no epochal photometry return an empty dataframe
+    # if there is no epochal photometry return an empty dataframe
     if len(datalink_all) == 0:
         return pd.DataFrame()
-
+
     datalink_all = pd.concat(datalink_all)

     # join with gaia_table to attach the objectid and label
@@ -173,48 +203,56 @@ def Gaia_retrieve_epoch_photometry(gaia_table):
     gaia_source_df = gaia_table[idcols].to_pandas().set_index("source_id")
     gaia_df = datalink_all.set_index("source_id").join(gaia_source_df, how="left")
-
     return gaia_df.reset_index()


# clean and transform the data
-def Gaia_clean_dataframe(gaia_df):
+def gaia_clean_dataframe(gaia_df):
     """
     Clean and transform the EPOCH_PHOTOMETRY dataframe in preparation to add to other light curves
-
+
     Parameters
     ----------
     gaia_df: Pandas dataframe with light curve info
-
-
+
+
     Returns
     --------
     MultiIndexDFObject with all Gaia light curves
-
+
     """
-    # want to filter out rows where 'rejected_by_photometry' is 'True'
-    gaia_df.drop(gaia_df[gaia_df.rejected_by_photometry == True].index, inplace=True)

     # df.flux is in electron/s
     # already have the conversion from mag to mJy so go with that. Need to convert either way
     # generate magerr from fluxerr and flux
-    gaia_df["magerr"] = 2.5 / np.log(10) * gaia_df.flux_error / gaia_df.flux
+    gaia_df["mag"] = gaia_df.g_transit_mag
+    gaia_df["magerr"] = 2.5 / np.log(10) * gaia_df.g_transit_flux_error / gaia_df.g_transit_flux

     # compute flux and flux error in mJy
     gaia_df["flux_mJy"] = 10 ** (-0.4 * (gaia_df.mag - 23.9)) / 1e3  # in mJy
     gaia_df["fluxerr_mJy"] = gaia_df.magerr / 2.5 * np.log(10) * gaia_df.flux_mJy  # in mJy

     # get time in mjd
-    gaia_df["time_mjd"] = gaia_df.time + 55197.5
-
+    gaia_df["time_mjd"] = gaia_df.g_transit_time + 55197.5
+
+    gaia_df["band"] = 'G'
+
     # need to rename some columns for the MultiIndexDFObject
-    colmap = dict(flux_mJy="flux", fluxerr_mJy="err", time_mjd="time", objectid="objectid", label="label", band="band")
+    colmap = dict(flux_mJy="flux", fluxerr_mJy="err", time_mjd="time",
+                  objectid="objectid", label="label", band="band")
+
     # and only keep those columns that we need for the MultiIndexDFObject
     gaia_df = gaia_df[colmap.keys()].rename(columns=colmap)

+    # We need to flatted out the multidim columns. Also note, the dtype of these columns
+    # are not properly propagated when ``to_pandas()`` was called, we deal with this issue now.
+
+    gaia_df = gaia_df.explode(['flux', 'err', 'time'])
+    gaia_df = gaia_df.astype({'flux': float, 'err': float})
+
     # return the light curves as a MultiIndexDFObject
     indexes, columns = ["objectid", "label", "band", "time"], ["flux", "err"]
     df_lc = MultiIndexDFObject(data=gaia_df.set_index(indexes)[columns])
-
+
     return df_lc
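For reference, here is a minimal usage sketch of the patched entry point; it is not part of the patch. It assumes a `sample_table` shaped the way the docstrings above describe (an Astropy Table with `objectid`, `label`, and a SkyCoord `coord` column), uses arbitrary example coordinates, and assumes the returned `MultiIndexDFObject` exposes its underlying dataframe as `.data` (suggested by, but not shown in, the `data=` keyword used in `gaia_clean_dataframe`).

```python
# Hypothetical usage sketch -- not part of the patch.
from astropy.coordinates import SkyCoord
from astropy.table import Table

from gaia_functions import gaia_get_lightcurves

# sample_table needs 'objectid', 'label', and a SkyCoord 'coord' column
# (see the gaia_retrieve_catalog docstring); these coordinates are arbitrary examples.
sample_table = Table({
    "objectid": [1, 2],
    "label": ["obj1", "obj2"],
    "coord": SkyCoord([150.091, 10.685], [2.205, 41.269], unit="deg"),
})

# search_radius is in degrees (1/3600 deg = 1 arcsec); verbose=1 prints the match summary
df_lc = gaia_get_lightcurves(sample_table, search_radius=1/3600, verbose=1)

# df_lc is a MultiIndexDFObject; assuming it exposes the underlying dataframe as `.data`
print(df_lc.data.head())
```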