From f794524b030eab198ba1ddd4c556049e02c73b4c Mon Sep 17 00:00:00 2001 From: stvoutsin Date: Tue, 5 Apr 2022 18:28:51 +0300 Subject: [PATCH] Created iniital GaiaDMP project & source code --- README.md | 2 +- gaiadmpsetup/__init__.py | 1 + gaiadmpsetup/gaiadmpsetup.py | 668 +++++++++++++++++++++++++++++++++++ setup.py | 30 ++ 4 files changed, 700 insertions(+), 1 deletion(-) create mode 100644 gaiadmpsetup/__init__.py create mode 100644 gaiadmpsetup/gaiadmpsetup.py create mode 100644 setup.py diff --git a/README.md b/README.md index e367ff1..8ec07f4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ # gaiadmpsetup -Python library used to create the Spark SQL tables required by GaiaDMP +GaiaDMP Setup library diff --git a/gaiadmpsetup/__init__.py b/gaiadmpsetup/__init__.py new file mode 100644 index 0000000..66f3b3f --- /dev/null +++ b/gaiadmpsetup/__init__.py @@ -0,0 +1 @@ +from . import gaiadmpsetup diff --git a/gaiadmpsetup/gaiadmpsetup.py b/gaiadmpsetup/gaiadmpsetup.py new file mode 100644 index 0000000..645095b --- /dev/null +++ b/gaiadmpsetup/gaiadmpsetup.py @@ -0,0 +1,668 @@ +from pyspark.sql.types import * +from pyspark.sql.session import SparkSession + +spark = SparkSession.builder.getOrCreate() + +class GaiaDMPSetup: + """ + Prepare the PySpark env for GaiaDMP + """ + + def __init__(self): + pass + + @staticmethod + def setup(): + + gaia_source_schema = StructType([ + StructField('solution_id', LongType(), True), + StructField('designation', StringType(), True), + StructField('source_id', LongType(), True), + StructField('random_index', LongType(), True), + StructField('ref_epoch', FloatType(), True), + StructField('ra', DoubleType(), True), + StructField('ra_error', DoubleType(), True), + StructField('dec', DoubleType(), True), + StructField('dec_error', DoubleType(), True), + StructField('parallax', DoubleType(), True), + StructField('parallax_error', DoubleType(), True), + StructField('parallax_over_error', FloatType(), True), + StructField('pm', FloatType(), True), + StructField('pmra', DoubleType(), True), + StructField('pmra_error', FloatType(), True), + StructField('pmdec', DoubleType(), True), + StructField('pmdec_error', FloatType(), True), + StructField('ra_dec_corr', FloatType(), True), + StructField('ra_parallax_corr', FloatType(), True), + StructField('ra_pmra_corr', FloatType(), True), + StructField('ra_pmdec_corr', FloatType(), True), + StructField('dec_parallax_corr', FloatType(), True), + StructField('dec_pmra_corr', FloatType(), True), + StructField('dec_pmdec_corr', FloatType(), True), + StructField('parallax_pmra_corr', FloatType(), True), + StructField('parallax_pmdec_corr', FloatType(), True), + StructField('pmra_pmdec_corr', FloatType(), True), + StructField('astrometric_n_obs_al', ShortType(), True), + StructField('astrometric_n_obs_ac', ShortType(), True), + StructField('astrometric_n_good_obs_al', ShortType(), True), + StructField('astrometric_n_bad_obs_al', ShortType(), True), + StructField('astrometric_gof_al', FloatType(), True), + StructField('astrometric_chi2_al', FloatType(), True), + StructField('astrometric_excess_noise', FloatType(), True), + StructField('astrometric_excess_noise_sig', FloatType(), True), + StructField('astrometric_params_solved', ShortType(), True), + StructField('astrometric_primary_flag', BooleanType(), True), + StructField('nu_eff_used_in_astrometry', FloatType(), True), + StructField('pseudocolour', FloatType(), True), + StructField('pseudocolour_error', FloatType(), True), + StructField('ra_pseudocolour_corr', FloatType(), True), + StructField('dec_pseudocolour_corr', FloatType(), True), + StructField('parallax_pseudocolour_corr', FloatType(), True), + StructField('pmra_pseudocolour_corr', FloatType(), True), + StructField('pmdec_pseudocolour_corr', FloatType(), True), + StructField('astrometric_matched_transits', ShortType(), True), + StructField('visibility_periods_used', ShortType(), True), + StructField('astrometric_sigma5d_max', FloatType(), True), + StructField('matched_transits', ShortType(), True), + StructField('new_matched_transits', ShortType(), True), + StructField('matched_transits_removed', ShortType(), True), + StructField('ipd_gof_harmonic_amplitude', FloatType(), True), + StructField('ipd_gof_harmonic_phase', FloatType(), True), + StructField('ipd_frac_multi_peak', ShortType(), True), + StructField('ipd_frac_odd_win', ShortType(), True), + StructField('ruwe', FloatType(), True), + StructField('scan_direction_strength_k1', FloatType(), True), + StructField('scan_direction_strength_k2', FloatType(), True), + StructField('scan_direction_strength_k3', FloatType(), True), + StructField('scan_direction_strength_k4', FloatType(), True), + StructField('scan_direction_mean_k1', FloatType(), True), + StructField('scan_direction_mean_k2', FloatType(), True), + StructField('scan_direction_mean_k3', FloatType(), True), + StructField('scan_direction_mean_k4', FloatType(), True), + StructField('duplicated_source', BooleanType(), True), + StructField('phot_g_n_obs', ShortType(), True), + StructField('phot_g_mean_flux', DoubleType(), True), + StructField('phot_g_mean_flux_error', FloatType(), True), + StructField('phot_g_mean_flux_over_error', FloatType(), True), + StructField('phot_g_mean_mag', FloatType(), True), + StructField('phot_bp_n_obs', ShortType(), True), + StructField('phot_bp_mean_flux', DoubleType(), True), + StructField('phot_bp_mean_flux_error', FloatType(), True), + StructField('phot_bp_mean_flux_over_error', FloatType(), True), + StructField('phot_bp_mean_mag', FloatType(), True), + StructField('phot_rp_n_obs', ShortType(), True), + StructField('phot_rp_mean_flux', DoubleType(), True), + StructField('phot_rp_mean_flux_error', FloatType(), True), + StructField('phot_rp_mean_flux_over_error', FloatType(), True), + StructField('phot_rp_mean_mag', FloatType(), True), + StructField('phot_bp_n_contaminated_transits', ShortType(), True), + StructField('phot_bp_n_blended_transits', ShortType(), True), + StructField('phot_rp_n_contaminated_transits', ShortType(), True), + StructField('phot_rp_n_blended_transits', ShortType(), True), + StructField('phot_proc_mode', ShortType(), True), + StructField('phot_bp_rp_excess_factor', FloatType(), True), + StructField('bp_rp', FloatType(), True), + StructField('bp_g', FloatType(), True), + StructField('g_rp', FloatType(), True), + StructField('dr2_radial_velocity', FloatType(), True), + StructField('dr2_radial_velocity_error', FloatType(), True), + StructField('dr2_rv_nb_transits', ShortType(), True), + StructField('dr2_rv_template_teff', FloatType(), True), + StructField('dr2_rv_template_logg', FloatType(), True), + StructField('dr2_rv_template_fe_h', FloatType(), True), + StructField('l', DoubleType(), True), + StructField('b', DoubleType(), True), + StructField('ecl_lon', DoubleType(), True), + StructField('ecl_lat', DoubleType(), True), + ]) + + twomass_psc_schema = StructType([ + StructField('ra', DoubleType(), True), + StructField('dec', DoubleType(), True), + StructField('err_maj', FloatType(), True), + StructField('err_min', FloatType(), True), + StructField('err_ang', ShortType(), True), + StructField('designation', StringType(), True), + StructField('j_m', FloatType(), True), + StructField('j_cmsig', FloatType(), True), + StructField('j_msigcom', FloatType(), True), + StructField('j_snr', FloatType(), True), + StructField('h_m', FloatType(), True), + StructField('h_cmsig', FloatType(), True), + StructField('h_msigcom', FloatType(), True), + StructField('h_snr', FloatType(), True), + StructField('k_m', FloatType(), True), + StructField('k_cmsig', FloatType(), True), + StructField('k_msigcom', FloatType(), True), + StructField('k_snr', FloatType(), True), + StructField('ph_qual', StringType(), True), + StructField('rd_flg', StringType(), True), + StructField('bl_flg', StringType(), True), + StructField('cc_flg', StringType(), True), + StructField('ndet', StringType(), True), + StructField('prox', FloatType(), True), + StructField('pxpa', ShortType(), True), + StructField('pxcntr', IntegerType(), True), + StructField('gal_contam', ShortType(), True), + StructField('mp_flg', ShortType(), True), + StructField('pts_key', IntegerType(), True), + StructField('hemis', StringType(), True), + StructField('date', DateType(), True), + StructField('scan', ShortType(), True), + StructField('glon', FloatType(), True), + StructField('glat', FloatType(), True), + StructField('x_scan', FloatType(), True), + StructField('jdate', DoubleType(), True), + StructField('j_psfchi', FloatType(), True), + StructField('h_psfchi', FloatType(), True), + StructField('k_psfchi', FloatType(), True), + StructField('j_m_stdap', FloatType(), True), + StructField('j_msig_stdap', FloatType(), True), + StructField('h_m_stdap', FloatType(), True), + StructField('h_msig_stdap', FloatType(), True), + StructField('k_m_stdap', FloatType(), True), + StructField('k_msig_stdap', FloatType(), True), + StructField('dist_edge_ns', IntegerType(), True), + StructField('dist_edge_ew', IntegerType(), True), + StructField('dist_edge_flg', StringType(), True), + StructField('dup_src', ShortType(), True), + StructField('use_src', ShortType(), True), + StructField('a', StringType(), True), + StructField('dist_opt', FloatType(), True), + StructField('phi_opt', ShortType(), True), + StructField('b_m_opt', FloatType(), True), + StructField('vr_m_opt', FloatType(), True), + StructField('nopt_mchs', ShortType(), True), + StructField('ext_key', IntegerType(), True), + StructField('scan_key', IntegerType(), True), + StructField('coadd_key', IntegerType(), True), + StructField('coadd', ShortType(), True), + ]) + + allwise_sc_schema = StructType([ + StructField('designation', StringType(), True), + StructField('ra', DoubleType(), True), + StructField('dec', DoubleType(), True), + StructField('sigra', DoubleType(), True), + StructField('sigdec', DoubleType(), True), + StructField('sigradec', DoubleType(), True), + StructField('glon', DoubleType(), True), + StructField('glat', DoubleType(), True), + StructField('elon', DoubleType(), True), + StructField('elat', DoubleType(), True), + StructField('wx', DoubleType(), True), + StructField('wy', DoubleType(), True), + StructField('cntr', LongType(), True), + StructField('src_id', StringType(), True), + StructField('coadd_id', StringType(), True), + StructField('src', IntegerType(), True), + StructField('w1mpro', DoubleType(), True), + StructField('w1sigmpro', DoubleType(), True), + StructField('w1snr', DoubleType(), True), + StructField('w1rchi2', FloatType(), True), + StructField('w2mpro', DoubleType(), True), + StructField('w2sigmpro', DoubleType(), True), + StructField('w2snr', DoubleType(), True), + StructField('w2rchi2', FloatType(), True), + StructField('w3mpro', DoubleType(), True), + StructField('w3sigmpro', DoubleType(), True), + StructField('w3snr', DoubleType(), True), + StructField('w3rchi2', FloatType(), True), + StructField('w4mpro', DoubleType(), True), + StructField('w4sigmpro', DoubleType(), True), + StructField('w4snr', DoubleType(), True), + StructField('w4rchi2', FloatType(), True), + StructField('rchi2', FloatType(), True), + StructField('nb', IntegerType(), True), + StructField('na', IntegerType(), True), + StructField('w1sat', DoubleType(), True), + StructField('w2sat', DoubleType(), True), + StructField('w3sat', DoubleType(), True), + StructField('w4sat', DoubleType(), True), + StructField('satnum', StringType(), True), + StructField('ra_pm', DoubleType(), True), + StructField('dec_pm', DoubleType(), True), + StructField('sigra_pm', DoubleType(), True), + StructField('sigdec_pm', DoubleType(), True), + StructField('sigradec_pm', DoubleType(), True), + StructField('pmra', IntegerType(), True), + StructField('sigpmra', IntegerType(), True), + StructField('pmdec', IntegerType(), True), + StructField('sigpmdec', IntegerType(), True), + StructField('w1rchi2_pm', FloatType(), True), + StructField('w2rchi2_pm', FloatType(), True), + StructField('w3rchi2_pm', FloatType(), True), + StructField('w4rchi2_pm', FloatType(), True), + StructField('rchi2_pm', FloatType(), True), + StructField('pmcode', StringType(), True), + StructField('cc_flags', StringType(), True), + StructField('rel', StringType(), True), + StructField('ext_flg', IntegerType(), True), + StructField('var_flg', StringType(), True), + StructField('ph_qual', StringType(), True), + StructField('det_bit', IntegerType(), True), + StructField('moon_lev', StringType(), True), + StructField('w1nm', IntegerType(), True), + StructField('w1m', IntegerType(), True), + StructField('w2nm', IntegerType(), True), + StructField('w2m', IntegerType(), True), + StructField('w3nm', IntegerType(), True), + StructField('w3m', IntegerType(), True), + StructField('w4nm', IntegerType(), True), + StructField('w4m', IntegerType(), True), + StructField('w1cov', DoubleType(), True), + StructField('w2cov', DoubleType(), True), + StructField('w3cov', DoubleType(), True), + StructField('w4cov', DoubleType(), True), + StructField('w1cc_map', IntegerType(), True), + StructField('w1cc_map_str', StringType(), True), + StructField('w2cc_map', IntegerType(), True), + StructField('w2cc_map_str', StringType(), True), + StructField('w3cc_map', IntegerType(), True), + StructField('w3cc_map_str', StringType(), True), + StructField('w4cc_map', IntegerType(), True), + StructField('w4cc_map_str', StringType(), True), + StructField('best_use_cntr', LongType(), True), + StructField('ngrp', ShortType(), True), + StructField('w1flux', FloatType(), True), + StructField('w1sigflux', FloatType(), True), + StructField('w1sky', DoubleType(), True), + StructField('w1sigsk', DoubleType(), True), + StructField('w1conf', DoubleType(), True), + StructField('w2flux', FloatType(), True), + StructField('w2sigflux', FloatType(), True), + StructField('w2sky', DoubleType(), True), + StructField('w2sigsk', DoubleType(), True), + StructField('w2conf', DoubleType(), True), + StructField('w3flux', FloatType(), True), + StructField('w3sigflux', FloatType(), True), + StructField('w3sky', DoubleType(), True), + StructField('w3sigsk', DoubleType(), True), + StructField('w3conf', DoubleType(), True), + StructField('w4flux', FloatType(), True), + StructField('w4sigflux', FloatType(), True), + StructField('w4sky', DoubleType(), True), + StructField('w4sigsk', DoubleType(), True), + StructField('w4conf', DoubleType(), True), + StructField('w1mag', DoubleType(), True), + StructField('w1sigm', DoubleType(), True), + StructField('w1flg', IntegerType(), True), + StructField('w1mcor', DoubleType(), True), + StructField('w2mag', DoubleType(), True), + StructField('w2sigm', DoubleType(), True), + StructField('w2flg', IntegerType(), True), + StructField('w2mcor', DoubleType(), True), + StructField('w3mag', DoubleType(), True), + StructField('w3sigm', DoubleType(), True), + StructField('w3flg', IntegerType(), True), + StructField('w3mcor', DoubleType(), True), + StructField('w4mag', DoubleType(), True), + StructField('w4sigm', DoubleType(), True), + StructField('w4flg', IntegerType(), True), + StructField('w4mcor', DoubleType(), True), + StructField('w1mag_1', DoubleType(), True), + StructField('w1sigm_1', DoubleType(), True), + StructField('w1flg_1', IntegerType(), True), + StructField('w2mag_1', DoubleType(), True), + StructField('w2sigm_1', DoubleType(), True), + StructField('w2flg_1', IntegerType(), True), + StructField('w3mag_1', DoubleType(), True), + StructField('w3sigm_1', DoubleType(), True), + StructField('w3flg_1', IntegerType(), True), + StructField('w4mag_1', DoubleType(), True), + StructField('w4sigm_1', DoubleType(), True), + StructField('w4flg_1', IntegerType(), True), + StructField('w1mag_2', DoubleType(), True), + StructField('w1sigm_2', DoubleType(), True), + StructField('w1flg_2', IntegerType(), True), + StructField('w2mag_2', DoubleType(), True), + StructField('w2sigm_2', DoubleType(), True), + StructField('w2flg_2', IntegerType(), True), + StructField('w3mag_2', DoubleType(), True), + StructField('w3sigm_2', DoubleType(), True), + StructField('w3flg_2', IntegerType(), True), + StructField('w4mag_2', DoubleType(), True), + StructField('w4sigm_2', DoubleType(), True), + StructField('w4flg_2', IntegerType(), True), + StructField('w1mag_3', DoubleType(), True), + StructField('w1sigm_3', DoubleType(), True), + StructField('w1flg_3', IntegerType(), True), + StructField('w2mag_3', DoubleType(), True), + StructField('w2sigm_3', DoubleType(), True), + StructField('w2flg_3', IntegerType(), True), + StructField('w3mag_3', DoubleType(), True), + StructField('w3sigm_3', DoubleType(), True), + StructField('w3flg_3', IntegerType(), True), + StructField('w4mag_3', DoubleType(), True), + StructField('w4sigm_3', DoubleType(), True), + StructField('w4flg_3', IntegerType(), True), + StructField('w1mag_4', DoubleType(), True), + StructField('w1sigm_4', DoubleType(), True), + StructField('w1flg_4', IntegerType(), True), + StructField('w2mag_4', DoubleType(), True), + StructField('w2sigm_4', DoubleType(), True), + StructField('w2flg_4', IntegerType(), True), + StructField('w3mag_4', DoubleType(), True), + StructField('w3sigm_4', DoubleType(), True), + StructField('w3flg_4', IntegerType(), True), + StructField('w4mag_4', DoubleType(), True), + StructField('w4sigm_4', DoubleType(), True), + StructField('w4flg_4', IntegerType(), True), + StructField('w1mag_5', DoubleType(), True), + StructField('w1sigm_5', DoubleType(), True), + StructField('w1flg_5', IntegerType(), True), + StructField('w2mag_5', DoubleType(), True), + StructField('w2sigm_5', DoubleType(), True), + StructField('w2flg_5', IntegerType(), True), + StructField('w3mag_5', DoubleType(), True), + StructField('w3sigm_5', DoubleType(), True), + StructField('w3flg_5', IntegerType(), True), + StructField('w4mag_5', DoubleType(), True), + StructField('w4sigm_5', DoubleType(), True), + StructField('w4flg_5', IntegerType(), True), + StructField('w1mag_6', DoubleType(), True), + StructField('w1sigm_6', DoubleType(), True), + StructField('w1flg_6', IntegerType(), True), + StructField('w2mag_6', DoubleType(), True), + StructField('w2sigm_6', DoubleType(), True), + StructField('w2flg_6', IntegerType(), True), + StructField('w3mag_6', DoubleType(), True), + StructField('w3sigm_6', DoubleType(), True), + StructField('w3flg_6', IntegerType(), True), + StructField('w4mag_6', DoubleType(), True), + StructField('w4sigm_6', DoubleType(), True), + StructField('w4flg_6', IntegerType(), True), + StructField('w1mag_7', DoubleType(), True), + StructField('w1sigm_7', DoubleType(), True), + StructField('w1flg_7', IntegerType(), True), + StructField('w2mag_7', DoubleType(), True), + StructField('w2sigm_7', DoubleType(), True), + StructField('w2flg_7', IntegerType(), True), + StructField('w3mag_7', DoubleType(), True), + StructField('w3sigm_7', DoubleType(), True), + StructField('w3flg_7', IntegerType(), True), + StructField('w4mag_7', DoubleType(), True), + StructField('w4sigm_7', DoubleType(), True), + StructField('w4flg_7', IntegerType(), True), + StructField('w1mag_8', DoubleType(), True), + StructField('w1sigm_8', DoubleType(), True), + StructField('w1flg_8', IntegerType(), True), + StructField('w2mag_8', DoubleType(), True), + StructField('w2sigm_8', DoubleType(), True), + StructField('w2flg_8', IntegerType(), True), + StructField('w3mag_8', DoubleType(), True), + StructField('w3sigm_8', DoubleType(), True), + StructField('w3flg_8', IntegerType(), True), + StructField('w4mag_8', DoubleType(), True), + StructField('w4sigm_8', DoubleType(), True), + StructField('w4flg_8', IntegerType(), True), + StructField('w1magp', DoubleType(), True), + StructField('w1sigp1', DoubleType(), True), + StructField('w1sigp2', DoubleType(), True), + StructField('w1k', DoubleType(), True), + StructField('w1ndf', IntegerType(), True), + StructField('w1mlq', DoubleType(), True), + StructField('w1mjdmin', DoubleType(), True), + StructField('w1mjdmax', DoubleType(), True), + StructField('w1mjdmean', DoubleType(), True), + StructField('w2magp', DoubleType(), True), + StructField('w2sigp1', DoubleType(), True), + StructField('w2sigp2', DoubleType(), True), + StructField('w2k', DoubleType(), True), + StructField('w2ndf', IntegerType(), True), + StructField('w2mlq', DoubleType(), True), + StructField('w2mjdmin', DoubleType(), True), + StructField('w2mjdmax', DoubleType(), True), + StructField('w2mjdmean', DoubleType(), True), + StructField('w3magp', DoubleType(), True), + StructField('w3sigp1', DoubleType(), True), + StructField('w3sigp2', DoubleType(), True), + StructField('w3k', DoubleType(), True), + StructField('w3ndf', IntegerType(), True), + StructField('w3mlq', DoubleType(), True), + StructField('w3mjdmin', DoubleType(), True), + StructField('w3mjdmax', DoubleType(), True), + StructField('w3mjdmean', DoubleType(), True), + StructField('w4magp', DoubleType(), True), + StructField('w4sigp1', DoubleType(), True), + StructField('w4sigp2', DoubleType(), True), + StructField('w4k', DoubleType(), True), + StructField('w4ndf', IntegerType(), True), + StructField('w4mlq', DoubleType(), True), + StructField('w4mjdmin', DoubleType(), True), + StructField('w4mjdmax', DoubleType(), True), + StructField('w4mjdmean', DoubleType(), True), + StructField('rho12', IntegerType(), True), + StructField('rho23', IntegerType(), True), + StructField('rho34', IntegerType(), True), + StructField('q12', IntegerType(), True), + StructField('q23', IntegerType(), True), + StructField('q34', IntegerType(), True), + StructField('xscprox', DoubleType(), True), + StructField('w1rsemi', DoubleType(), True), + StructField('w1ba', DoubleType(), True), + StructField('w1pa', DoubleType(), True), + StructField('w1gmag', DoubleType(), True), + StructField('w1gerr', DoubleType(), True), + StructField('w1gflg', IntegerType(), True), + StructField('w2rsemi', DoubleType(), True), + StructField('w2ba', DoubleType(), True), + StructField('w2pa', DoubleType(), True), + StructField('w2gmag', DoubleType(), True), + StructField('w2gerr', DoubleType(), True), + StructField('w2gflg', IntegerType(), True), + StructField('w3rsemi', DoubleType(), True), + StructField('w3ba', DoubleType(), True), + StructField('w3pa', DoubleType(), True), + StructField('w3gmag', DoubleType(), True), + StructField('w3gerr', DoubleType(), True), + StructField('w3gflg', IntegerType(), True), + StructField('w4rsemi', DoubleType(), True), + StructField('w4ba', DoubleType(), True), + StructField('w4pa', DoubleType(), True), + StructField('w4gmag', DoubleType(), True), + StructField('w4gerr', DoubleType(), True), + StructField('w4gflg', IntegerType(), True), + StructField('tmass_key', IntegerType(), True), + StructField('r_2mass', DoubleType(), True), + StructField('pa_2mass', DoubleType(), True), + StructField('n_2mass', IntegerType(), True), + StructField('j_m_2mass', DoubleType(), True), + StructField('j_msig_2mass', DoubleType(), True), + StructField('h_m_2mass', DoubleType(), True), + StructField('h_msig_2mass', DoubleType(), True), + StructField('k_m_2mass', DoubleType(), True), + StructField('k_msig_2mass', DoubleType(), True), + StructField('x', DoubleType(), True), + StructField('y', DoubleType(), True), + StructField('z', DoubleType(), True), + StructField('spt_ind', IntegerType(), True), + StructField('htm20', LongType(), True), + StructField('spare', BooleanType(), True) + ]) + # N.B. that last column is the easiest way to work around the IPAC files having a trailing "|" at the end of each record which is interpereted as another column... d'oh + + panstarrs_dr1_otmo_schema = StructType([ + StructField('obj_id', LongType(), True), + StructField('projection_id', ShortType(), True), + StructField('sky_cell_id', ShortType(), True), + StructField('obj_info_flag', IntegerType(), True), + StructField('quality_flag', ShortType(), True), + StructField('ra_mean', DoubleType(), True), + StructField('dec_mean', DoubleType(), True), + StructField('ra_mean_err', FloatType(), True), + StructField('dec_mean_err', FloatType(), True), + StructField('epoch_mean', DoubleType(), True), + StructField('n_stack_detections', ShortType(), True), + StructField('n_detections', ShortType(), True), + StructField('ng', ShortType(), True), + StructField('nr', ShortType(), True), + StructField('ni', ShortType(), True), + StructField('nz', ShortType(), True), + StructField('ny', ShortType(), True), + StructField('g_qf_perfect', FloatType(), True), + StructField('g_mean_psf_mag', FloatType(), True), + StructField('g_mean_psf_mag_err', FloatType(), True), + StructField('g_mean_psf_mag_std', FloatType(), True), + StructField('g_mean_psf_mag_npt', ShortType(), True), + StructField('g_mean_psf_mag_min', FloatType(), True), + StructField('g_mean_psf_mag_max', FloatType(), True), + StructField('g_mean_kron_mag', FloatType(), True), + StructField('g_mean_kron_mag_err', FloatType(), True), + StructField('g_flags', IntegerType(), True), + StructField('r_qf_perfect', FloatType(), True), + StructField('r_mean_psf_mag', FloatType(), True), + StructField('r_mean_psf_mag_err', FloatType(), True), + StructField('r_mean_psf_mag_std', FloatType(), True), + StructField('r_mean_psf_mag_npt', ShortType(), True), + StructField('r_mean_psf_mag_min', FloatType(), True), + StructField('r_mean_psf_mag_max', FloatType(), True), + StructField('r_mean_kron_mag', FloatType(), True), + StructField('r_mean_kron_mag_err', FloatType(), True), + StructField('r_flags', IntegerType(), True), + StructField('i_qf_perfect', FloatType(), True), + StructField('i_mean_psf_mag', FloatType(), True), + StructField('i_mean_psf_mag_err', FloatType(), True), + StructField('i_mean_psf_mag_std', FloatType(), True), + StructField('i_mean_psf_mag_npt', ShortType(), True), + StructField('i_mean_psf_mag_min', FloatType(), True), + StructField('i_mean_psf_mag_max', FloatType(), True), + StructField('i_mean_kron_mag', FloatType(), True), + StructField('i_mean_kron_mag_err', FloatType(), True), + StructField('i_flags', IntegerType(), True), + StructField('z_qf_perfect', FloatType(), True), + StructField('z_mean_psf_mag', FloatType(), True), + StructField('z_mean_psf_mag_err', FloatType(), True), + StructField('z_mean_psf_mag_std', FloatType(), True), + StructField('z_mean_psf_mag_npt', ShortType(), True), + StructField('z_mean_psf_mag_min', FloatType(), True), + StructField('z_mean_psf_mag_max', FloatType(), True), + StructField('z_mean_kron_mag', FloatType(), True), + StructField('z_mean_kron_mag_err', FloatType(), True), + StructField('z_flags', IntegerType(), True), + StructField('y_qf_perfect', FloatType(), True), + StructField('y_mean_psf_mag', FloatType(), True), + StructField('y_mean_psf_mag_err', FloatType(), True), + StructField('y_mean_psf_mag_std', FloatType(), True), + StructField('y_mean_psf_mag_npt', ShortType(), True), + StructField('y_mean_psf_mag_min', FloatType(), True), + StructField('y_mean_psf_mag_max', FloatType(), True), + StructField('y_mean_kron_mag', FloatType(), True), + StructField('y_mean_kron_mag_err', FloatType(), True), + StructField('y_flags', IntegerType(), True), + ]) + + # crossmatch table schemas: + + # PS1 + panstarrs1_best_neighbour_schema = StructType([ + StructField('source_id', LongType(), True), + StructField('clean_panstarrs1_oid', LongType(), True), + StructField('original_ext_source_id', LongType(), True), + StructField('angular_distance', FloatType(), True), + StructField('number_of_neighbours', ByteType(), True), + StructField('number_of_mates', ByteType(), True), + StructField('xm_flag', ShortType(), True), + ]) + + # ALLWISE: + allwise_best_neighbour_schema = StructType([ + StructField('source_id', LongType(), True), + StructField('original_ext_source_id', StringType(), True), + StructField('angular_distance', FloatType(), True), + StructField('xm_flag', ShortType(), True), + StructField('allwise_oid', IntegerType(), True), + StructField('number_of_neighbours', ByteType(), True), + StructField('number_of_mates', ByteType(), True), + ]) + + # 2MASS: + tmasspscxsc_best_neighbour_schema = StructType([ + StructField('source_id', LongType(), True), + StructField('original_ext_source_id', StringType(), True), + StructField('angular_distance', FloatType(), True), + StructField('xm_flag', ShortType(), True), + StructField('clean_tmass_psc_xsc_oid', IntegerType(), True), + StructField('number_of_neighbours', ByteType(), True), + StructField('number_of_mates', ByteType(), True), + ]) + + # number of buckets for our platform + NUM_BUCKETS = 2048 + + # and to re-establish the resource in a new (or reset) spark context: + def reattachParquetFileResourceToSparkContext(table_name, file_path, *schema_structures): + """ + Creates a Spark (in-memory) meta-record for the table resource specified for querying + through the PySpark SQL API. + + Assumes that the table contains the Gaia source_id attribute and that the files have + been previously partitioned, bucketed and sorted on this field in parquet format + - see function saveToBinnedParquet(). If the table name specified already exists in the + catalogue IT WILL BE REMOVED (but the underlying data, assumed external, will remain). + + Parameters + ---------- + table_name : str + The table name to be used as the identifier in SQL queries etc. + file_path : str + The full disk file system path name to the folder containing the parquet file set. + schema_structures : StructType + One or more schema structures expressed as a StructType object containing a list of + StructField(field_name : str, type : data_type : DataType(), nullable : boolean) + """ + + # put in the columns and their data types ... + table_create_statement = "CREATE TABLE `" + table_name + "` (" + for schema_structure in schema_structures: + for field in schema_structure: + table_create_statement += "`" + field.name + "` " + field.dataType.simpleString() + "," + # ... zapping that extraneous comma at the end + table_create_statement = table_create_statement[:-1] + + # append the organisational details + table_create_statement += ") USING parquet OPTIONS (path '" + file_path + "') " + table_create_statement += "CLUSTERED BY (source_id) SORTED BY (source_id) INTO %d" % ( + NUM_BUCKETS) + " BUCKETS" + + + # scrub any existing record - N.B. tables defined in this way are EXTERNAL, so this statement will not scrub + # the underlying file data set. Also if the table doesn't exist, this will silently do nothing (no exception + # will be thrown). + spark.sql("DROP TABLE IF EXISTS " + table_name) + + # create the table resource + spark.sql(table_create_statement) + + def tablesExist(): + actual_tables = [i.name for i in spark.catalog.listTables()] + expected_tables = ['gaia_source', 'gaia_source_allwise_best_neighbours', 'gaia_source_ps1_best_neighbours', 'gaia_source_tmasspsc_best_neighbours'] + check = all(item in actual_tables for item in expected_tables) + return check + + if not tablesExist(): + # database name to create + database = "gaiaedr3" + + # root data store path: TODO change this to the official one when established. + data_store = "file:////data/gaia/GEDR3/" # "file:////user/nch/PARQUET/REPARTITIONED/" + + # create the database and switch the current SQL database context to it (from default) + spark.sql("create database " + database) + spark.sql("use " + database) + + # create the tables against their corresponding file sets and schema + reattachParquetFileResourceToSparkContext("gaia_source", data_store + "GEDR3_GAIASOURCE", gaia_source_schema) + reattachParquetFileResourceToSparkContext("gaia_source_tmasspsc_best_neighbours", + data_store + "GEDR3_2MASSPSC_BEST_NEIGHBOURS", + tmasspscxsc_best_neighbour_schema, twomass_psc_schema) + reattachParquetFileResourceToSparkContext("gaia_source_allwise_best_neighbours", + data_store + "GEDR3_ALLWISE_BEST_NEIGHBOURS", + allwise_best_neighbour_schema, twomass_psc_schema) + reattachParquetFileResourceToSparkContext("gaia_source_ps1_best_neighbours", + data_store + "GEDR3_PS1_BEST_NEIGHBOURS", + panstarrs1_best_neighbour_schema, panstarrs_dr1_otmo_schema) + + + +GaiaDMPSetup.setup() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..d8e70e0 --- /dev/null +++ b/setup.py @@ -0,0 +1,30 @@ +import os +from setuptools import setup + +# Utility function to read the README file. +# Used for the long_description. It's nice, because now 1) we have a top level +# README file and 2) it's easier to type in the README file than to put a raw +# string in below ... +def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + +setup( + name = "gaiadmpsetup", + version = "0.1.1", + author = "Stelios Voutsinas", + author_email = "stv@roe.ac.uk", + description = ("A setup script for Gaia DMP"), + license = "BSD", + keywords = "gaiadmpsetup", + url = "https://github.com/wfau/aglais", + include_package_data = True, + packages=['gaiadmpsetup'], + long_description="README", + long_description_content_type='text/markdown', + classifiers=[ + "Development Status :: 4 - Beta", + "License :: Free For Home Use", + "Programming Language :: Python" + ] +) +