fit_spectra.py

#!/usr/bin/env python
"""
This fits SpFrame spectra with airglow lines and a simple model for the continuum, 
including zodiacal light. The fit is done using a simple linear regression. 
After running the least squares fit, the fitted lines are separated from the continuum and 
the lines, continuum and residuals are returned independently.

INPUT: * SpFrame flux files as .npy files with wavelength and sky spectra 
       * A list of airglow lines, read from the .txt files in ./AirglowSpectra/cosby/
         (relative to the working directory). These should be in the GitHub repo.

OUTPUT: numpy files identified in the same way as the sky_flux files, with the plate number 
and file identifier "split_fit". The numpy arrays have the following fields: 
PLATE, COLOR, SPECNO, WAVE, LINES, CONT, RESIDS, R

Before running, identify the directory that the spframe flux files are kept and where the airglow lines are
saved. Also identify where you want to save the files generated by this program (SAVE_DIR)

Title: SpFrame Flux Spectra Fit
Author: P. Fagrelius
Date: June, 2017

"""
import glob 
import os
from datetime import datetime
import multiprocessing
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.special import eval_legendre


class FitSpectra(object):
    def __init__(self):
        #####UPDATE THESE DIRECTORIES#######
        #Directory to save data
        self.SAVE_DIR = '/global/cscratch1/sd/parkerf/split_flux/blue_split/' 
        #Directory where all SpFrame flux files reside
        self.SPECTRA_DIR = '/global/cscratch1/sd/parkerf/sky_flux/'
        ####################################

        # Load spectra data
        self.SAVED_FILES = glob.glob(self.SAVE_DIR+"*_split_fit.npy")
        self.SPECTRA_FILES = glob.glob(self.SPECTRA_DIR+"*_calibrated_sky.npy")
        
        #Get metadata
        self.MetaData = np.load(self.SPECTRA_DIR+'meta_rich.npy')
        print("got MetaData")

        #Identify which data you want to look at
        #Options: test (10 total), blue, red, full
        self.ttype = 'blue'

    def run(self, processes=12):
        """Fit every plate file that has not been processed yet.

        Works out which plates still need fitting, loads the airglow line
        list, converts it to vacuum wavelengths, and then maps
        ``fit_and_separate_spectra`` over the outstanding plate files in a
        multiprocessing pool.

        Parameters
        ----------
        processes : int
            Number of worker processes in the pool (default 12, the value
            that used to be hard-coded).
        """
        self.get_plates_needed()
        self.get_airglow_spectra()
        self.get_vac_lines()
        print("got Airglow Lines")

        # One task per plate file. For serial debugging, loop over self.SPECTRA
        # and call self.fit_and_separate_spectra directly instead.
        pool = multiprocessing.Pool(processes=processes)
        try:
            pool.map(self.fit_and_separate_spectra, self.SPECTRA)
        finally:
            # Always release the workers, even when a task raised.
            pool.terminate()
            pool.join()

    
    def get_plates_needed(self):
        """Select the spectra files whose plate has no saved output yet.

        The plate identifier is taken as the four characters immediately in
        front of the ``_split_fit.npy`` / ``_calibrated_sky.npy`` suffix.
        The remaining input files are stored in ``self.SPECTRA`` so that an
        interrupted run can resume without redoing finished plates.
        """
        finished = set(saved[-18:-14] for saved in self.SAVED_FILES)
        self.SPECTRA = [spectra for spectra in self.SPECTRA_FILES
                        if spectra[-23:-19] not in finished]
        print('Will be analyzing %d plate files' % len(self.SPECTRA))


    def get_airglow_spectra(self):
        """This function loads the airglow files from Cosby et al paper and changes the format"""

        self.AIRGLOW_DIR = os.getcwd()+'/AirglowSpectra/cosby/'
        AF = glob.glob(self.AIRGLOW_DIR+'/*.txt')
        AL = []
        for file in AF:
            data = pd.read_csv(file, delim_whitespace=True)
            d = data.to_records(index=False)
            AL.append(np.array(d))
        self.AirglowLines = np.hstack(AL)

    def get_vac_lines(self):
        """Gets only the airglow lines needed to make an appropriate fit. This could be
        different for the blue and red CCDs. """

        b_sig = np.where(self.AirglowLines['obs_eint'] > 5)
        bVL = self.air_to_vac(self.AirglowLines['obs_wave'])
        bVL = bVL[b_sig] #nm to A
        self.BlueVacLines = bVL[bVL < 700]

        r_sig = np.where(self.AirglowLines['obs_eint'] > 5)
        rVL = self.air_to_vac(self.AirglowLines['obs_wave'])
        rVL = rVL[r_sig] #nm to A
        self.RedVacLines = rVL[rVL > 560]


    def clean_spectra(self, spectrum):
        """Takes out all nan/inf so lstsq will run smoothly """

        ok = np.isfinite(spectrum['SKY'])

        self.wave = spectrum['WAVE'][ok]
        self.sky = spectrum['SKY'][ok]
        self.ivar = spectrum['IVAR'][ok]
        self.disp = spectrum['DISP'][ok]


    def air_to_vac(self, wave):
        """Index of refraction to go from wavelength in air to wavelength in vacuum
        Equation from (Edlen 1966)
        vac_wave = n*air_wave
        """
        #Convert to um
        wave_um = wave*.001
        ohm2 = (1./wave_um)**(2)

        #Calculate index at every wavelength
        nn = []
        for x in ohm2:
            n = 1+10**(-8)*(8342.13 + (2406030/float(130.-x)) + (15997/float(389-x)))
            nn.append(n)
        
        #Get new wavelength by multiplying by index of refraction
        vac_wave = nn*wave
        return vac_wave

    def airglow_line_components(self, vaclines, wave_range, disp_range):
        """ Takes each Airglow line included in the analysis and creates a gaussian profile 
        of the line. 
        INPUT: - List of airglow lines wanted to model
               - Wavelength range of the spectra
               - Wavelength dispersion for the wavelength range of the spectra
        OUTPUT: 
               Matrix with all lines used for linear regression. Size[len(wave_range),len(airglow_lines)]
        """

        AA = []
        for line in vaclines:
            AA.append(np.exp(-0.5*((wave_range-line)/disp_range)**2))
        return np.vstack(AA)


    def linear_model(self, spectrum, num_cont, vaclines):
        """ This is the heart of the program. It fits the sky spectrum with Ordinary Least Squares and
        then splits the fitted model into components: lines, continuum and residuals.
        The inputs characterize whether it is a blue or red sky fit. They have slightly different
        continuum models and the airglow lines are split per camera so that it is faster.

        INPUTS:  - spectrum: one sky spectrum (fields WAVE, SKY, IVAR, DISP); it is cleaned with
                   clean_spectra first, which also sets self.wave/self.sky/self.ivar/self.disp
                 - num_cont: the number of Legendre polynomials (orders 0..num_cont-1) used for the
                   continuum. They are evaluated directly on the wavelength values (not rescaled).
                 - vaclines: the set of lines used to fit out the airglow. different for blue/red
        OUTPUTS: list [wave, lines, cont, res, R]
                 - wave: cleaned wavelength grid
                 - lines: fitted airglow-line component
                 - cont: fitted continuum component
                 - res: residuals between the sky flux and the linear model
                 - R: R^2 value to determine goodness of fit
        """

        self.clean_spectra(spectrum)

        # One row per airglow line
        AA = self.airglow_line_components(vaclines, self.wave, self.disp)

        # Continuum model: one row per Legendre order
        AC = np.array([eval_legendre(order, self.wave) for order in range(num_cont)])

        # Design matrix with one column per component: continuum first, then lines
        A = np.vstack((AC, AA)).T

        # Create model
        results = sm.OLS(self.sky, A).fit()
        params = results.params
        model = np.dot(A, params)

        # Separate the two groups of columns
        cont = np.dot(A[:, 0:num_cont], params[0:num_cont])
        lines = np.dot(A[:, num_cont:], params[num_cont:])
        res = self.sky - model

        # R^2 = 1 - SS_res/SS_tot, with the mean computed once
        ss_res = np.sum(res ** 2)
        ss_tot = np.sum((self.sky - np.mean(self.sky)) ** 2)
        R = 1 - (ss_res / ss_tot)

        return [self.wave, lines, cont, res, R]

    def get_specnos(self, spectra_file):
        """Function that runs the linear model and saves the output. The output file is a .npy file that includes
        the parts of the model (lines, continuum, and residuals) along with the wavelengths associated for ease in comparison.
        It also contains some meta data so that these linear models can be correlated with the actual flux.

        This funciton is used in a multiprocessing loop.
        """

        self.plate_num = spectra_file[-23:-19]
        print("Fitting spectra in plate %s" % self.plate_num)
        self.spectra = np.load(spectra_file)    
        self.this_plate = self.MetaData[self.MetaData['PLATE'] == int(self.plate_num)]

        if self.ttype == 'test':
            max_num = 10 
            self.specnos = self.this_plate[0: max_num]['SPECNO']
        elif self.ttype == 'blue':
            max_num = len(self.spectra)-1
            self.specnos = self.this_plate[(self.this_plate['CAMERAS'] == b'b1') | (self.this_plate['CAMERAS'] == b'b2')]['SPECNO']
        elif self.ttype == 'red':
            max_num = len(self.spectra)-1
            self.specnos = self.this_plate[(self.this_plate['CAMERAS'] == b'r1') | (self.this_plate['CAMERAS'] == b'r2')]['SPECNO']
        elif self.ttype == 'full':
            max_num = len(self.spectra)
            self.specnos = self.this_plate['SPECNO']
        else: 
            print("not a valid type. Going to test")
            max_num = 10 #len(spectra) Number of spectra in a given plate that you want to run this for. Mostly for debugging
            self.specnos = np.random.choice(self.this_plate['SPECNO'], size=max_num)

        self.spectra_length = len(self.specnos)

    def fit_and_separate_spectra(self, spectra_file):
        start = datetime.now()
        self.get_specnos(spectra_file)

        data = []
        for specno in self.specnos:
            d = self.fit_and_split_spectrum(specno)
            data.append(d)
        #pool = multiprocessing.Pool(processes=64)
        #data = pool.map(self.fit_and_split_spectrum, self.specnos)
        #pool.terminate()
        #data = np.vstack(data)
        np.save(self.SAVE_DIR+self.plate_num+'_split_fit',data)
        total_time = (datetime.now() - start).total_seconds()
        print("Total time for spectra %s: %f" %(self.plate_num,total_time))


    def fit_and_split_spectrum(self, specno):
        print('splitting spectra %d/%d for plate %s' % (specno, self.spectra_length, self.plate_num))
        this_obs = self.this_plate[self.this_plate['SPECNO'] == specno]

        if len(this_obs) > 1:
            this_obs = this_obs[0]
            print("This observation had more than one specno with that number")
        else:
            pass

        if (this_obs['CAMERAS'] == b'b1') | (this_obs['CAMERAS'] == b'b2'):
            model = self.linear_model(self.spectra[specno], 3, self.BlueVacLines)
        elif (this_obs['CAMERAS'] == b'r1') | (this_obs['CAMERAS'] == b'r2'):
            model = self.linear_model(self.spectra[specno], 2, self.RedVacLines)
        else:
            print("Don't recognize the camera")
            model = [0, 0, 0, 0, 0]
       
        model_fit = np.zeros(len(model[0]),dtype=[('PLATE','i4'), ('COLOR','S2'), ('SPECNO','i4'), ('WAVE','f8'),
                                                  ('LINES','f8'), ('CONT','f8'), ('RESIDS','f8'), ('R','f8')])
        model_fit['PLATE'] = self.plate_num
        model_fit['COLOR'] = this_obs['CAMERAS']
        model_fit['SPECNO'] = specno
        model_fit['WAVE'] = model[0]
        model_fit['LINES'] = model[1]
        model_fit['CONT'] = model[2]
        model_fit['RESIDS'] = model[3]
        model_fit['R'] = model[4]

        return model_fit


if __name__ == "__main__":
    # Script entry point: build the fitter and process all outstanding plates.
    fitter = FitSpectra()
    fitter.run()