From d053c8854227d5994a53406d481cc2db92288c1e Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Fri, 20 Oct 2023 10:47:56 -0500
Subject: [PATCH 01/75] basic code to launch bte in tps code

---
 src/tps-time-loop.py | 266 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 239 insertions(+), 27 deletions(-)

diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index 9bbc719d0..51f47379c 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -1,36 +1,244 @@
 #!/usr/bin/env python3
 import sys
 import os
+from mpi4py import MPI
 import numpy as np
+import scipy.constants
+import csv
+import matplotlib.pyplot as plt
 
-from mpi4py import MPI
+# set path to C++ TPS library
+path = os.path.abspath(os.path.dirname(sys.argv[0]))
+sys.path.append(path + "/.libs")
+sys.path.append(path + "/../../boltzmann/BESolver/python")
+import libtps
+from   bte_0d3v_batched import bte_0d3v_batched as BoltzmannSolver
+import cupy as cp
 
-class BoltzmannMockSolver:
-    def __init__(self):
-        pass
+class BoltzmannSolverParams():
+    sp_order      = 8           # B-spline order in v-space
+    spline_qpts   = 10          # number of Gauss-Legendre quadrature points per knot interval    
+    Nr            = 127         # number of B-splines used in radial direction
+    l_max         = 1           # spherical modes uses, 0, to l_max
+    ev_max        = 16          # v-space grid truncation (eV)
+    n_grids       = 1           # number of v-space grids
+
+    dt            = 1e-2        # [] non-dimentionalized time w.r.t. oscilation period
+    cycles        = 10          # number of max cycles to evolve
+    solver_type   = "transient" # two modes, "transient" or "steady-state"
+    atol          = 1e-16       # absolute tolerance
+    rtol          = 1e-12       # relative tolerance
+    max_iter      = 1000         # max iterations for the newton solver
 
+    ee_collisions = 0           # enable electron-electron Coulombic effects
+    use_gpu       = 1           # enable GPU use (1)-GPU solver, (0)-CPU solver
+    dev_id        = 0           # which GPU device to use only used when use_gpu=1
+
+    collisions    = ["g0","g2"] # collision string g0-elastic, g2-ionization
+    export_csv    = 1           # export the qois to csv file
+    plot_data     = 1
+    
+    Efreq         = 0.0 #[1/s]  # E-field osicllation frequency
+    verbose       = 1           # verbose output for the BTE solver
+    n_pts         = 10          # number of spatial points to launch the BTE solver
+    Te            = 0.5 #[eV]   # approximate electron temperature
+    
+    threads       = 16          # number of threads to use to assemble operators
+    grid_idx      = 0
+    
+    output_dir    = "batched_bte"
+    out_fname     = output_dir + "/tps"
+    
+    # some useful units and conversion factors. 
+    ev_to_K       = (scipy.constants.electron_volt/scipy.constants.Boltzmann) 
+    Td_fac        = 1e-21 #[Vm^2]
+    
+class TPSINDEX():
+    """
+    simple index map to differnt fields, from the TPS arrays
+    """
+    ION_IDX = 0                         # ion      density index
+    ELE_IDX = 1                         # electron density index
+    NEU_IDX = 2                         # neutral  density index
+    
+    EF_RE_IDX = 0                       # Re(E) index
+    EF_IM_IDX = 1                       # Im(E) index
+    
+class Boltzmann0D2VBactchedSolver:
+    def __init__(self, tps):
+        self.tps   = tps
+        self.param = BoltzmannSolverParams()
+        # overide the default params, based on the config.ini file.
+        self.param.Efreq = 0#tps.getRequiredInput("em/current_frequency")
+        self.param.solver_type = "steady-state"
+        #self.param.n_pts       = 10
+        
+        lm_modes        = [[[l,0] for l in range(self.param.l_max+1)]]
+        nr              = np.ones(self.param.n_grids, dtype=np.int32) * self.param.Nr
+        
+        Te              = np.ones(self.param.n_grids) * self.param.Te 
+        ev_max          = np.ones(self.param.n_grids) * self.param.ev_max
+        self.bte_solver = BoltzmannSolver(self.param, ev_max ,Te , nr, lm_modes, self.param.n_grids, self.param.collisions)
+        
+        # compute BTE operators
+        grid_idx        = self.param.grid_idx
+        self.bte_solver.assemble_operators(grid_idx)
+        
     def fetch(self, interface):
-        species_densities = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False)
-        efield = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False)
-        heavy_temperature = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+        grid_idx          = self.param.grid_idx
+        Tg                = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=True)
+        tps_npts          = len(Tg)
+        
+        Te                = np.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=True)
+        rr                = np.array(interface.HostRead(libtps.t2bIndex.ReactionRates), copy=True)
+        efield            = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=True).reshape((2, tps_npts))
+        species_densities = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=True).reshape(3, tps_npts)
+        
+        bte_idx           = Te > (0.4 * self.param.ev_to_K)
+        self.param.n_pts  = len(Te[bte_idx])
+        
+        ni                = species_densities[TPSINDEX.ION_IDX][bte_idx]
+        ne                = species_densities[TPSINDEX.ELE_IDX][bte_idx]
+        n0                = species_densities[TPSINDEX.NEU_IDX][bte_idx]
+        Tg                = Tg[bte_idx]
+        Te                = Te[bte_idx]
+        
+        ne[ne<0]          = 1e-16
+        ni[ni<0]          = 1e-16
+        
+        eRe               = efield[TPSINDEX.EF_RE_IDX][bte_idx]
+        eIm               = efield[TPSINDEX.EF_IM_IDX][bte_idx]
+        eMag              = np.sqrt(eRe**2 + eIm **2)
+        eByn0             = eMag/n0/self.param.Td_fac
+        
+        
+        if self.param.verbose == 1 :
+            print("Boltzmann Solver Inputs")
+            print("Efreq = %.4E [1/s]" %(self.param.Efreq))
+            print("n_pts = %d" % self.param.n_pts)
+            # idx0 = np.argmin(eByn0)
+            # idx1 = np.argmax(eByn0)
+            # print("E/n0  (min)               = %.12E [Td]     \t E/n0 (max) = %.12E [Td]    "%(eByn0[idx0], eByn0[idx1]))
+            # print("at E/n0 min max, Tg       = %.12E [K]      \t Tg         = %.12E [K]     "%(Tg[idx0], Tg[idx1]))
+            # print("at E/n0 min max, Te       = %.12E [K]      \t Te         = %.12E [K]     "%(Te[idx0], Te[idx1]))
+            
+            # print("at E/n0 min max, ne       = %.12E [1/m^3]  \t ne         = %.12E [1/m^3] "%(ne[idx0], ne[idx1]))
+            # print("at E/n0 min max, ni       = %.12E [1/m^3]  \t ni         = %.12E [1/m^3] "%(ni[idx0], ni[idx1]))
+            # print("at E/n0 min max, n0       = %.12E [1/m^3]  \t n0         = %.12E [1/m^3] "%(n0[idx0], n0[idx1]))
+            print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
+            print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg), np.max(Tg)))
+            print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te), np.max(Te)))
+            
+            print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne), np.max(ne)))
+            print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni), np.max(ni)))
+            print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0), np.max(n0)))
+            
+        
+        self.bte_solver.set_boltzmann_parameters(grid_idx, n0, ne, ni, Tg, self.param.solver_type)
+        self.bte_f0    = self.bte_solver.initialize(0, self.param.n_pts, "maxwellian")
+        if self.param.Efreq == 0:
+            ef_t = lambda t : eMag
+        else:
+            ef_t = lambda t : eRe * np.cos(2 * np.pi * self.param.Efreq * t) + eIm * np.sin(2 * np.pi * self.param.Efreq * t)
 
-        print("|| species_densities ||_2 = ", np.linalg.norm(species_densities) )
-        print("|| efield ||_2 = ", np.linalg.norm(efield) )
-        print("||heavy_temperature||_2 = ", np.linalg.norm(heavy_temperature) )
+        if self.param.use_gpu==1:
+            dev_id   = self.param.dev_id
+            self.bte_solver.host_to_device_setup(dev_id, 0)
+    
+            eRe_d     = cp.asarray(eRe)
+            eIm_d     = cp.asarray(eIm)
+            
+            if self.param.Efreq == 0:
+                ef_t = lambda t : cp.sqrt(eRe_d**2 + eIm_d**2)
+            else:
+                ef_t = lambda t : eRe_d * cp.cos(2 * cp.pi * self.param.Efreq * t) + eIm_d * cp.sin(2 * cp.pi * self.param.Efreq * t)
+            
+            self.bte_f0 = cp.asarray(self.bte_f0)
+                
+        self.bte_solver.set_efield_function(ef_t)
+        return        
 
     def solve(self):
-        pass
+        grid_idx = self.param.grid_idx
+        ff , qoi = self.bte_solver.solve(grid_idx, self.bte_f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+        ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[0][1], 500)
+        ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
+
+        if self.param.use_gpu==1:
+            self.bte_solver.device_to_host_setup(self.param.dev_id,grid_idx)
+
+        ff_r     = cp.asnumpy(ff_r)
+        for k, v in qoi.items():
+            qoi[k] = cp.asnumpy(v)
+
+        csv_write = self.param.export_csv
+        if csv_write:
+            fname = self.param.out_fname
+            with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
+                writer = csv.writer(f,delimiter=',')
+                # write the header
+                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
+                for col_idx, g in enumerate(self.param.collisions):
+                    header.append(str(g))
+                    
+                writer.writerow(header)
+                
+                n0 = self.bte_solver._par_bte_params[grid_idx]["n0"]
+                ne = self.bte_solver._par_bte_params[grid_idx]["ne"]
+                ni = self.bte_solver._par_bte_params[grid_idx]["ni"]
+                Tg = self.bte_solver._par_bte_params[grid_idx]["Tg"]
+                
+                eRe = self.bte_solver._par_ef_t(0)
+                eIm = 0 * self.bte_solver._par_ef_t(0)
+                
+                if self.param.use_gpu==1:
+                    eRe = cp.asnumpy(eRe)
+                    eIm = cp.asnumpy(eIm)
+                
+                eMag  = np.sqrt(eRe**2 + eIm**2)
+                data  = np.concatenate((n0.reshape(-1,1), ne.reshape(-1,1), ni.reshape(-1,1), Tg.reshape(-1,1), eMag.reshape(-1,1), qoi["energy"].reshape(-1,1), qoi["mobility"].reshape(-1,1), qoi["diffusion"].reshape(-1,1)), axis=1)
+                for col_idx, g in enumerate(self.param.collisions):
+                    data = np.concatenate((data, qoi["rates"][col_idx].reshape(-1,1)), axis=1)
+                
+                writer.writerows(data)
+
 
+        plot_data    = self.param.plot_data
+        if plot_data:
+            num_sh       = len(self.bte_solver._par_lm[grid_idx])
+            num_subplots = num_sh 
+            num_plt_cols = min(num_sh, 4)
+            num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
+            fig        = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
+            plt_idx    =  1
+            n_pts_step =  self.param.n_pts // 20
+
+            for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
+                plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
+                for ii in range(0, self.param.n_pts, n_pts_step):
+                    fr = np.abs(ff_r[ii, lm_idx, :])
+                    plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
+                
+                plt.xlabel(r"energy (eV)")
+                plt.ylabel(r"$f_%d$"%(lm[0]))
+                plt.grid(visible=True)
+                if lm_idx==0:
+                    plt.legend(prop={'size': 6})
+                    
+                plt_idx +=1
+            
+            #plt_idx = num_sh
+            plt.savefig("%s_plot.png"%(self.param.out_fname))
+        
+        
     def push(self, interface):
-        electron_temperature =  np.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
-        electron_temperature[:] = 1.
+        pass
+        #electron_temperature =  np.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
+        #electron_temperature[:] = 1.
+
 
 
 
-# set path to C++ TPS library
-path = os.path.abspath(os.path.dirname(sys.argv[0]))
-sys.path.append(path + "/.libs")
-import libtps
 
 comm = MPI.COMM_WORLD
 # TPS solver
@@ -42,7 +250,7 @@ def push(self, interface):
 tps.chooseSolver()
 tps.initialize()
 
-boltzmann = BoltzmannMockSolver()
+boltzmann = Boltzmann0D2VBactchedSolver(tps)
 
 interface = libtps.Tps2Boltzmann(tps)
 tps.initInterface(interface)
@@ -51,17 +259,21 @@ def push(self, interface):
 max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
 print("Max Iters: ", max_iters)
 tps.solveBegin()
+tps.solveStep()
+tps.push(interface)
+boltzmann.fetch(interface)
+boltzmann.solve()
 
-while it < max_iters:
-    tps.solveStep()
-    tps.push(interface)
-    boltzmann.fetch(interface)
-    boltzmann.solve()
-    boltzmann.push(interface)
-    tps.fetch(interface)
+# while it < max_iters:
+#     tps.solveStep()
+#     tps.push(interface)
+#     boltzmann.fetch(interface)
+#     boltzmann.solve()
+#     boltzmann.push(interface)
+#     tps.fetch(interface)
     
-    it = it+1
-    print("it, ", it)
+#     it = it+1
+#     print("it, ", it)
 
 tps.solveEnd()
 

From e3eb1120bf032cfb607582dbef79c2fc101022ee Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Sat, 21 Oct 2023 00:48:48 -0500
Subject: [PATCH 02/75] initial dev. of spatially adapted v-grids

---
 src/tps-time-loop.py | 308 +++++++++++++++++++++++++------------------
 1 file changed, 177 insertions(+), 131 deletions(-)

diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index 51f47379c..47dba0e17 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -21,14 +21,14 @@ class BoltzmannSolverParams():
     Nr            = 127         # number of B-splines used in radial direction
     l_max         = 1           # spherical modes uses, 0, to l_max
     ev_max        = 16          # v-space grid truncation (eV)
-    n_grids       = 1           # number of v-space grids
+    n_grids       = 4           # number of v-space grids
 
     dt            = 1e-2        # [] non-dimentionalized time w.r.t. oscilation period
     cycles        = 10          # number of max cycles to evolve
     solver_type   = "transient" # two modes, "transient" or "steady-state"
     atol          = 1e-16       # absolute tolerance
     rtol          = 1e-12       # relative tolerance
-    max_iter      = 1000         # max iterations for the newton solver
+    max_iter      = 1000        # max iterations for the newton solver
 
     ee_collisions = 0           # enable electron-electron Coulombic effects
     use_gpu       = 1           # enable GPU use (1)-GPU solver, (0)-CPU solver
@@ -40,7 +40,6 @@ class BoltzmannSolverParams():
     
     Efreq         = 0.0 #[1/s]  # E-field osicllation frequency
     verbose       = 1           # verbose output for the BTE solver
-    n_pts         = 10          # number of spatial points to launch the BTE solver
     Te            = 0.5 #[eV]   # approximate electron temperature
     
     threads       = 16          # number of threads to use to assemble operators
@@ -52,6 +51,9 @@ class BoltzmannSolverParams():
     # some useful units and conversion factors. 
     ev_to_K       = (scipy.constants.electron_volt/scipy.constants.Boltzmann) 
     Td_fac        = 1e-21 #[Vm^2]
+    c_gamma       = np.sqrt(2 * scipy.constants.elementary_charge / scipy.constants.electron_mass) #[(C/kg)^{1/2}]
+    me            = scipy.constants.electron_mass
+    kB            = scipy.constants.Boltzmann
     
 class TPSINDEX():
     """
@@ -65,170 +67,213 @@ class TPSINDEX():
     EF_IM_IDX = 1                       # Im(E) index
     
 class Boltzmann0D2VBactchedSolver:
+    
     def __init__(self, tps):
         self.tps   = tps
         self.param = BoltzmannSolverParams()
         # overide the default params, based on the config.ini file.
         self.param.Efreq = 0#tps.getRequiredInput("em/current_frequency")
         self.param.solver_type = "steady-state"
-        #self.param.n_pts       = 10
-        
-        lm_modes        = [[[l,0] for l in range(self.param.l_max+1)]]
-        nr              = np.ones(self.param.n_grids, dtype=np.int32) * self.param.Nr
-        
-        Te              = np.ones(self.param.n_grids) * self.param.Te 
-        ev_max          = np.ones(self.param.n_grids) * self.param.ev_max
-        self.bte_solver = BoltzmannSolver(self.param, ev_max ,Te , nr, lm_modes, self.param.n_grids, self.param.collisions)
         
-        # compute BTE operators
-        grid_idx        = self.param.grid_idx
-        self.bte_solver.assemble_operators(grid_idx)
+        self.xp_module          = np
+    
+    def grid_setup(self, interface):
+        xp                = self.xp_module
+        Te                = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
+        Te_min, Te_max    = xp.min(Te), xp.max(Te)
+        Te_b              = xp.linspace(Te_min, Te_max + 1e-12, self.param.n_grids + 1)
         
-    def fetch(self, interface):
-        grid_idx          = self.param.grid_idx
-        Tg                = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=True)
-        tps_npts          = len(Tg)
+        grid_idx_to_spatial_pts_map = list()
+        for b_idx in range(self.param.n_grids):
+            grid_idx_to_spatial_pts_map.append(xp.argwhere(xp.logical_and(Te>= Te_b[b_idx], Te < Te_b[b_idx+1]))[:,0]) 
         
-        Te                = np.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=True)
-        rr                = np.array(interface.HostRead(libtps.t2bIndex.ReactionRates), copy=True)
-        efield            = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=True).reshape((2, tps_npts))
-        species_densities = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=True).reshape(3, tps_npts)
+        self.grid_idx_to_npts            = xp.array([len(a) for a in grid_idx_to_spatial_pts_map], dtype=xp.int32)
+        self.grid_idx_to_spatial_idx_map = grid_idx_to_spatial_pts_map
         
-        bte_idx           = Te > (0.4 * self.param.ev_to_K)
-        self.param.n_pts  = len(Te[bte_idx])
+        assert xp.sum(self.grid_idx_to_npts) == len(Te), "[Error] : TPS spatial points for v-space grid assignment is inconsitant"  # per-grid point counts must add up to the number of TPS points
+        lm_modes                         = [[[l,0] for l in range(self.param.l_max+1)] for grid_idx in range(self.param.n_grids)]
+        nr                               = xp.ones(self.param.n_grids, dtype=np.int32) * self.param.Nr
+        Te                               = xp.array([Te_b[b_idx]  for b_idx in range(self.param.n_grids)]) # xp.ones(self.param.n_grids) * self.param.Te 
+        vth                              = np.sqrt(2* self.param.kB * Te * self.param.ev_to_K  /self.param.me)
+        ev_max                           = (6 * vth / self.param.c_gamma)**2 
+        self.bte_solver                  = BoltzmannSolver(self.param, ev_max ,Te , nr, lm_modes, self.param.n_grids, self.param.collisions)
+
+        if self.param.verbose==1:
+            print("grid energy max (eV) \n", ev_max)
         
-        ni                = species_densities[TPSINDEX.ION_IDX][bte_idx]
-        ne                = species_densities[TPSINDEX.ELE_IDX][bte_idx]
-        n0                = species_densities[TPSINDEX.NEU_IDX][bte_idx]
-        Tg                = Tg[bte_idx]
-        Te                = Te[bte_idx]
+        # compute BTE operators
+        for grid_idx in range(self.param.n_grids):
+            print("setting up grid %d"%(grid_idx))
+            self.bte_solver.assemble_operators(grid_idx)
+            
+        return
         
-        ne[ne<0]          = 1e-16
-        ni[ni<0]          = 1e-16
+    def fetch(self, interface):
+        xp                = self.xp_module
+        gidx_to_pidx_map  = self.grid_idx_to_spatial_idx_map
         
-        eRe               = efield[TPSINDEX.EF_RE_IDX][bte_idx]
-        eIm               = efield[TPSINDEX.EF_IM_IDX][bte_idx]
-        eMag              = np.sqrt(eRe**2 + eIm **2)
-        eByn0             = eMag/n0/self.param.Td_fac
+        heavy_temp        = xp.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+        tps_npts          = len(heavy_temp)
         
+        electron_temp     = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
+        efield            = xp.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
+        species_densities = xp.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, tps_npts)
         
-        if self.param.verbose == 1 :
-            print("Boltzmann Solver Inputs")
-            print("Efreq = %.4E [1/s]" %(self.param.Efreq))
-            print("n_pts = %d" % self.param.n_pts)
-            # idx0 = np.argmin(eByn0)
-            # idx1 = np.argmax(eByn0)
-            # print("E/n0  (min)               = %.12E [Td]     \t E/n0 (max) = %.12E [Td]    "%(eByn0[idx0], eByn0[idx1]))
-            # print("at E/n0 min max, Tg       = %.12E [K]      \t Tg         = %.12E [K]     "%(Tg[idx0], Tg[idx1]))
-            # print("at E/n0 min max, Te       = %.12E [K]      \t Te         = %.12E [K]     "%(Te[idx0], Te[idx1]))
+        for grid_idx in range(self.param.n_grids):
+            bte_idx           = gidx_to_pidx_map[grid_idx]
+            ni                = species_densities[TPSINDEX.ION_IDX][bte_idx]
+            ne                = species_densities[TPSINDEX.ELE_IDX][bte_idx]
+            n0                = species_densities[TPSINDEX.NEU_IDX][bte_idx]
+            Tg                = heavy_temp[bte_idx]
+            Te                = electron_temp[bte_idx]
             
-            # print("at E/n0 min max, ne       = %.12E [1/m^3]  \t ne         = %.12E [1/m^3] "%(ne[idx0], ne[idx1]))
-            # print("at E/n0 min max, ni       = %.12E [1/m^3]  \t ni         = %.12E [1/m^3] "%(ni[idx0], ni[idx1]))
-            # print("at E/n0 min max, n0       = %.12E [1/m^3]  \t n0         = %.12E [1/m^3] "%(n0[idx0], n0[idx1]))
-            print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
-            print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg), np.max(Tg)))
-            print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te), np.max(Te)))
-            
-            print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne), np.max(ne)))
-            print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni), np.max(ni)))
-            print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0), np.max(n0)))
             
+            eRe               = efield[TPSINDEX.EF_RE_IDX][bte_idx]
+            eIm               = efield[TPSINDEX.EF_IM_IDX][bte_idx]
+            eMag              = np.sqrt(eRe**2 + eIm **2)
+            eByn0             = eMag/n0/self.param.Td_fac
         
-        self.bte_solver.set_boltzmann_parameters(grid_idx, n0, ne, ni, Tg, self.param.solver_type)
-        self.bte_f0    = self.bte_solver.initialize(0, self.param.n_pts, "maxwellian")
+            if self.param.verbose == 1 :
+                print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
+                print("Efreq = %.4E [1/s]" %(self.param.Efreq))
+                print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
+                
+                # idx0 = np.argmin(eByn0)
+                # idx1 = np.argmax(eByn0)
+                # print("E/n0  (min)               = %.12E [Td]     \t E/n0 (max) = %.12E [Td]    "%(eByn0[idx0], eByn0[idx1]))
+                # print("at E/n0 min max, Tg       = %.12E [K]      \t Tg         = %.12E [K]     "%(Tg[idx0], Tg[idx1]))
+                # print("at E/n0 min max, Te       = %.12E [K]      \t Te         = %.12E [K]     "%(Te[idx0], Te[idx1]))
+                
+                # print("at E/n0 min max, ne       = %.12E [1/m^3]  \t ne         = %.12E [1/m^3] "%(ne[idx0], ne[idx1]))
+                # print("at E/n0 min max, ni       = %.12E [1/m^3]  \t ni         = %.12E [1/m^3] "%(ni[idx0], ni[idx1]))
+                # print("at E/n0 min max, n0       = %.12E [1/m^3]  \t n0         = %.12E [1/m^3] "%(n0[idx0], n0[idx1]))
+                
+                print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
+                print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg), np.max(Tg)))
+                print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te), np.max(Te)))
+                
+                print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne), np.max(ne)))
+                print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni), np.max(ni)))
+                print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0), np.max(n0)))
+            
+            #self.bte_solver.set_boltzmann_parameters(grid_idx, n0, ne, ni, Tg, self.param.solver_type)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "n0", n0)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "ne", ne)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "ni", ni)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "Tg", Tg)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", eRe)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", eIm)  # imaginary E-field part (previously stored eRe by mistake)
+            
         if self.param.Efreq == 0:
             ef_t = lambda t : eMag
         else:
             ef_t = lambda t : eRe * np.cos(2 * np.pi * self.param.Efreq * t) + eIm * np.sin(2 * np.pi * self.param.Efreq * t)
 
-        if self.param.use_gpu==1:
-            dev_id   = self.param.dev_id
-            self.bte_solver.host_to_device_setup(dev_id, 0)
-    
-            eRe_d     = cp.asarray(eRe)
-            eIm_d     = cp.asarray(eIm)
-            
-            if self.param.Efreq == 0:
-                ef_t = lambda t : cp.sqrt(eRe_d**2 + eIm_d**2)
-            else:
-                ef_t = lambda t : eRe_d * cp.cos(2 * cp.pi * self.param.Efreq * t) + eIm_d * cp.sin(2 * cp.pi * self.param.Efreq * t)
-            
-            self.bte_f0 = cp.asarray(self.bte_f0)
-                
         self.bte_solver.set_efield_function(ef_t)
         return        
 
     def solve(self):
-        grid_idx = self.param.grid_idx
-        ff , qoi = self.bte_solver.solve(grid_idx, self.bte_f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
-        ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[0][1], 500)
-        ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
-
-        if self.param.use_gpu==1:
-            self.bte_solver.device_to_host_setup(self.param.dev_id,grid_idx)
-
-        ff_r     = cp.asnumpy(ff_r)
-        for k, v in qoi.items():
-            qoi[k] = cp.asnumpy(v)
-
-        csv_write = self.param.export_csv
-        if csv_write:
-            fname = self.param.out_fname
-            with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
-                writer = csv.writer(f,delimiter=',')
-                # write the header
-                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
-                for col_idx, g in enumerate(self.param.collisions):
-                    header.append(str(g))
-                    
-                writer.writerow(header)
+        xp = self.xp_module
+        for grid_idx in range(self.param.n_grids):
+            
+            if self.grid_idx_to_npts[grid_idx] ==0:
+                continue
+            
+            if self.param.verbose==1:
+                print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]))
+                f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")
+                self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
+            
+            if self.param.use_gpu==1:
+                dev_id   = self.param.dev_id
+                self.bte_solver.host_to_device_setup(dev_id, grid_idx)
                 
-                n0 = self.bte_solver._par_bte_params[grid_idx]["n0"]
-                ne = self.bte_solver._par_bte_params[grid_idx]["ne"]
-                ni = self.bte_solver._par_bte_params[grid_idx]["ni"]
-                Tg = self.bte_solver._par_bte_params[grid_idx]["Tg"]
+                with cp.cuda.Device(dev_id):
+                    eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                    eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+            
+                    if self.param.Efreq == 0:
+                        ef_t = lambda t : cp.sqrt(eRe_d**2 + eIm_d**2)
+                    else:
+                        ef_t = lambda t : eRe_d * cp.cos(2 * cp.pi * self.param.Efreq * t) + eIm_d * cp.sin(2 * cp.pi * self.param.Efreq * t)
+                        
+                self.bte_solver.set_efield_function(ef_t)            
                 
-                eRe = self.bte_solver._par_ef_t(0)
-                eIm = 0 * self.bte_solver._par_ef_t(0)
+            f0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
+            ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+            
+            ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
+            ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
+
+            if self.param.use_gpu==1:
+                self.bte_solver.device_to_host_setup(self.param.dev_id,grid_idx)
                 
-                if self.param.use_gpu==1:
-                    eRe = cp.asnumpy(eRe)
-                    eIm = cp.asnumpy(eIm)
+            with cp.cuda.Device(dev_id):
+                ff_r     = cp.asnumpy(ff_r)
+                for k, v in qoi.items():
+                    qoi[k] = cp.asnumpy(v)
+
+            csv_write = self.param.export_csv
+            if csv_write:
+                fname    = self.param.out_fname
+                csv_mode = 'a'
                 
-                eMag  = np.sqrt(eRe**2 + eIm**2)
-                data  = np.concatenate((n0.reshape(-1,1), ne.reshape(-1,1), ni.reshape(-1,1), Tg.reshape(-1,1), eMag.reshape(-1,1), qoi["energy"].reshape(-1,1), qoi["mobility"].reshape(-1,1), qoi["diffusion"].reshape(-1,1)), axis=1)
-                for col_idx, g in enumerate(self.param.collisions):
-                    data = np.concatenate((data, qoi["rates"][col_idx].reshape(-1,1)), axis=1)
+                if grid_idx == 0:
+                    csv_mode = 'w'
                 
-                writer.writerows(data)
+                with open("%s_qoi.csv"%fname, csv_mode, encoding='UTF8') as f:
+                    writer = csv.writer(f,delimiter=',')
+                    # write the header
+                    header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
+                    for col_idx, g in enumerate(self.param.collisions):
+                        header.append(str(g))
+                    
+                    if grid_idx ==0:                        
+                        writer.writerow(header)
+                    
+                    n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
+                    ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
+                    ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
+                    Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                    
+                    eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                    eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+                    eMag  = np.sqrt(eRe**2 + eIm**2)
+                    
+                    data  = np.concatenate((n0.reshape(-1,1), ne.reshape(-1,1), ni.reshape(-1,1), Tg.reshape(-1,1), eMag.reshape(-1,1), qoi["energy"].reshape(-1,1), qoi["mobility"].reshape(-1,1), qoi["diffusion"].reshape(-1,1)), axis=1)
+                    for col_idx, g in enumerate(self.param.collisions):
+                        data = np.concatenate((data, qoi["rates"][col_idx].reshape(-1,1)), axis=1)
+                    
+                    writer.writerows(data)
 
 
-        plot_data    = self.param.plot_data
-        if plot_data:
-            num_sh       = len(self.bte_solver._par_lm[grid_idx])
-            num_subplots = num_sh 
-            num_plt_cols = min(num_sh, 4)
-            num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
-            fig        = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
-            plt_idx    =  1
-            n_pts_step =  self.param.n_pts // 20
+            plot_data    = self.param.plot_data
+            if plot_data:
+                num_sh       = len(self.bte_solver._par_lm[grid_idx])
+                num_subplots = num_sh 
+                num_plt_cols = min(num_sh, 4)
+                num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
+                fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
+                plt_idx      =  1
+                n_pts_step   =  self.grid_idx_to_npts[grid_idx] // 20
 
-            for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
-                plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
-                for ii in range(0, self.param.n_pts, n_pts_step):
-                    fr = np.abs(ff_r[ii, lm_idx, :])
-                    plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
-                
-                plt.xlabel(r"energy (eV)")
-                plt.ylabel(r"$f_%d$"%(lm[0]))
-                plt.grid(visible=True)
-                if lm_idx==0:
-                    plt.legend(prop={'size': 6})
+                for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
+                    plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
+                    for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
+                        fr = np.abs(ff_r[ii, lm_idx, :])
+                        plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
                     
-                plt_idx +=1
-            
-            #plt_idx = num_sh
-            plt.savefig("%s_plot.png"%(self.param.out_fname))
+                    plt.xlabel(r"energy (eV)")
+                    plt.ylabel(r"$f_%d$"%(lm[0]))
+                    plt.grid(visible=True)
+                    if lm_idx==0:
+                        plt.legend(prop={'size': 6})
+                        
+                    plt_idx +=1
+                
+                #plt_idx = num_sh
+                plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
+                plt.close()
         
         
     def push(self, interface):
@@ -261,6 +306,7 @@ def push(self, interface):
 tps.solveBegin()
 tps.solveStep()
 tps.push(interface)
+boltzmann.grid_setup(interface)
 boltzmann.fetch(interface)
 boltzmann.solve()
 

From 6ed4cc0a87657e932a1a61d125fb4c070fa836f9 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Sun, 22 Oct 2023 08:57:37 -0500
Subject: [PATCH 03/75] multiple grids batched v-space solver

---
 src/tps-time-loop.py | 124 ++++++++++++++++++++++++++++---------------
 1 file changed, 80 insertions(+), 44 deletions(-)

diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index 47dba0e17..27bbb8fed 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -23,11 +23,11 @@ class BoltzmannSolverParams():
     ev_max        = 16          # v-space grid truncation (eV)
     n_grids       = 4           # number of v-space grids
 
-    dt            = 1e-2        # [] non-dimentionalized time w.r.t. oscilation period
-    cycles        = 10          # number of max cycles to evolve
+    dt            = 1e-3        # [] non-dimentionalized time w.r.t. oscilation period
+    cycles        = 3           # number of max cycles to evolve
     solver_type   = "transient" # two modes, "transient" or "steady-state"
-    atol          = 1e-16       # absolute tolerance
-    rtol          = 1e-12       # relative tolerance
+    atol          = 1e-10       # absolute tolerance
+    rtol          = 1e-10       # relative tolerance
     max_iter      = 1000        # max iterations for the newton solver
 
     ee_collisions = 0           # enable electron-electron Coulombic effects
@@ -72,8 +72,8 @@ def __init__(self, tps):
         self.tps   = tps
         self.param = BoltzmannSolverParams()
         # overide the default params, based on the config.ini file.
-        self.param.Efreq = 0#tps.getRequiredInput("em/current_frequency")
-        self.param.solver_type = "steady-state"
+        self.param.Efreq = tps.getRequiredInput("em/current_frequency")
+        #self.param.solver_type = "steady-state"
         
         self.xp_module          = np
     
@@ -81,11 +81,20 @@ def grid_setup(self, interface):
         xp                = self.xp_module
         Te                = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
         Te_min, Te_max    = xp.min(Te), xp.max(Te)
-        Te_b              = xp.linspace(Te_min, Te_max + 1e-12, self.param.n_grids + 1)
+        Te_b              = xp.linspace(Te_min, Te_max, self.param.n_grids, endpoint=False)
+        
+        dist_mat          = xp.zeros((len(Te), self.param.n_grids))
+        
+        for i in range(self.param.n_grids):
+            dist_mat[:,i] = xp.abs(Te-Te_b[i])
+        
+        membership = xp.argmin(dist_mat, axis=1)
+        
         
         grid_idx_to_spatial_pts_map = list()
         for b_idx in range(self.param.n_grids):
-            grid_idx_to_spatial_pts_map.append(xp.argwhere(xp.logical_and(Te>= Te_b[b_idx], Te < Te_b[b_idx+1]))[:,0]) 
+            #grid_idx_to_spatial_pts_map.append(xp.argwhere(xp.logical_and(Te>= Te_b[b_idx], Te < Te_b[b_idx+1]))[:,0]) 
+            grid_idx_to_spatial_pts_map.append(xp.argwhere(membership==b_idx)[:,0]) 
         
         self.grid_idx_to_npts            = xp.array([len(a) for a in grid_idx_to_spatial_pts_map], dtype=xp.int32)
         self.grid_idx_to_spatial_idx_map = grid_idx_to_spatial_pts_map
@@ -114,6 +123,7 @@ def fetch(self, interface):
         
         heavy_temp        = xp.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
         tps_npts          = len(heavy_temp)
+        self.tps_npts     = tps_npts
         
         electron_temp     = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
         efield            = xp.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
@@ -173,9 +183,14 @@ def fetch(self, interface):
         return        
 
     def solve(self):
-        xp = self.xp_module
-        for grid_idx in range(self.param.n_grids):
+        xp               = self.xp_module
+        csv_write        = self.param.export_csv
+        gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
+        
+        if csv_write ==1 : 
+            data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
             
+        for grid_idx in range(self.param.n_grids):
             if self.grid_idx_to_npts[grid_idx] ==0:
                 continue
             
@@ -200,8 +215,12 @@ def solve(self):
                 self.bte_solver.set_efield_function(ef_t)            
                 
             f0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
-            ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
-            
+            try:
+                ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+            except:
+                print("solver failed for v-space gird no %d"%(grid_idx))
+                continue
+                
             ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
             ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
 
@@ -212,43 +231,35 @@ def solve(self):
                 ff_r     = cp.asnumpy(ff_r)
                 for k, v in qoi.items():
                     qoi[k] = cp.asnumpy(v)
-
-            csv_write = self.param.export_csv
-            if csv_write:
-                fname    = self.param.out_fname
-                csv_mode = 'a'
-                
-                if grid_idx == 0:
-                    csv_mode = 'w'
-                
-                with open("%s_qoi.csv"%fname, csv_mode, encoding='UTF8') as f:
-                    writer = csv.writer(f,delimiter=',')
-                    # write the header
-                    header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
-                    for col_idx, g in enumerate(self.param.collisions):
-                        header.append(str(g))
-                    
-                    if grid_idx ==0:                        
-                        writer.writerow(header)
                     
-                    n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
-                    ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
-                    ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
-                    Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
-                    
-                    eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                    eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-                    eMag  = np.sqrt(eRe**2 + eIm**2)
-                    
-                    data  = np.concatenate((n0.reshape(-1,1), ne.reshape(-1,1), ni.reshape(-1,1), Tg.reshape(-1,1), eMag.reshape(-1,1), qoi["energy"].reshape(-1,1), qoi["mobility"].reshape(-1,1), qoi["diffusion"].reshape(-1,1)), axis=1)
-                    for col_idx, g in enumerate(self.param.collisions):
-                        data = np.concatenate((data, qoi["rates"][col_idx].reshape(-1,1)), axis=1)
+            if csv_write==1:
+                data_csv[gidx_to_pidx_map[grid_idx], 0]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
+                data_csv[gidx_to_pidx_map[grid_idx], 1]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
+                data_csv[gidx_to_pidx_map[grid_idx], 2]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
+                data_csv[gidx_to_pidx_map[grid_idx], 3]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                data_csv[gidx_to_pidx_map[grid_idx], 4]    = np.sqrt(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")**2 + self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")**2)
+                data_csv[gidx_to_pidx_map[grid_idx], 5]    = qoi["energy"]
+                data_csv[gidx_to_pidx_map[grid_idx], 6]    = qoi["mobility"]
+                data_csv[gidx_to_pidx_map[grid_idx], 7]    = qoi["diffusion"]
+                
+                for col_idx, g in enumerate(self.param.collisions):
+                    data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = qoi["rates"][col_idx]
                     
-                    writer.writerows(data)
-
+                
+                
 
             plot_data    = self.param.plot_data
             if plot_data:
+                
+                n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
+                ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
+                ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
+                Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                
+                eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+                eMag  = np.sqrt(eRe**2 + eIm**2)
+                
                 num_sh       = len(self.bte_solver._par_lm[grid_idx])
                 num_subplots = num_sh 
                 num_plt_cols = min(num_sh, 4)
@@ -276,6 +287,31 @@ def solve(self):
                 plt.close()
         
         
+        if csv_write:
+            fname    = self.param.out_fname
+            with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
+                writer = csv.writer(f,delimiter=',')
+                # write the header
+                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
+                for col_idx, g in enumerate(self.param.collisions):
+                    header.append(str(g))
+                
+                writer.writerow(header)
+                # n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
+                # ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
+                # ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
+                # Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                
+                # eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                # eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+                # eMag  = np.sqrt(eRe**2 + eIm**2)
+                
+                # data  = np.concatenate((n0.reshape(-1,1), ne.reshape(-1,1), ni.reshape(-1,1), Tg.reshape(-1,1), eMag.reshape(-1,1), qoi["energy"].reshape(-1,1), qoi["mobility"].reshape(-1,1), qoi["diffusion"].reshape(-1,1)), axis=1)
+                # for col_idx, g in enumerate(self.param.collisions):
+                #     data = np.concatenate((data, qoi["rates"][col_idx].reshape(-1,1)), axis=1)
+                
+                writer.writerows(data_csv)
+        
     def push(self, interface):
         pass
         #electron_temperature =  np.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)

From 7a2d75bf41cefa8aa961778b27b42748d202c778 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Sun, 22 Oct 2023 17:53:28 -0500
Subject: [PATCH 04/75] multiple v-space grids added, and boltzmann to tps push
 code added.

---
 src/tps-time-loop.py | 84 ++++++++++++++++++++++++++++++++------------
 1 file changed, 62 insertions(+), 22 deletions(-)

diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index 27bbb8fed..9094fa6c5 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -6,6 +6,7 @@
 import scipy.constants
 import csv
 import matplotlib.pyplot as plt
+from time import perf_counter as time
 
 # set path to C++ TPS library
 path = os.path.abspath(os.path.dirname(sys.argv[0]))
@@ -24,7 +25,7 @@ class BoltzmannSolverParams():
     n_grids       = 4           # number of v-space grids
 
     dt            = 1e-3        # [] non-dimentionalized time w.r.t. oscilation period
-    cycles        = 3           # number of max cycles to evolve
+    cycles        = 10             # number of max cycles to evolve
     solver_type   = "transient" # two modes, "transient" or "steady-state"
     atol          = 1e-10       # absolute tolerance
     rtol          = 1e-10       # relative tolerance
@@ -45,7 +46,7 @@ class BoltzmannSolverParams():
     threads       = 16          # number of threads to use to assemble operators
     grid_idx      = 0
     
-    output_dir    = "batched_bte"
+    output_dir    = "batched_bte1"
     out_fname     = output_dir + "/tps"
     
     # some useful units and conversion factors. 
@@ -72,30 +73,52 @@ def __init__(self, tps):
         self.tps   = tps
         self.param = BoltzmannSolverParams()
         # overide the default params, based on the config.ini file.
-        self.param.Efreq = tps.getRequiredInput("em/current_frequency")
-        #self.param.solver_type = "steady-state"
+        self.param.Efreq = 0 #tps.getRequiredInput("em/current_frequency")
+        self.param.solver_type = "steady-state"
         
         self.xp_module          = np
+        
+        boltzmann_dir           = self.param.output_dir
+        isExist = os.path.exists(boltzmann_dir)
+        if not isExist:
+           # Create a new directory because it does not exist
+           os.makedirs(boltzmann_dir)
+           #print("directory %s is created!"%(dir_name))
+        return
+    
+    def parse_config_file(self):
+        """
+        add the configuaraion file parse code here, 
+        which overides the default BoltzmannSolverParams
+        """
+        pass
     
     def grid_setup(self, interface):
+        """
+        Perform the boltzmann grid setup. 
+        we generate v-space grid for each spatial point cluster in the parameter space, 
+        where, at the moment the clustering is determined based on the electron temperature
+        computed from the TPS code. 
+        """
         xp                = self.xp_module
         Te                = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
         Te_min, Te_max    = xp.min(Te), xp.max(Te)
         Te_b              = xp.linspace(Te_min, Te_max, self.param.n_grids, endpoint=False)
         
+        t1                = time()
         dist_mat          = xp.zeros((len(Te), self.param.n_grids))
         
         for i in range(self.param.n_grids):
             dist_mat[:,i] = xp.abs(Te-Te_b[i])
         
         membership = xp.argmin(dist_mat, axis=1)
-        
-        
         grid_idx_to_spatial_pts_map = list()
         for b_idx in range(self.param.n_grids):
             #grid_idx_to_spatial_pts_map.append(xp.argwhere(xp.logical_and(Te>= Te_b[b_idx], Te < Te_b[b_idx+1]))[:,0]) 
             grid_idx_to_spatial_pts_map.append(xp.argwhere(membership==b_idx)[:,0]) 
         
+        np.save("%s_gidx_to_pidx.npy"%(self.param.out_fname), np.array(grid_idx_to_spatial_pts_map, dtype=object), allow_pickle=True)
+        
         self.grid_idx_to_npts            = xp.array([len(a) for a in grid_idx_to_spatial_pts_map], dtype=xp.int32)
         self.grid_idx_to_spatial_idx_map = grid_idx_to_spatial_pts_map
         
@@ -114,7 +137,9 @@ def grid_setup(self, interface):
         for grid_idx in range(self.param.n_grids):
             print("setting up grid %d"%(grid_idx))
             self.bte_solver.assemble_operators(grid_idx)
-            
+        
+        t2=time()
+        print("time for boltzmann grid setup = %.4E"%(t2-t1))
         return
         
     def fetch(self, interface):
@@ -148,16 +173,6 @@ def fetch(self, interface):
                 print("Efreq = %.4E [1/s]" %(self.param.Efreq))
                 print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
                 
-                # idx0 = np.argmin(eByn0)
-                # idx1 = np.argmax(eByn0)
-                # print("E/n0  (min)               = %.12E [Td]     \t E/n0 (max) = %.12E [Td]    "%(eByn0[idx0], eByn0[idx1]))
-                # print("at E/n0 min max, Tg       = %.12E [K]      \t Tg         = %.12E [K]     "%(Tg[idx0], Tg[idx1]))
-                # print("at E/n0 min max, Te       = %.12E [K]      \t Te         = %.12E [K]     "%(Te[idx0], Te[idx1]))
-                
-                # print("at E/n0 min max, ne       = %.12E [1/m^3]  \t ne         = %.12E [1/m^3] "%(ne[idx0], ne[idx1]))
-                # print("at E/n0 min max, ni       = %.12E [1/m^3]  \t ni         = %.12E [1/m^3] "%(ni[idx0], ni[idx1]))
-                # print("at E/n0 min max, n0       = %.12E [1/m^3]  \t n0         = %.12E [1/m^3] "%(n0[idx0], n0[idx1]))
-                
                 print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
                 print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg), np.max(Tg)))
                 print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te), np.max(Te)))
@@ -183,13 +198,19 @@ def fetch(self, interface):
         return        
 
     def solve(self):
+        """
+        perform the BTE solve, supports both stead-state solution (static E-field) 
+        and time-periodic solutions for the oscillatory E-fields
+        """
         xp               = self.xp_module
         csv_write        = self.param.export_csv
         gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
         
         if csv_write ==1 : 
             data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
-            
+        
+        t1 = time()
+        self.qoi = list()            
         for grid_idx in range(self.param.n_grids):
             if self.grid_idx_to_npts[grid_idx] ==0:
                 continue
@@ -217,9 +238,12 @@ def solve(self):
             f0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
             try:
                 ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+                self.qoi.append(qoi)
             except:
                 print("solver failed for v-space gird no %d"%(grid_idx))
-                continue
+                # self.qoi.append(None)
+                # continue
+                sys.exit(0)
                 
             ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
             ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
@@ -286,6 +310,8 @@ def solve(self):
                 plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
                 plt.close()
         
+        t2 = time()
+        print("time for boltzmann v-space solve = %.4E"%(t2- t1))
         
         if csv_write:
             fname    = self.param.out_fname
@@ -313,10 +339,23 @@ def solve(self):
                 writer.writerows(data_csv)
         
     def push(self, interface):
-        pass
-        #electron_temperature =  np.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
-        #electron_temperature[:] = 1.
+        Te               = np.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
+        rate_coeff       = np.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False).reshape((2, self.tps_npts))
+        
+        gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
+        
+        for grid_idx in range(self.param.n_grids):
+            Te[gidx_to_pidx_map[grid_idx]]            = self.qoi[grid_idx]["energy"]/1.5
+            rr                                        = self.qoi[grid_idx]["rates"]
+            # here rr should be in the same ordering as the collision model prescribed to the Boltzmann solver. 
+            
+            rate_coeff[0][gidx_to_pidx_map[grid_idx]] = rr[0]
+            rate_coeff[1][gidx_to_pidx_map[grid_idx]] = rr[1]
 
+        rate_coeff[1][rate_coeff[1]<0] = 0.0
+            
+        return 
+        
 
 
 
@@ -345,6 +384,7 @@ def push(self, interface):
 boltzmann.grid_setup(interface)
 boltzmann.fetch(interface)
 boltzmann.solve()
+boltzmann.push(interface)
 
 # while it < max_iters:
 #     tps.solveStep()

From 01e51bfba0533c32761e2bd74a4ee3ce8e3cc052 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Wed, 25 Oct 2023 10:44:13 -0500
Subject: [PATCH 05/75] tps bte batched solver integration with Parla.

---
 src/tps-time-loop.py | 358 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 315 insertions(+), 43 deletions(-)

diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index 9094fa6c5..4b04c0478 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -7,6 +7,47 @@
 import csv
 import matplotlib.pyplot as plt
 from time import perf_counter as time
+import configparser
+import cupy as cp
+import enum
+
+class profile_t:
+    def __init__(self,name):
+        self.name = name
+        self.seconds=0
+        self.snap=0
+        self._pri_time =0
+        self.iter =0
+
+    def __add__(self,o):
+        assert(self.name==o.name)
+        self.seconds+=o.seconds
+        self.snap+=o.snap
+        self.iter+=o.iter
+        return self
+
+    def start(self):
+        self._pri_time = time()
+    
+    def stop(self):
+        self.seconds-=self._pri_time
+        self.snap=-self._pri_time
+
+        self._pri_time = time()
+
+        self.seconds +=self._pri_time
+        self.snap  += self._pri_time
+        self.iter+=1
+    
+    def reset(self):
+        self.seconds=0
+        self.snap=0
+        self._pri_time =0
+        self.iter =0
+
+def min_mean_max(a, comm: MPI.Comm):
+    return (comm.allreduce(a, MPI.MIN) , comm.allreduce(a, MPI.SUM)/comm.Get_size(), comm.allreduce(a, MPI.MAX))
+
 
 # set path to C++ TPS library
 path = os.path.abspath(os.path.dirname(sys.argv[0]))
@@ -14,11 +55,26 @@
 sys.path.append(path + "/../../boltzmann/BESolver/python")
 import libtps
 from   bte_0d3v_batched import bte_0d3v_batched as BoltzmannSolver
-import cupy as cp
+
+WITH_PARLA = 1
+if WITH_PARLA:
+    try:
+        from parla import Parla
+        from parla.tasks import spawn, TaskSpace
+        from parla.devices import cpu, gpu
+    except:
+        print("Error occured during Parla import. Please make sure Parla is installed properly.")
+        sys.exit(0)
+
+
+class pp(enum.IntEnum):
+    SETUP         = 0
+    SOLVE         = 1
+    LAST          = 2
 
 class BoltzmannSolverParams():
-    sp_order      = 8           # B-spline order in v-space
-    spline_qpts   = 10          # number of Gauss-Legendre quadrature points per knot interval    
+    sp_order      = 3           # B-spline order in v-space
+    spline_qpts   = 5           # number of Gauss-Legendre quadrature points per knot interval    
     Nr            = 127         # number of B-splines used in radial direction
     l_max         = 1           # spherical modes uses, 0, to l_max
     ev_max        = 16          # v-space grid truncation (eV)
@@ -69,12 +125,12 @@ class TPSINDEX():
     
 class Boltzmann0D2VBactchedSolver:
     
-    def __init__(self, tps):
+    def __init__(self, tps, comm):
         self.tps   = tps
+        self.comm : MPI.Comm  = comm
         self.param = BoltzmannSolverParams()
         # overide the default params, based on the config.ini file.
-        self.param.Efreq = 0 #tps.getRequiredInput("em/current_frequency")
-        self.param.solver_type = "steady-state"
+        self.parse_config_file(sys.argv[2])
         
         self.xp_module          = np
         
@@ -84,14 +140,52 @@ def __init__(self, tps):
            # Create a new directory because it does not exist
            os.makedirs(boltzmann_dir)
            #print("directory %s is created!"%(dir_name))
+           
+        profile_tt  = [None] * int(pp.LAST)
+        profile_nn  = ["setup", "solve", "last"]
+        for i in range(pp.LAST):
+            profile_tt[i] = profile_t(profile_nn[i])
+        
+        self.profile_tt = profile_tt
+        self.profile_nn = profile_nn
+
         return
     
-    def parse_config_file(self):
+    def parse_config_file(self, fname):
         """
         add the configuaraion file parse code here, 
         which overides the default BoltzmannSolverParams
         """
-        pass
+        config = configparser.ConfigParser()
+        print("[Boltzmann] reading configure file given by : ", fname)
+        config.read(fname)
+        
+        self.param.sp_order         = int(config.get("boltzmannSolver", "sp_order").split("#")[0].strip())
+        self.param.spline_qpts      = int(config.get("boltzmannSolver", "spline_qpts").split("#")[0].strip())
+        
+        self.param.Nr               = int(config.get("boltzmannSolver", "Nr").split("#")[0].strip())
+        self.param.l_max            = int(config.get("boltzmannSolver", "l_max").split("#")[0].strip())
+        self.param.n_grids          = int(config.get("boltzmannSolver", "n_grids").split("#")[0].strip())
+        self.param.dt               = float(config.get("boltzmannSolver", "dt").split("#")[0].strip())
+        self.param.cycles           = float(config.get("boltzmannSolver", "cycles").split("#")[0].strip())
+        self.param.solver_type      = str(config.get("boltzmannSolver", "solver_type").split("#")[0].strip()) 
+        self.param.atol             = float(config.get("boltzmannSolver", "atol").split("#")[0].strip())
+        self.param.rtol             = float(config.get("boltzmannSolver", "rtol").split("#")[0].strip())
+        self.param.max_iter         = int(config.get("boltzmannSolver", "max_iter").split("#")[0].strip())
+        self.param.ee_collisions    = int(config.get("boltzmannSolver", "ee_collisions").split("#")[0].strip())
+        self.param.use_gpu          = int(config.get("boltzmannSolver", "use_gpu").split("#")[0].strip())
+        #self.param.collisions       = config.get("boltzmannSolver", "collisions").split("#")[0]
+        
+        self.param.export_csv       = int(config.get("boltzmannSolver", "export_csv").split("#")[0].strip())
+        self.param.plot_data        = int(config.get("boltzmannSolver", "plot_data").split("#")[0].strip())
+        self.param.Efreq            = float(config.get("boltzmannSolver", "Efreq").split("#")[0].strip())
+        self.param.verbose          = int(config.get("boltzmannSolver", "verbose").split("#")[0].strip())
+        self.param.Te               = float(config.get("boltzmannSolver", "Te").split("#")[0].strip())
+
+        self.param.threads          = int(config.get("boltzmannSolver", "threads").split("#")[0].strip())
+        self.param.output_dir       = str(config.get("boltzmannSolver", "output_dir").split("#")[0].strip())
+        self.param.out_fname        = self.param.output_dir + "/" + str(config.get("boltzmannSolver", "output_fname").split("#")[0].strip())
+        return 
     
     def grid_setup(self, interface):
         """
@@ -100,12 +194,14 @@ def grid_setup(self, interface):
         where, at the moment the clustering is determined based on the electron temperature
         computed from the TPS code. 
         """
+        
+        self.profile_tt[pp.SETUP].start()
+        
         xp                = self.xp_module
         Te                = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
         Te_min, Te_max    = xp.min(Te), xp.max(Te)
         Te_b              = xp.linspace(Te_min, Te_max, self.param.n_grids, endpoint=False)
         
-        t1                = time()
         dist_mat          = xp.zeros((len(Te), self.param.n_grids))
         
         for i in range(self.param.n_grids):
@@ -131,15 +227,14 @@ def grid_setup(self, interface):
         self.bte_solver                  = BoltzmannSolver(self.param, ev_max ,Te , nr, lm_modes, self.param.n_grids, self.param.collisions)
 
         if self.param.verbose==1:
-            print("grid energy max (eV) \n", ev_max)
+            print("grid energy max (eV) \n", ev_max, flush = True)
         
         # compute BTE operators
         for grid_idx in range(self.param.n_grids):
-            print("setting up grid %d"%(grid_idx))
+            print("setting up grid %d"%(grid_idx), flush = True)
             self.bte_solver.assemble_operators(grid_idx)
         
-        t2=time()
-        print("time for boltzmann grid setup = %.4E"%(t2-t1))
+        self.profile_tt[pp.SETUP].stop()
         return
         
     def fetch(self, interface):
@@ -189,12 +284,6 @@ def fetch(self, interface):
             self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", eRe)
             self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", eRe)
             
-        if self.param.Efreq == 0:
-            ef_t = lambda t : eMag
-        else:
-            ef_t = lambda t : eRe * np.cos(2 * np.pi * self.param.Efreq * t) + eIm * np.sin(2 * np.pi * self.param.Efreq * t)
-
-        self.bte_solver.set_efield_function(ef_t)
         return        
 
     def solve(self):
@@ -202,21 +291,34 @@ def solve(self):
         perform the BTE solve, supports both stead-state solution (static E-field) 
         and time-periodic solutions for the oscillatory E-fields
         """
+        
+        if WITH_PARLA==1:
+            self.solve_with_parla()
+            return
+        else:
+            self.solve_seq()
+            return
+        
+    def solve_seq(self):
         xp               = self.xp_module
         csv_write        = self.param.export_csv
         gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
         
+        self.qoi         = [None for grid_idx in range(self.param.n_grids)]
+        self.ff          = [None for grid_idx in range(self.param.n_grids)]
+        
         if csv_write ==1 : 
             data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
         
         t1 = time()
-        self.qoi = list()            
+        
         for grid_idx in range(self.param.n_grids):
+            
             if self.grid_idx_to_npts[grid_idx] ==0:
                 continue
             
             if self.param.verbose==1:
-                print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]))
+                print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
                 f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")
                 self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
             
@@ -227,24 +329,36 @@ def solve(self):
                 with cp.cuda.Device(dev_id):
                     eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
                     eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-            
+
                     if self.param.Efreq == 0:
-                        ef_t = lambda t : cp.sqrt(eRe_d**2 + eIm_d**2)
+                        ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
                     else:
-                        ef_t = lambda t : eRe_d * cp.cos(2 * cp.pi * self.param.Efreq * t) + eIm_d * cp.sin(2 * cp.pi * self.param.Efreq * t)
+                        ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
                         
-                self.bte_solver.set_efield_function(ef_t)            
-                
+            else:
+                eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+            
+                if self.param.Efreq == 0:
+                    ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
+                else:
+                    ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
+                            
+            self.bte_solver.set_efield_function(grid_idx, ef_t)            
             f0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
             try:
                 ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
-                self.qoi.append(qoi)
+                self.qoi[grid_idx] = qoi
+                self.ff [grid_idx] = ff
             except:
                 print("solver failed for v-space gird no %d"%(grid_idx))
                 # self.qoi.append(None)
                 # continue
                 sys.exit(0)
-                
+            
+            if self.param.export_csv ==0 and self.param.plot_data==0:
+                continue
+            
             ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
             ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
 
@@ -269,9 +383,6 @@ def solve(self):
                 for col_idx, g in enumerate(self.param.collisions):
                     data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = qoi["rates"][col_idx]
                     
-                
-                
-
             plot_data    = self.param.plot_data
             if plot_data:
                 
@@ -323,21 +434,182 @@ def solve(self):
                     header.append(str(g))
                 
                 writer.writerow(header)
-                # n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
-                # ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
-                # ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
-                # Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                writer.writerows(data_csv)
+
+        return
+    
+    def solve_with_parla(self):
+        csv_write        = self.param.export_csv
+        gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
+        self.qoi         = [None for grid_idx in range(self.param.n_grids)]
+        self.ff          = [None for grid_idx in range(self.param.n_grids)]
+        
+        if csv_write ==1 : 
+            data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
+        
+        
+        rank = self.comm.Get_rank()
+        npes = self.comm.Get_size()
+        
+        with Parla():
+            num_gpus         = len(gpu)
+            grid_to_device_map = lambda gidx : gidx % num_gpus
+            @spawn(placement=cpu, vcus=0)
+            async def __main__():
+                self.profile_tt[pp.SETUP].start()
+                ts_0 = TaskSpace("T")
+                for grid_idx in range(self.param.n_grids):
+                    @spawn(ts_0[grid_idx], placement=[cpu], vcus=0.0)
+                    def t0():
+                        print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
+                        f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")
+                        self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
+                        
+                        if self.param.use_gpu == 1:
+                            dev_id  = grid_to_device_map(grid_idx)
+                            self.bte_solver.host_to_device_setup(dev_id, grid_idx)
+                            xp      = cp
+
+                            with cp.cuda.Device(dev_id):
+                                eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                                eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+            
+                                if self.param.Efreq == 0:
+                                    ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
+                                else:
+                                    ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
+                        else:
+                            xp = np
+                            eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                            eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+        
+                            if self.param.Efreq == 0:
+                                ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
+                            else:
+                                ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
+                                    
+                        self.bte_solver.set_efield_function(grid_idx, ef_t)
+                        return
                 
-                # eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                # eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-                # eMag  = np.sqrt(eRe**2 + eIm**2)
+                await ts_0
                 
-                # data  = np.concatenate((n0.reshape(-1,1), ne.reshape(-1,1), ni.reshape(-1,1), Tg.reshape(-1,1), eMag.reshape(-1,1), qoi["energy"].reshape(-1,1), qoi["mobility"].reshape(-1,1), qoi["diffusion"].reshape(-1,1)), axis=1)
-                # for col_idx, g in enumerate(self.param.collisions):
-                #     data = np.concatenate((data, qoi["rates"][col_idx].reshape(-1,1)), axis=1)
+                self.profile_tt[pp.SETUP].stop()
+                if self.param.use_gpu==1:
+                    p1 = [gpu(grid_to_device_map(grid_idx)) for grid_idx in range(self.param.n_grids)]
+                else:
+                    p1 = [cpu for grid_idx in range(self.param.n_grids)]
                 
-                writer.writerows(data_csv)
+                self.profile_tt[pp.SOLVE].start()
+                ts_1 = TaskSpace("T")
+                for grid_idx in range(self.param.n_grids):
+                    @spawn(ts_1[grid_idx], placement=[p1[grid_idx]], dependencies=ts_0[grid_idx], vcus=0.0)
+                    def t1():
+                        f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
+                        print("[Boltzmann] %d / %d launching grid %d on %s"%(rank, npes, grid_idx, p1[grid_idx]))
+                        try:
+                            ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+                            self.ff[grid_idx]  = ff
+                            self.qoi[grid_idx] = qoi
+                        except:
+                            print("solver failed for v-space gird no %d"%(grid_idx))
+                            # self.qoi.append(None)
+                            # continue
+                            sys.exit(0)
+                            
+                await ts_1
+                self.profile_tt[pp.SOLVE].stop()
+        
         
+        t1 = min_mean_max(self.profile_tt[pp.SETUP].seconds, self.comm)
+        t2 = min_mean_max(self.profile_tt[pp.SOLVE].seconds, self.comm)
+        print("[Boltzmann] setup (min) = %.4E (s) setup (mean) = %.4E (s) setup (max) = %.4E (s)" % (t1[0],t1[1],t1[2]))
+        print("[Boltzmann] solve (min) = %.4E (s) solve (mean) = %.4E (s) solve (max) = %.4E (s)" % (t2[0],t2[1],t2[2]))        
+        if self.param.export_csv ==0 and self.param.plot_data==0:
+            return
+        
+        for grid_idx in range(self.param.n_grids):
+            dev_id = grid_idx % num_gpus
+            
+            if self.param.use_gpu==1:
+                gpu_id = cp.cuda.Device(dev_id)
+                gpu_id.use()
+            
+            ff       = self.ff[grid_idx]
+            ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
+            ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
+
+            if self.param.use_gpu==1:
+                self.bte_solver.device_to_host_setup(self.param.dev_id,grid_idx)
+                
+                qoi = self.qoi[grid_idx]    
+                with cp.cuda.Device(dev_id):
+                    ff_r     = cp.asnumpy(ff_r)
+                    for k, v in qoi.items():
+                        qoi[k] = cp.asnumpy(v)
+                    
+            if csv_write==1:
+                data_csv[gidx_to_pidx_map[grid_idx], 0]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
+                data_csv[gidx_to_pidx_map[grid_idx], 1]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
+                data_csv[gidx_to_pidx_map[grid_idx], 2]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
+                data_csv[gidx_to_pidx_map[grid_idx], 3]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                data_csv[gidx_to_pidx_map[grid_idx], 4]    = np.sqrt(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")**2 + self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")**2)
+                data_csv[gidx_to_pidx_map[grid_idx], 5]    = qoi["energy"]
+                data_csv[gidx_to_pidx_map[grid_idx], 6]    = qoi["mobility"]
+                data_csv[gidx_to_pidx_map[grid_idx], 7]    = qoi["diffusion"]
+                
+                for col_idx, g in enumerate(self.param.collisions):
+                    data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = qoi["rates"][col_idx]
+
+            plot_data    = self.param.plot_data
+            if plot_data:
+                
+                n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
+                ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
+                ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
+                Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                
+                eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+                eMag  = np.sqrt(eRe**2 + eIm**2)
+                
+                num_sh       = len(self.bte_solver._par_lm[grid_idx])
+                num_subplots = num_sh 
+                num_plt_cols = min(num_sh, 4)
+                num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
+                fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
+                plt_idx      =  1
+                n_pts_step   =  self.grid_idx_to_npts[grid_idx] // 20
+
+                for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
+                    plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
+                    for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
+                        fr = np.abs(ff_r[ii, lm_idx, :])
+                        plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
+                    
+                    plt.xlabel(r"energy (eV)")
+                    plt.ylabel(r"$f_%d$"%(lm[0]))
+                    plt.grid(visible=True)
+                    if lm_idx==0:
+                        plt.legend(prop={'size': 6})
+                        
+                    plt_idx +=1
+                
+                #plt_idx = num_sh
+                plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
+                plt.close()
+        
+        if csv_write:
+            fname    = self.param.out_fname
+            with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
+                writer = csv.writer(f,delimiter=',')
+                # write the header
+                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
+                for col_idx, g in enumerate(self.param.collisions):
+                    header.append(str(g))
+                
+                writer.writerow(header)
+                writer.writerows(data_csv)
+       
     def push(self, interface):
         Te               = np.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
         rate_coeff       = np.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False).reshape((2, self.tps_npts))
@@ -370,7 +642,7 @@ def push(self, interface):
 tps.chooseSolver()
 tps.initialize()
 
-boltzmann = Boltzmann0D2VBactchedSolver(tps)
+boltzmann = Boltzmann0D2VBactchedSolver(tps, comm)
 
 interface = libtps.Tps2Boltzmann(tps)
 tps.initInterface(interface)

From 31ec57f6714801c06275728c8e08096aaeee8791 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Wed, 25 Oct 2023 17:23:22 -0500
Subject: [PATCH 06/75] k-means clustering updated

---
 src/tps-time-loop.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index 4b04c0478..bf061fc7d 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -201,9 +201,22 @@ def grid_setup(self, interface):
         Te                = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
         Te_min, Te_max    = xp.min(Te), xp.max(Te)
         Te_b              = xp.linspace(Te_min, Te_max, self.param.n_grids, endpoint=False)
-        
         dist_mat          = xp.zeros((len(Te), self.param.n_grids))
         
+        for iter in range(50):
+            #print("clustering iteration ", iter, Te_b)
+            for i in range(self.param.n_grids):
+                dist_mat[:,i] = xp.abs(Te-Te_b[i])
+            
+            membership = xp.argmin(dist_mat, axis=1)
+            Te_b1      = np.array([np.mean(Te[xp.argwhere(membership==i)[:,0]]) for i in range(self.param.n_grids)])
+            rel_error  = np.max(np.abs(1 - Te_b1/Te_b))
+            Te_b       = Te_b1
+           
+            if rel_error < 1e-4:
+                break
+        
+        print("K-means Te clusters ", Te_b)                
         for i in range(self.param.n_grids):
             dist_mat[:,i] = xp.abs(Te-Te_b[i])
         

From 00f02dcfee5215ff31a154dba667a49cd87168fb Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 27 Oct 2023 12:11:32 -0500
Subject: [PATCH 07/75] Add spatial coordinate getter in Tps2Boltzmann

---
 src/tps2Boltzmann.cpp | 24 +++++++++++++++++++++++-
 src/tps2Boltzmann.hpp |  6 ++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/tps2Boltzmann.cpp b/src/tps2Boltzmann.cpp
index 171313f0d..5ab4564fe 100644
--- a/src/tps2Boltzmann.cpp
+++ b/src/tps2Boltzmann.cpp
@@ -73,6 +73,10 @@ class CPUData {
   size_t stride_;
 };
 
+void idenity_fun(const Vector & x, Vector & out) {
+  for ( int i(0); i < x.Size(); ++i ) out[i] = x[i];
+}
+
 Tps2Boltzmann::Tps2Boltzmann(Tps *tps) : NIndexes(7), tps_(tps), all_fes_(nullptr) {
   // Assert we have a couple solver;
   assert(tps->isFlowEMCoupled());
@@ -164,6 +168,13 @@ void Tps2Boltzmann::init(M2ulPhyS *flowSolver) {
   scalar_interpolator_->AddDomainInterpolator(new mfem::IdentityInterpolator());
   scalar_interpolator_->SetAssemblyLevel(assembly_level);
   scalar_interpolator_->Assemble();
+
+  // Spatial coordinates
+  spatial_coord_fes_ = new mfem::ParFiniteElementSpace(pmesh, fec_native, pmesh->Dimension(), mfem::Ordering::byNODES);
+  spatial_coordinates_ = new mfem::ParGridFunction(spatial_coord_fes_);
+  mfem::VectorFunctionCoefficient coord_fun(pmesh->Dimension(),
+                                            std::function<void(const Vector &, Vector &)>(idenity_fun));
+  spatial_coordinates_->ProjectCoefficient(coord_fun);
 }
 
 void Tps2Boltzmann::interpolateFromNativeFES(const ParGridFunction &input, Tps2Boltzmann::Index index) {
@@ -207,6 +218,9 @@ Tps2Boltzmann::~Tps2Boltzmann() {
   // Delete monolithic function space
   delete all_fes_;
 
+  delete spatial_coord_fes_;
+  delete spatial_coordinates_;
+
   // Delete finite element collection
   delete fec_;
 }
@@ -251,6 +265,10 @@ void tps2bolzmann(py::module &m) {
 
   py::class_<TPS::Tps2Boltzmann>(m, "Tps2Boltzmann")
       .def(py::init<TPS::Tps *>())
+      .def("HostReadSpatialCoordinates",
+           [](const TPS::Tps2Boltzmann &interface) {
+             return std::unique_ptr<TPS::CPUDataRead>(new TPS::CPUDataRead(interface.SpatialCoordinates()));
+           })
       .def("HostRead",
            [](const TPS::Tps2Boltzmann &interface, TPS::Tps2Boltzmann::Index index) {
              return std::unique_ptr<TPS::CPUDataRead>(new TPS::CPUDataRead(interface.Field(index)));
@@ -261,7 +279,11 @@ void tps2bolzmann(py::module &m) {
            })
       .def("HostReadWrite", [](TPS::Tps2Boltzmann &interface, TPS::Tps2Boltzmann::Index index) {
         return std::unique_ptr<TPS::CPUData>(new TPS::CPUData(interface.Field(index), true));
-      });
+      })
+      .def("EfieldAngularFreq", &TPS::Tps2Boltzmann::EfieldAngularFreq)
+      .def("Nspecies", &TPS::Tps2Boltzmann::Nspecies)
+      .def("NeFiledComps", &TPS::Tps2Boltzmann::NeFieldComps)
+      .def("nComponents", &TPS::Tps2Boltzmann::nComponents);
 }
 }  // namespace tps_wrappers
 #endif
diff --git a/src/tps2Boltzmann.hpp b/src/tps2Boltzmann.hpp
index db8813458..4895c523b 100644
--- a/src/tps2Boltzmann.hpp
+++ b/src/tps2Boltzmann.hpp
@@ -99,6 +99,9 @@ class Tps2Boltzmann {
   const mfem::ParFiniteElementSpace &NativeFes(Index index) const { return *(list_native_fes_[index]); }
   mfem::ParFiniteElementSpace &NativeFes(Index index) { return *(list_native_fes_[index]); }
 
+  const mfem::ParGridFunction & SpatialCoordinates() const { return *spatial_coordinates_; }
+  mfem::ParGridFunction & SpatialCoordinates() { return *spatial_coordinates_; }
+
   const mfem::ParGridFunction &Field(Index index) const { return *(fields_[index]); }
   mfem::ParGridFunction &Field(Index index) { return *(fields_[index]); }
 
@@ -135,6 +138,8 @@ class Tps2Boltzmann {
   mfem::ParFiniteElementSpace *reaction_rates_fes_;
   mfem::ParFiniteElementSpace **list_fes_;
 
+  mfem::ParFiniteElementSpace *spatial_coord_fes_;
+
   //! Function spaces using the native TPS fec
   mfem::ParFiniteElementSpace *species_densities_native_fes_;
   mfem::ParFiniteElementSpace *efield_native_fes_;
@@ -147,6 +152,7 @@ class Tps2Boltzmann {
 
   //! array of fields see *Index for how to address this
   mfem::ParGridFunction **fields_;
+  mfem::ParGridFunction *spatial_coordinates_;
 
   double EfieldAngularFreq_;
 };

From 7c7f37b55e6df82657ad49a14ecf54fc5c480b5c Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 27 Oct 2023 13:16:02 -0500
Subject: [PATCH 08/75] Now the fetch method saves a ParaView file

---
 src/M2ulPhyS2Boltzmann.cpp | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/M2ulPhyS2Boltzmann.cpp b/src/M2ulPhyS2Boltzmann.cpp
index 5ba3ee4fc..e3d60bf20 100644
--- a/src/M2ulPhyS2Boltzmann.cpp
+++ b/src/M2ulPhyS2Boltzmann.cpp
@@ -83,4 +83,22 @@ void M2ulPhyS::push(TPS::Tps2Boltzmann &interface) {
   delete electronTemperature;
 }
 
-void M2ulPhyS::fetch(TPS::Tps2Boltzmann &interface) { return; }
+void M2ulPhyS::fetch(TPS::Tps2Boltzmann &interface) { 
+
+  mfem::ParaViewDataCollection paraview_dc("interface", mesh);
+  paraview_dc.SetPrefixPath("BoltzmannInterface");
+  paraview_dc.SetCycle(0);
+  paraview_dc.SetDataFormat(VTKFormat::BINARY);
+  paraview_dc.SetTime(0.0);
+  paraview_dc.RegisterField("Heavy temperature",
+                            &interface.Field(TPS::Tps2Boltzmann::Index::HeavyTemperature));
+  paraview_dc.RegisterField("Electron temperature",
+                            &interface.Field(TPS::Tps2Boltzmann::Index::ElectronTemperature));
+  paraview_dc.RegisterField("Electric field",
+                            &interface.Field(TPS::Tps2Boltzmann::Index::ElectricField));
+  paraview_dc.RegisterField("Species",
+                            &interface.Field(TPS::Tps2Boltzmann::Index::SpeciesDensities));
+  paraview_dc.RegisterField("Reaction rates",
+                             &interface.Field(TPS::Tps2Boltzmann::Index::ReactionRates));
+  paraview_dc.Save();
+ }

From 32877b63211535a3bec089db9b01c0410f07ed08 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 27 Oct 2023 13:23:52 -0500
Subject: [PATCH 09/75] Fix spatial coordinates fe order

---
 src/tps2Boltzmann.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tps2Boltzmann.cpp b/src/tps2Boltzmann.cpp
index 5ab4564fe..a6b5800f7 100644
--- a/src/tps2Boltzmann.cpp
+++ b/src/tps2Boltzmann.cpp
@@ -170,7 +170,7 @@ void Tps2Boltzmann::init(M2ulPhyS *flowSolver) {
   scalar_interpolator_->Assemble();
 
   // Spatial coordinates
-  spatial_coord_fes_ = new mfem::ParFiniteElementSpace(pmesh, fec_native, pmesh->Dimension(), mfem::Ordering::byNODES);
+  spatial_coord_fes_ = new mfem::ParFiniteElementSpace(pmesh, fec_, pmesh->Dimension(), mfem::Ordering::byNODES);
   spatial_coordinates_ = new mfem::ParGridFunction(spatial_coord_fes_);
   mfem::VectorFunctionCoefficient coord_fun(pmesh->Dimension(),
                                             std::function<void(const Vector &, Vector &)>(idenity_fun));

From 0066290d690d7e4e01179d73a7c79bab407ef38d Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Tue, 31 Oct 2023 11:01:11 -0500
Subject: [PATCH 10/75] vtk output added.

---
 src/tps-time-loop.py | 59 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 49 insertions(+), 10 deletions(-)

diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index bf061fc7d..66bae0b18 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -10,6 +10,8 @@
 import configparser
 import cupy as cp
 import enum
+import pandas as pd
+import scipy.interpolate
 
 class profile_t:
     def __init__(self,name):
@@ -49,6 +51,19 @@ def min_mean_max(a, comm: MPI.Comm):
     return (comm.allreduce(a, MPI.MIN) , comm.allreduce(a, MPI.SUM)/comm.Get_size(), comm.allreduce(a, MPI.MAX))
 
 
+try:
+    df    = pd.read_csv("ionization_rates.csv")
+    Te    = np.array(df["Te[K]"]) 
+    r_arr = np.array(df["Arr[m3/s]"])
+    r_csc = np.array(df["CSC_Maxwellian[m3/s]"])
+    r_arr = scipy.interpolate.interp1d(Te, r_arr,bounds_error=False, fill_value=0.0)
+    r_csc = scipy.interpolate.interp1d(Te, r_csc,bounds_error=False, fill_value=0.0)
+    print("ionization coefficient read from file ")
+except:
+    print("ionization rate coefficient file not found!!")
+    r_arr = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
+    r_csc = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
+
 # set path to C++ TPS library
 path = os.path.abspath(os.path.dirname(sys.argv[0]))
 sys.path.append(path + "/.libs")
@@ -624,21 +639,41 @@ def t1():
                 writer.writerows(data_csv)
        
     def push(self, interface):
-        Te               = np.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
-        rate_coeff       = np.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False).reshape((2, self.tps_npts))
+        xp                = self.xp_module
+        Te_bte            = xp.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
+        rate_bte          = xp.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False).reshape((2, self.tps_npts))
+        Te_tps            = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
         
-        gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
+        species_densities = xp.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, self.tps_npts)
+        ni                = species_densities[TPSINDEX.ION_IDX]
+        n0                = species_densities[TPSINDEX.NEU_IDX]
+        ne                = species_densities[TPSINDEX.ELE_IDX]
+        
+        rate_tps_arr      = r_arr(Te_tps)
+        rate_tps_csc      = r_csc(Te_tps)
+        
+        rr_bte            = np.zeros_like(rate_tps_arr) 
+        gidx_to_pidx_map  = self.grid_idx_to_spatial_idx_map
         
         for grid_idx in range(self.param.n_grids):
-            Te[gidx_to_pidx_map[grid_idx]]            = self.qoi[grid_idx]["energy"]/1.5
+            Te_bte[gidx_to_pidx_map[grid_idx]]        = (self.qoi[grid_idx]["energy"]/1.5) * self.param.ev_to_K
             rr                                        = self.qoi[grid_idx]["rates"]
             # here rr should be in the same ordering as the collision model prescribed to the Boltzmann solver. 
-            
-            rate_coeff[0][gidx_to_pidx_map[grid_idx]] = rr[0]
-            rate_coeff[1][gidx_to_pidx_map[grid_idx]] = rr[1]
-
-        rate_coeff[1][rate_coeff[1]<0] = 0.0
-            
+            rr_bte[gidx_to_pidx_map[grid_idx]] = rr[1]
+        
+        rr_bte[rr_bte<0] = 0.0 
+        s0  = rate_tps_arr * n0 
+        s1  = rate_tps_csc * n0 
+        
+        s2  = rr_bte       * n0 
+        
+        tau = 1e-2
+        idx = s2 > tau
+        rate_bte[0][:]   =  0.0
+        rate_bte[1][:]   =  0.0
+        rate_bte[0]      = rr_bte
+        rate_bte[1][idx] = np.abs(1 - s1[idx]/s2[idx])
+        
         return 
         
 
@@ -660,6 +695,9 @@ def push(self, interface):
 interface = libtps.Tps2Boltzmann(tps)
 tps.initInterface(interface)
 
+coords = np.array(interface.HostReadSpatialCoordinates(), copy=False)
+print(coords.shape)
+
 it = 0
 max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
 print("Max Iters: ", max_iters)
@@ -670,6 +708,7 @@ def push(self, interface):
 boltzmann.fetch(interface)
 boltzmann.solve()
 boltzmann.push(interface)
+tps.fetch(interface)
 
 # while it < max_iters:
 #     tps.solveStep()

From a11bfa5feab64c96b3a5f849cefd2db2d44b2ead Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Tue, 31 Oct 2023 15:11:19 -0500
Subject: [PATCH 11/75] Te clusters are sorted based on emax

---
 src/tps-time-loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index 66bae0b18..db1d01b3e 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -230,7 +230,7 @@ def grid_setup(self, interface):
            
             if rel_error < 1e-4:
                 break
-        
+        Te_b = np.sort(Te_b)
         print("K-means Te clusters ", Te_b)                
         for i in range(self.param.n_grids):
             dist_mat[:,i] = xp.abs(Te-Te_b[i])

From 837566f0254e2cfe2680afe3b190e77414f57f9e Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Wed, 1 Nov 2023 09:17:54 -0500
Subject: [PATCH 12/75] rel error computation updated

---
 src/tps-time-loop.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index db1d01b3e..7eb206148 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -652,7 +652,7 @@ def push(self, interface):
         rate_tps_arr      = r_arr(Te_tps)
         rate_tps_csc      = r_csc(Te_tps)
         
-        rr_bte            = np.zeros_like(rate_tps_arr) 
+        rr_bte            = xp.zeros_like(rate_tps_arr) 
         gidx_to_pidx_map  = self.grid_idx_to_spatial_idx_map
         
         for grid_idx in range(self.param.n_grids):
@@ -662,17 +662,17 @@ def push(self, interface):
             rr_bte[gidx_to_pidx_map[grid_idx]] = rr[1]
         
         rr_bte[rr_bte<0] = 0.0 
-        s0  = rate_tps_arr * n0 
-        s1  = rate_tps_csc * n0 
+        s0  = rate_tps_arr * n0 * ni
+        s1  = rate_tps_csc * n0 * ni
         
-        s2  = rr_bte       * n0 
+        s2  = rr_bte       * n0 * ni
         
         tau = 1e-2
         idx = s2 > tau
         rate_bte[0][:]   =  0.0
         rate_bte[1][:]   =  0.0
         rate_bte[0]      = rr_bte
-        rate_bte[1][idx] = np.abs(1 - s1[idx]/s2[idx])
+        rate_bte[1][idx] = xp.abs(s2-s1)/xp.max(s2)
         
         return 
         

From 19e08fc405892779d7821a9606a8aed5774f214e Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Wed, 1 Nov 2023 09:36:43 -0500
Subject: [PATCH 13/75] minor fix

---
 src/tps-time-loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index 7eb206148..ca6f2670f 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -667,12 +667,12 @@ def push(self, interface):
         
         s2  = rr_bte       * n0 * ni
         
-        tau = 1e-2
-        idx = s2 > tau
+        # tau = 1e-2
+        # idx = s2 > tau
         rate_bte[0][:]   =  0.0
         rate_bte[1][:]   =  0.0
         rate_bte[0]      = rr_bte
-        rate_bte[1][idx] = xp.abs(s2-s1)/xp.max(s2)
+        rate_bte[1]      = xp.abs(s2-s1)/xp.max(s2)
         
         return 
         

From 6da87b18cb5732394fcd8904276a3562ecf3ac82 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 5 Jan 2024 09:32:18 -0600
Subject: [PATCH 14/75] Add the dof index to the reaction interface/rename the
 python bte solver

---
 src/M2ulPhyS.cpp     |   2 +-
 src/chemistry.cpp    |   8 +-
 src/chemistry.hpp    |   6 +-
 src/reaction.cpp     |   3 +
 src/reaction.hpp     |   4 +
 src/source_term.cpp  |   2 +-
 src/tps-bte_0d3v.py  | 727 +++++++++++++++++++++++++++++++++++++++++++
 src/tps-time-loop.py | 710 ++----------------------------------------
 test/test_table.cpp  |   2 +-
 9 files changed, 772 insertions(+), 692 deletions(-)
 create mode 100755 src/tps-bte_0d3v.py

diff --git a/src/M2ulPhyS.cpp b/src/M2ulPhyS.cpp
index a8aadc86d..da5d4eee7 100644
--- a/src/M2ulPhyS.cpp
+++ b/src/M2ulPhyS.cpp
@@ -4184,7 +4184,7 @@ void M2ulPhyS::updateVisualizationVariables() {
       Th = prim[1 + _nvel];
       Te = (in_mix->IsTwoTemperature()) ? prim[_num_equation - 1] : Th;
       double kfwd[gpudata::MAXREACTIONS], kC[gpudata::MAXREACTIONS];
-      in_chem->computeForwardRateCoeffs(Th, Te, kfwd);
+      in_chem->computeForwardRateCoeffs(Th, Te, n, kfwd);
       in_chem->computeEquilibriumConstants(Th, Te, kC);
       // get reaction rates
       double progressRates[gpudata::MAXREACTIONS];
diff --git a/src/chemistry.cpp b/src/chemistry.cpp
index ea5e0c597..52e1744bd 100644
--- a/src/chemistry.cpp
+++ b/src/chemistry.cpp
@@ -105,6 +105,7 @@ MFEM_HOST_DEVICE Chemistry::~Chemistry() {
   }
 }
 
+#if 0 
 void Chemistry::computeForwardRateCoeffs(const double &T_h, const double &T_e, Vector &kfwd) {
   kfwd.SetSize(numReactions_);
   computeForwardRateCoeffs(T_h, T_e, &kfwd[0]);
@@ -117,19 +118,21 @@ void Chemistry::computeForwardRateCoeffs(const double &T_h, const double &T_e, V
 
   return;
 }
+#endif
 
-MFEM_HOST_DEVICE void Chemistry::computeForwardRateCoeffs(const double &T_h, const double &T_e, double *kfwd) {
+MFEM_HOST_DEVICE void Chemistry::computeForwardRateCoeffs(const double &T_h, const double &T_e, const int & dofindex, double *kfwd) {
   // kfwd.SetSize(numReactions_);
   for (int r = 0; r < numReactions_; r++) kfwd[r] = 0.0;
 
   for (int r = 0; r < numReactions_; r++) {
     bool isElectronInvolved = isElectronInvolvedAt(r);
-    kfwd[r] = reactions_[r]->computeRateCoefficient(T_h, T_e, isElectronInvolved);
+    kfwd[r] = reactions_[r]->computeRateCoefficient(T_h, T_e, dofindex, isElectronInvolved);
   }
 
   return;
 }
 
+#if 0
 // NOTE: if not detailedBalance, equilibrium constant is returned as zero, though it cannot be used.
 void Chemistry::computeEquilibriumConstants(const double &T_h, const double &T_e, Vector &kC) {
   kC.SetSize(numReactions_);
@@ -147,6 +150,7 @@ void Chemistry::computeEquilibriumConstants(const double &T_h, const double &T_e
 
   return;
 }
+#endif
 
 MFEM_HOST_DEVICE void Chemistry::computeEquilibriumConstants(const double &T_h, const double &T_e, double *kC) {
   for (int r = 0; r < numReactions_; r++) kC[r] = 0.0;
diff --git a/src/chemistry.hpp b/src/chemistry.hpp
index 8a18e87b9..6b6af72f8 100644
--- a/src/chemistry.hpp
+++ b/src/chemistry.hpp
@@ -96,10 +96,10 @@ class Chemistry {
   // return Vector of reaction rate coefficients, with the size of numReaction_.
   // WARNING(marc) I have removed "virtual" qualifier here assuming these functions will not
   // change for child classes. Correct if wrong
-  void computeForwardRateCoeffs(const double &T_h, const double &T_e, Vector &kfwd);
-  MFEM_HOST_DEVICE void computeForwardRateCoeffs(const double &T_h, const double &T_e, double *kfwd);
+  //void computeForwardRateCoeffs(const double &T_h, const double &T_e, Vector &kfwd);
+  MFEM_HOST_DEVICE void computeForwardRateCoeffs(const double &T_h, const double &T_e, const int & dofindex, double *kfwd);
 
-  void computeEquilibriumConstants(const double &T_h, const double &T_e, Vector &kC);
+  //void computeEquilibriumConstants(const double &T_h, const double &T_e, Vector &kC);
   MFEM_HOST_DEVICE void computeEquilibriumConstants(const double &T_h, const double &T_e, double *kC);
 
   // return rate coefficients of (reactionIndex)-th reaction. (start from 0)
diff --git a/src/reaction.cpp b/src/reaction.cpp
index 54c8baa28..448d25515 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -39,6 +39,7 @@ MFEM_HOST_DEVICE Arrhenius::Arrhenius(const double &A, const double &b, const do
     : Reaction(), A_(A), b_(b), E_(E) {}
 
 MFEM_HOST_DEVICE double Arrhenius::computeRateCoefficient(const double &T_h, const double &T_e,
+                                                          [[maybe_unused]] const int & dofindex,
                                                           const bool isElectronInvolved) {
   double temp = (isElectronInvolved) ? T_e : T_h;
 
@@ -49,6 +50,7 @@ MFEM_HOST_DEVICE HoffertLien::HoffertLien(const double &A, const double &b, cons
     : Reaction(), A_(A), b_(b), E_(E) {}
 
 MFEM_HOST_DEVICE double HoffertLien::computeRateCoefficient(const double &T_h, const double &T_e,
+                                                            [[maybe_unused]] const int & dofindex,
                                                             const bool isElectronInvolved) {
   double temp = (isElectronInvolved) ? T_e : T_h;
   double tempFactor = E_ / BOLTZMANNCONSTANT / temp;
@@ -71,6 +73,7 @@ MFEM_HOST_DEVICE Tabulated::Tabulated(const TableInput &input) : Reaction() {
 MFEM_HOST_DEVICE Tabulated::~Tabulated() { delete table_; }
 
 MFEM_HOST_DEVICE double Tabulated::computeRateCoefficient(const double &T_h, const double &T_e,
+                                                          [[maybe_unused]] const int & dofindex,
                                                           const bool isElectronInvolved) {
   double temp = (isElectronInvolved) ? T_e : T_h;
   return table_->eval(temp);
diff --git a/src/reaction.hpp b/src/reaction.hpp
index a8507c2f2..4571c7161 100644
--- a/src/reaction.hpp
+++ b/src/reaction.hpp
@@ -58,6 +58,7 @@ class Reaction {
   MFEM_HOST_DEVICE virtual ~Reaction() {}
 
   MFEM_HOST_DEVICE virtual double computeRateCoefficient(const double &T_h, const double &T_e,
+                                                         [[maybe_unused]] const int & dofindex,
                                                          const bool isElectronInvolved = false) {
     printf("computeRateCoefficient not implemented");
     return 0;
@@ -77,6 +78,7 @@ class Arrhenius : public Reaction {
   MFEM_HOST_DEVICE virtual ~Arrhenius() {}
 
   MFEM_HOST_DEVICE virtual double computeRateCoefficient(const double &T_h, const double &T_e,
+                                                         [[maybe_unused]] const int & dofindex,
                                                          const bool isElectronInvolved = false);
 };
 
@@ -97,6 +99,7 @@ class HoffertLien : public Reaction {
   MFEM_HOST_DEVICE virtual ~HoffertLien() {}
 
   MFEM_HOST_DEVICE virtual double computeRateCoefficient(const double &T_h, const double &T_e,
+                                                         [[maybe_unused]] const int & dofindex,
                                                          const bool isElectronInvolved = false);
 };
 
@@ -110,6 +113,7 @@ class Tabulated : public Reaction {
   MFEM_HOST_DEVICE virtual ~Tabulated();
 
   MFEM_HOST_DEVICE virtual double computeRateCoefficient(const double &T_h, const double &T_e,
+                                                         [[maybe_unused]] const int & dofindex,
                                                          const bool isElectronInvolved = false);
 };
 
diff --git a/src/source_term.cpp b/src/source_term.cpp
index bbc7bcbdc..4c585f06c 100644
--- a/src/source_term.cpp
+++ b/src/source_term.cpp
@@ -162,7 +162,7 @@ void SourceTerm::updateTerms(mfem::Vector &in) {
     double progressRates[gpudata::MAXREACTIONS], creationRates[gpudata::MAXSPECIES];
     if (_numSpecies > 1 && _numReactions > 0) {
       double kfwd[gpudata::MAXREACTIONS], kC[gpudata::MAXREACTIONS];
-      _chemistry->computeForwardRateCoeffs(Th, Te, kfwd);
+      _chemistry->computeForwardRateCoeffs(Th, Te, n, kfwd);
       _chemistry->computeEquilibriumConstants(Th, Te, kC);
 
       // get reaction rates
diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
new file mode 100755
index 000000000..ca6f2670f
--- /dev/null
+++ b/src/tps-bte_0d3v.py
@@ -0,0 +1,727 @@
+#!/usr/bin/env python3
+import sys
+import os
+from mpi4py import MPI
+import numpy as np
+import scipy.constants
+import csv
+import matplotlib.pyplot as plt
+from time import perf_counter as time
+import configparser
+import cupy as cp
+import enum
+import pandas as pd
+import scipy.interpolate
+
+class profile_t:
+    """Named timer: accumulates total seconds, the last interval and a call count."""
+
+    def __init__(self,name):
+        self.name = name
+        self.reset()
+
+    def __add__(self,o):
+        """Accumulate the counters of `o` (same name) into this timer in place; return self."""
+        assert(self.name==o.name)
+        for attr in ("seconds", "snap", "iter"):
+            setattr(self, attr, getattr(self, attr) + getattr(o, attr))
+        return self
+
+    def start(self):
+        """Remember the current clock reading as the start of an interval."""
+        self._pri_time = time()
+
+    def stop(self):
+        """Close the interval opened by start(); the stop time becomes the next start."""
+        began = self._pri_time
+        ended = time()
+        self._pri_time = ended
+        self.seconds = (self.seconds - began) + ended
+        self.snap = -began + ended
+        self.iter += 1
+
+    def reset(self):
+        """Zero every counter; the name is kept."""
+        self.seconds = 0
+        self.snap = 0
+        self._pri_time = 0
+        self.iter = 0
+
+def min_mean_max(a, comm: MPI.Comm):
+    """Return (min, mean, max) of `a` over the ranks of `comm`; the mean is the SUM reduction / comm size."""
+    lowest, total, highest = (comm.allreduce(a, op) for op in (MPI.MIN, MPI.SUM, MPI.MAX))
+    return (lowest, total/comm.Get_size(), highest)
+# Ionization rate coefficients r_arr / r_csc as callables of Te [K]: interpolate the tabulated
+# CSV when it can be used, otherwise fall back to the same closed-form fit for both of them.
+try:
+    df    = pd.read_csv("ionization_rates.csv")
+    Te    = np.array(df["Te[K]"])
+    r_arr = scipy.interpolate.interp1d(Te, np.array(df["Arr[m3/s]"]), bounds_error=False, fill_value=0.0)
+    r_csc = scipy.interpolate.interp1d(Te, np.array(df["CSC_Maxwellian[m3/s]"]), bounds_error=False, fill_value=0.0)
+    print("ionization coefficient read from file ")
+except (OSError, KeyError, ValueError) as err:  # unreadable file, missing column, malformed table
+    print("ionization rate coefficient file not found!!", err)
+    r_arr = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
+    r_csc = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
+
+# set path to C++ TPS library
+path = os.path.abspath(os.path.dirname(sys.argv[0]))
+sys.path.append(path + "/.libs")
+sys.path.append(path + "/../../boltzmann/BESolver/python")
+import libtps
+from   bte_0d3v_batched import bte_0d3v_batched as BoltzmannSolver
+
+# Parla is required while WITH_PARLA is set: a failed import aborts the run with a failing status.
+WITH_PARLA = 1
+if WITH_PARLA:
+    try:
+        from parla import Parla
+        from parla.tasks import spawn, TaskSpace
+        from parla.devices import cpu, gpu
+    except Exception as err:  # unlike a bare except, lets KeyboardInterrupt/SystemExit through
+        print("Error occured during Parla import. Please make sure Parla is installed properly.", err)
+        sys.exit(1)  # non-zero so that launchers and schedulers see the failure
+
+class pp(enum.IntEnum):
+    """Integer indices of the profiling timers, closed by the LAST marker."""
+    SETUP, SOLVE = 0, 1
+    LAST = 2
+
+class BoltzmannSolverParams():
+    """Default settings of the batched Boltzmann (BTE) solver,
+    followed by a few physical constants and unit-conversion factors."""
+    # -- v-space discretization
+    sp_order = 3  # B-spline order in v-space
+    spline_qpts = 5  # number of Gauss-Legendre quadrature points per knot interval
+    Nr = 127  # number of B-splines used in radial direction
+    l_max = 1  # spherical modes used: 0 to l_max
+    ev_max = 16  # v-space grid truncation (eV)
+    n_grids = 4  # number of v-space grids
+    # -- time integration and nonlinear solve
+    dt = 1e-3  # [] time step, non-dimensionalized w.r.t. the oscillation period
+    cycles = 10  # max number of cycles to evolve
+    solver_type = "transient"  # two modes, "transient" or "steady-state"
+    atol = 1e-10  # absolute tolerance
+    rtol = 1e-10  # relative tolerance
+    max_iter = 1000  # max iterations for the newton solver
+    # -- physics and hardware
+    ee_collisions = 0  # enable electron-electron Coulombic effects
+    use_gpu = 1  # (1) GPU solver, (0) CPU solver
+    dev_id = 0  # GPU device to use, only used when use_gpu=1
+    collisions = ["g0","g2"]  # collision string g0-elastic, g2-ionization
+    Efreq = 0.0  # [1/s] E-field oscillation frequency
+    Te = 0.5  # [eV] approximate electron temperature
+    threads = 16  # number of threads to use to assemble operators
+    grid_idx = 0
+    # -- diagnostics and output
+    verbose = 1  # verbose output for the BTE solver
+    export_csv = 1  # export the qois to csv file
+    plot_data = 1
+    output_dir = "batched_bte1"
+    out_fname = output_dir + "/tps"
+
+    # -- units and conversion factors
+    ev_to_K = scipy.constants.electron_volt/scipy.constants.Boltzmann
+    Td_fac = 1e-21  # [Vm^2]
+    c_gamma = np.sqrt(2 * scipy.constants.elementary_charge / scipy.constants.electron_mass)  # [(C/kg)^{1/2}]
+    me = scipy.constants.electron_mass
+    kB = scipy.constants.Boltzmann
+    
+class TPSINDEX():
+    """Row indices into the field arrays read from TPS.
+
+    Species densities are addressed by ION/ELE/NEU, the electric field by EF_RE/EF_IM.
+    """
+    # species-density rows: ion, electron, neutral
+    ION_IDX, ELE_IDX, NEU_IDX = range(3)
+
+    # electric-field rows: Re(E), Im(E)
+    EF_RE_IDX, EF_IM_IDX = range(2)
+    
+class Boltzmann0D2VBactchedSolver:
+    
+    def __init__(self, tps, comm):
+        """Keep the TPS handle and communicator, load the parameters, create the output directory and timers.
+
+        tps  : TPS driver object (stored as self.tps)
+        comm : MPI communicator (stored as self.comm)
+        The configuration file name is read from sys.argv[2].
+        """
+        self.tps   = tps
+        self.comm : MPI.Comm  = comm
+        self.param = BoltzmannSolverParams()
+        # override the defaults with the values from the configuration file
+        self.parse_config_file(sys.argv[2])
+
+        self.xp_module          = np
+
+        # exist_ok: several MPI ranks may reach this point together, so a separate
+        # exists()/makedirs() pair could raise FileExistsError on the slower rank
+        os.makedirs(self.param.output_dir, exist_ok=True)
+
+        profile_nn  = ["setup", "solve", "last"]
+        # one timer per index below pp.LAST
+        self.profile_tt = [profile_t(profile_nn[i]) for i in range(pp.LAST)]
+        self.profile_nn = profile_nn
+
+        return
+    
+    def parse_config_file(self, fname):
+        """
+        Load the [boltzmannSolver] section of `fname` into self.param, replacing the defaults.
+        Every option listed below must be present; text after a '#' in a value is dropped.
+
+        fname : path of the ini-style configuration file
+        """
+        config = configparser.ConfigParser()
+        print("[Boltzmann] reading configure file given by : ", fname)
+        config.read(fname)
+
+        def read(option, convert):
+            # keep what precedes the first '#', trim blanks, then convert
+            return convert(config.get("boltzmannSolver", option).split("#")[0].strip())
+
+        # attribute name == option name; options are looked up in this order
+        typed_options = (
+            ("sp_order", int), ("spline_qpts", int),
+            ("Nr", int), ("l_max", int), ("n_grids", int),
+            ("dt", float), ("cycles", float), ("solver_type", str),
+            ("atol", float), ("rtol", float), ("max_iter", int),
+            ("ee_collisions", int), ("use_gpu", int),
+            # "collisions" is not read from the file: the class default stays in effect
+            ("export_csv", int), ("plot_data", int),
+            ("Efreq", float), ("verbose", int), ("Te", float),
+            ("threads", int),
+        )
+        for option, convert in typed_options:
+            setattr(self.param, option, read(option, convert))
+
+        # the output file prefix is built from the directory read just before it
+        self.param.output_dir = read("output_dir", str)
+        self.param.out_fname  = self.param.output_dir + "/" + read("output_fname", str)
+
+        return
+    
+    def grid_setup(self, interface):
+        """
+        Perform the Boltzmann grid setup.
+        The spatial points are clustered by electron temperature (1-D k-means on the Te read
+        from `interface`), one v-space grid is created per cluster and its operators assembled.
+        Sets self.grid_idx_to_npts, self.grid_idx_to_spatial_idx_map and self.bte_solver.
+        """
+
+        self.profile_tt[pp.SETUP].start()
+
+        xp                = self.xp_module
+        Te                = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
+        Te_min, Te_max    = xp.min(Te), xp.max(Te)
+        Te_b              = xp.linspace(Te_min, Te_max, self.param.n_grids, endpoint=False)
+        dist_mat          = xp.zeros((len(Te), self.param.n_grids))
+
+        for _ in range(50):  # at most 50 k-means sweeps
+            # distance of every point to every centroid
+            for i in range(self.param.n_grids):
+                dist_mat[:,i] = xp.abs(Te-Te_b[i])
+            membership = xp.argmin(dist_mat, axis=1)
+            # an empty cluster keeps its centroid: the mean of an empty slice is NaN and argmin would then pick it for every point
+            Te_b1      = np.array([np.mean(Te[membership==i]) if np.any(membership==i) else Te_b[i] for i in range(self.param.n_grids)])
+            rel_error  = np.max(np.abs(1 - Te_b1/Te_b))
+            Te_b       = Te_b1
+
+            if rel_error < 1e-4:
+                break
+        Te_b = np.sort(Te_b)
+        print("K-means Te clusters ", Te_b)                
+        for i in range(self.param.n_grids):
+            dist_mat[:,i] = xp.abs(Te-Te_b[i])
+
+        membership = xp.argmin(dist_mat, axis=1)
+        grid_idx_to_spatial_pts_map = list()
+        for b_idx in range(self.param.n_grids):
+            # spatial indices whose nearest (sorted) centroid is b_idx
+            grid_idx_to_spatial_pts_map.append(xp.argwhere(membership==b_idx)[:,0]) 
+
+        np.save("%s_gidx_to_pidx.npy"%(self.param.out_fname), np.array(grid_idx_to_spatial_pts_map, dtype=object), allow_pickle=True)
+
+        self.grid_idx_to_npts            = xp.array([len(a) for a in grid_idx_to_spatial_pts_map], dtype=xp.int32)
+        self.grid_idx_to_spatial_idx_map = grid_idx_to_spatial_pts_map
+
+        assert xp.sum(self.grid_idx_to_npts) == len(Te), "[Error] : TPS spatial points for v-space grid assignment is inconsitant"
+        lm_modes                         = [[[l,0] for l in range(self.param.l_max+1)] for grid_idx in range(self.param.n_grids)]
+        nr                               = xp.ones(self.param.n_grids, dtype=np.int32) * self.param.Nr
+        Te                               = xp.array([Te_b[b_idx]  for b_idx in range(self.param.n_grids)]) # per-grid Te [eV]: the cluster centroids
+        vth                              = np.sqrt(2* self.param.kB * Te * self.param.ev_to_K  /self.param.me)
+        ev_max                           = (6 * vth / self.param.c_gamma)**2 
+        self.bte_solver                  = BoltzmannSolver(self.param, ev_max ,Te , nr, lm_modes, self.param.n_grids, self.param.collisions)
+
+        if self.param.verbose==1:
+            print("grid energy max (eV) \n", ev_max, flush = True)
+
+        # compute BTE operators
+        for grid_idx in range(self.param.n_grids):
+            print("setting up grid %d"%(grid_idx), flush = True)
+            self.bte_solver.assemble_operators(grid_idx)
+
+        self.profile_tt[pp.SETUP].stop()
+        return
+        
+    def fetch(self, interface):
+        """Read the TPS fields from `interface` and give each v-space grid the values at its spatial points (needs the map built by grid_setup)."""
+        xp                = self.xp_module
+        gidx_to_pidx_map  = self.grid_idx_to_spatial_idx_map
+
+        heavy_temp        = xp.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+        tps_npts          = len(heavy_temp)
+        self.tps_npts     = tps_npts
+
+        electron_temp     = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
+        efield            = xp.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
+        species_densities = xp.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, tps_npts)
+
+        for grid_idx in range(self.param.n_grids):
+            bte_idx           = gidx_to_pidx_map[grid_idx]
+            ni                = species_densities[TPSINDEX.ION_IDX][bte_idx]
+            ne                = species_densities[TPSINDEX.ELE_IDX][bte_idx]
+            n0                = species_densities[TPSINDEX.NEU_IDX][bte_idx]
+            Tg                = heavy_temp[bte_idx]
+            Te                = electron_temp[bte_idx]
+
+            eRe               = efield[TPSINDEX.EF_RE_IDX][bte_idx]
+            eIm               = efield[TPSINDEX.EF_IM_IDX][bte_idx]
+            eMag              = np.sqrt(eRe**2 + eIm **2)
+            eByn0             = eMag/n0/self.param.Td_fac
+
+            if self.param.verbose == 1 :
+                print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
+                print("Efreq = %.4E [1/s]" %(self.param.Efreq))
+                print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
+
+                print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
+                print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg), np.max(Tg)))
+                print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te), np.max(Te)))
+
+                print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne), np.max(ne)))
+                print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni), np.max(ni)))
+                print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0), np.max(n0)))
+
+            # hand the per-point inputs of this grid to the solver, one named parameter at a time
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "n0", n0)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "ne", ne)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "ni", ni)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "Tg", Tg)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", eRe)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", eIm)  # was eRe: the imaginary part never reached the solver
+
+        return        
+
+    def solve(self):
+        """
+        Run the BTE solve. Both steady-state solutions (static E-field) and
+        time-periodic solutions (oscillatory E-field) are supported.
+
+        The Parla driver is used when WITH_PARLA == 1, the sequential driver otherwise.
+        """
+        if WITH_PARLA!=1:
+            self.solve_seq()
+            return
+        self.solve_with_parla()
+        return
+        
+    def solve_seq(self):
+        """Solve the BTE one v-space grid after another; optionally write the QoIs to CSV and plot the radial components per grid."""
+        xp               = self.xp_module
+        csv_write        = self.param.export_csv
+        gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
+        self.qoi         = [None for grid_idx in range(self.param.n_grids)]
+        self.ff          = [None for grid_idx in range(self.param.n_grids)]
+
+        if csv_write ==1 : 
+            data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
+
+        t1 = time()
+
+        for grid_idx in range(self.param.n_grids):
+
+            if self.grid_idx_to_npts[grid_idx] ==0:
+                continue
+
+            if self.param.verbose==1:
+                print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
+            f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")  # needed whatever the verbosity
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
+
+            if self.param.use_gpu==1:
+                dev_id   = self.param.dev_id
+                self.bte_solver.host_to_device_setup(dev_id, grid_idx)
+
+                with cp.cuda.Device(dev_id):
+                    eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                    eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+
+                    if self.param.Efreq == 0:
+                        ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
+                    else:
+                        ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
+
+            else:
+                eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+
+                if self.param.Efreq == 0:
+                    ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
+                else:
+                    ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
+
+            self.bte_solver.set_efield_function(grid_idx, ef_t)            
+            f0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
+            try:
+                ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+                self.qoi[grid_idx] = qoi
+                self.ff [grid_idx] = ff
+            except Exception as err:
+                print("solver failed for v-space gird no %d"%(grid_idx), err)
+                # a failed grid aborts the whole run; exit with a non-zero status so that
+                # the launcher does not mistake the failure for a successful solve
+                sys.exit(1)
+
+            if self.param.export_csv ==0 and self.param.plot_data==0:
+                continue
+
+            ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
+            ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
+
+            if self.param.use_gpu==1:
+                self.bte_solver.device_to_host_setup(self.param.dev_id,grid_idx)
+                # device -> host copies only make sense on the GPU path (dev_id is undefined on the CPU path)
+                with cp.cuda.Device(self.param.dev_id):
+                    ff_r     = cp.asnumpy(ff_r)
+                    for k, v in qoi.items():
+                        qoi[k] = cp.asnumpy(v)
+
+            if csv_write==1:
+                data_csv[gidx_to_pidx_map[grid_idx], 0]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
+                data_csv[gidx_to_pidx_map[grid_idx], 1]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
+                data_csv[gidx_to_pidx_map[grid_idx], 2]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
+                data_csv[gidx_to_pidx_map[grid_idx], 3]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                data_csv[gidx_to_pidx_map[grid_idx], 4]    = np.sqrt(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")**2 + self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")**2)
+                data_csv[gidx_to_pidx_map[grid_idx], 5]    = qoi["energy"]
+                data_csv[gidx_to_pidx_map[grid_idx], 6]    = qoi["mobility"]
+                data_csv[gidx_to_pidx_map[grid_idx], 7]    = qoi["diffusion"]
+
+                for col_idx, g in enumerate(self.param.collisions):
+                    data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = qoi["rates"][col_idx]
+
+            plot_data    = self.param.plot_data
+            if plot_data:
+
+                n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
+                ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
+                ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
+                Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+
+                eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+                eMag  = np.sqrt(eRe**2 + eIm**2)
+
+                num_sh       = len(self.bte_solver._par_lm[grid_idx])
+                num_subplots = num_sh 
+                num_plt_cols = min(num_sh, 4)
+                num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
+                fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
+                plt_idx      =  1
+                n_pts_step   =  max(1, self.grid_idx_to_npts[grid_idx] // 20)  # a step of 0 (fewer than 20 points) would make range() raise
+
+                for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
+                    plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
+                    for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
+                        fr = np.abs(ff_r[ii, lm_idx, :])
+                        plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
+
+                    plt.xlabel(r"energy (eV)")
+                    plt.ylabel(r"$f_%d$"%(lm[0]))
+                    plt.grid(visible=True)
+                    if lm_idx==0:
+                        plt.legend(prop={'size': 6})
+
+                    plt_idx +=1
+
+                # one figure per v-space grid
+                plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
+                plt.close()
+
+        t2 = time()
+        print("time for boltzmann v-space solve = %.4E"%(t2- t1))
+
+        if csv_write==1:  # same test as the one guarding the data_csv allocation
+            fname    = self.param.out_fname
+            with open("%s_qoi.csv"%fname, 'w', encoding='UTF8', newline='') as f:
+                writer = csv.writer(f,delimiter=',')
+                # write the header
+                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
+                # one extra column per collision process
+                header.extend(str(g) for g in self.param.collisions)
+
+                writer.writerow(header)
+                writer.writerows(data_csv)
+
+        return
+    
+    def solve_with_parla(self):
+        csv_write        = self.param.export_csv
+        gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
+        self.qoi         = [None for grid_idx in range(self.param.n_grids)]
+        self.ff          = [None for grid_idx in range(self.param.n_grids)]
+        
+        if csv_write ==1 : 
+            data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
+        
+        
+        rank = self.comm.Get_rank()
+        npes = self.comm.Get_size()
+        
+        with Parla():
+            num_gpus         = len(gpu)
+            grid_to_device_map = lambda gidx : gidx % num_gpus
+            @spawn(placement=cpu, vcus=0)
+            async def __main__():
+                self.profile_tt[pp.SETUP].start()
+                ts_0 = TaskSpace("T")
+                for grid_idx in range(self.param.n_grids):
+                    @spawn(ts_0[grid_idx], placement=[cpu], vcus=0.0)
+                    def t0():
+                        print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
+                        f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")
+                        self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
+                        
+                        if self.param.use_gpu == 1:
+                            dev_id  = grid_to_device_map(grid_idx)
+                            self.bte_solver.host_to_device_setup(dev_id, grid_idx)
+                            xp      = cp
+
+                            with cp.cuda.Device(dev_id):
+                                eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                                eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+            
+                                if self.param.Efreq == 0:
+                                    ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
+                                else:
+                                    ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
+                        else:
+                            xp = np
+                            eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                            eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+        
+                            if self.param.Efreq == 0:
+                                ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
+                            else:
+                                ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
+                                    
+                        self.bte_solver.set_efield_function(grid_idx, ef_t)
+                        return
+                
+                await ts_0
+                
+                self.profile_tt[pp.SETUP].stop()
+                if self.param.use_gpu==1:
+                    p1 = [gpu(grid_to_device_map(grid_idx)) for grid_idx in range(self.param.n_grids)]
+                else:
+                    p1 = [cpu for grid_idx in range(self.param.n_grids)]
+                
+                self.profile_tt[pp.SOLVE].start()
+                ts_1 = TaskSpace("T")
+                for grid_idx in range(self.param.n_grids):
+                    @spawn(ts_1[grid_idx], placement=[p1[grid_idx]], dependencies=ts_0[grid_idx], vcus=0.0)
+                    def t1():
+                        f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
+                        print("[Boltzmann] %d / %d launching grid %d on %s"%(rank, npes, grid_idx, p1[grid_idx]))
+                        try:
+                            ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+                            self.ff[grid_idx]  = ff
+                            self.qoi[grid_idx] = qoi
+                        except:
+                            print("solver failed for v-space gird no %d"%(grid_idx))
+                            # self.qoi.append(None)
+                            # continue
+                            sys.exit(0)
+                            
+                await ts_1
+                self.profile_tt[pp.SOLVE].stop()
+        
+        
+        t1 = min_mean_max(self.profile_tt[pp.SETUP].seconds, self.comm)
+        t2 = min_mean_max(self.profile_tt[pp.SOLVE].seconds, self.comm)
+        print("[Boltzmann] setup (min) = %.4E (s) setup (mean) = %.4E (s) setup (max) = %.4E (s)" % (t1[0],t1[1],t1[2]))
+        print("[Boltzmann] solve (min) = %.4E (s) solve (mean) = %.4E (s) solve (max) = %.4E (s)" % (t2[0],t2[1],t2[2]))        
+        if self.param.export_csv ==0 and self.param.plot_data==0:
+            return
+        
+        for grid_idx in range(self.param.n_grids):
+            dev_id = grid_idx % num_gpus
+            
+            if self.param.use_gpu==1:
+                gpu_id = cp.cuda.Device(dev_id)
+                gpu_id.use()
+            
+            ff       = self.ff[grid_idx]
+            ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
+            ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
+
+            if self.param.use_gpu==1:
+                self.bte_solver.device_to_host_setup(self.param.dev_id,grid_idx)
+                
+                qoi = self.qoi[grid_idx]    
+                with cp.cuda.Device(dev_id):
+                    ff_r     = cp.asnumpy(ff_r)
+                    for k, v in qoi.items():
+                        qoi[k] = cp.asnumpy(v)
+                    
+            if csv_write==1:
+                data_csv[gidx_to_pidx_map[grid_idx], 0]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
+                data_csv[gidx_to_pidx_map[grid_idx], 1]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
+                data_csv[gidx_to_pidx_map[grid_idx], 2]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
+                data_csv[gidx_to_pidx_map[grid_idx], 3]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                data_csv[gidx_to_pidx_map[grid_idx], 4]    = np.sqrt(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")**2 + self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")**2)
+                data_csv[gidx_to_pidx_map[grid_idx], 5]    = qoi["energy"]
+                data_csv[gidx_to_pidx_map[grid_idx], 6]    = qoi["mobility"]
+                data_csv[gidx_to_pidx_map[grid_idx], 7]    = qoi["diffusion"]
+                
+                for col_idx, g in enumerate(self.param.collisions):
+                    data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = qoi["rates"][col_idx]
+
+            plot_data    = self.param.plot_data
+            if plot_data:
+                
+                n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
+                ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
+                ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
+                Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                
+                eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+                eMag  = np.sqrt(eRe**2 + eIm**2)
+                
+                num_sh       = len(self.bte_solver._par_lm[grid_idx])
+                num_subplots = num_sh 
+                num_plt_cols = min(num_sh, 4)
+                num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
+                fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
+                plt_idx      =  1
+                n_pts_step   =  self.grid_idx_to_npts[grid_idx] // 20
+
+                for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
+                    plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
+                    for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
+                        fr = np.abs(ff_r[ii, lm_idx, :])
+                        plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
+                    
+                    plt.xlabel(r"energy (eV)")
+                    plt.ylabel(r"$f_%d$"%(lm[0]))
+                    plt.grid(visible=True)
+                    if lm_idx==0:
+                        plt.legend(prop={'size': 6})
+                        
+                    plt_idx +=1
+                
+                #plt_idx = num_sh
+                plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
+                plt.close()
+        
+        if csv_write:
+            fname    = self.param.out_fname
+            with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
+                writer = csv.writer(f,delimiter=',')
+                # write the header
+                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
+                for col_idx, g in enumerate(self.param.collisions):
+                    header.append(str(g))
+                
+                writer.writerow(header)
+                writer.writerows(data_csv)
+       
+    def push(self, interface):
+        xp                = self.xp_module
+        Te_bte            = xp.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
+        rate_bte          = xp.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False).reshape((2, self.tps_npts))
+        Te_tps            = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
+        
+        species_densities = xp.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, self.tps_npts)
+        ni                = species_densities[TPSINDEX.ION_IDX]
+        n0                = species_densities[TPSINDEX.NEU_IDX]
+        ne                = species_densities[TPSINDEX.ELE_IDX]
+        
+        rate_tps_arr      = r_arr(Te_tps)
+        rate_tps_csc      = r_csc(Te_tps)
+        
+        rr_bte            = xp.zeros_like(rate_tps_arr) 
+        gidx_to_pidx_map  = self.grid_idx_to_spatial_idx_map
+        
+        for grid_idx in range(self.param.n_grids):
+            Te_bte[gidx_to_pidx_map[grid_idx]]        = (self.qoi[grid_idx]["energy"]/1.5) * self.param.ev_to_K
+            rr                                        = self.qoi[grid_idx]["rates"]
+            # here rr should be in the same ordering as the collision model prescribed to the Boltzmann solver. 
+            rr_bte[gidx_to_pidx_map[grid_idx]] = rr[1]
+        
+        rr_bte[rr_bte<0] = 0.0 
+        s0  = rate_tps_arr * n0 * ni
+        s1  = rate_tps_csc * n0 * ni
+        
+        s2  = rr_bte       * n0 * ni
+        
+        # tau = 1e-2
+        # idx = s2 > tau
+        rate_bte[0][:]   =  0.0
+        rate_bte[1][:]   =  0.0
+        rate_bte[0]      = rr_bte
+        rate_bte[1]      = xp.abs(s2-s1)/xp.max(s2)
+        
+        return 
+        
+
+
+
+
+comm = MPI.COMM_WORLD
+# TPS solver
+tps = libtps.Tps(comm)
+
+tps.parseCommandLineArgs(sys.argv)
+tps.parseInput()
+tps.chooseDevices()
+tps.chooseSolver()
+tps.initialize()
+
+boltzmann = Boltzmann0D2VBactchedSolver(tps, comm)
+
+interface = libtps.Tps2Boltzmann(tps)
+tps.initInterface(interface)
+
+coords = np.array(interface.HostReadSpatialCoordinates(), copy=False)
+print(coords.shape)
+
+it = 0
+max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
+print("Max Iters: ", max_iters)
+tps.solveBegin()
+tps.solveStep()
+tps.push(interface)
+boltzmann.grid_setup(interface)
+boltzmann.fetch(interface)
+boltzmann.solve()
+boltzmann.push(interface)
+tps.fetch(interface)
+
+# while it < max_iters:
+#     tps.solveStep()
+#     tps.push(interface)
+#     boltzmann.fetch(interface)
+#     boltzmann.solve()
+#     boltzmann.push(interface)
+#     tps.fetch(interface)
+    
+#     it = it+1
+#     print("it, ", it)
+
+tps.solveEnd()
+
+
+sys.exit (tps.getStatus())
diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index ca6f2670f..9bbc719d0 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -1,684 +1,36 @@
 #!/usr/bin/env python3
 import sys
 import os
-from mpi4py import MPI
 import numpy as np
-import scipy.constants
-import csv
-import matplotlib.pyplot as plt
-from time import perf_counter as time
-import configparser
-import cupy as cp
-import enum
-import pandas as pd
-import scipy.interpolate
 
-class profile_t:
-    def __init__(self,name):
-        self.name = name
-        self.seconds=0
-        self.snap=0
-        self._pri_time =0
-        self.iter =0
+from mpi4py import MPI
 
-    def __add__(self,o):
-        assert(self.name==o.name)
-        self.seconds+=o.seconds
-        self.snap+=o.snap
-        self.iter+=o.iter
-        return self
+class BoltzmannMockSolver:
+    def __init__(self):
+        pass
 
-    def start(self):
-        self._pri_time = time()
-    
-    def stop(self):
-        self.seconds-=self._pri_time
-        self.snap=-self._pri_time
+    def fetch(self, interface):
+        species_densities = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False)
+        efield = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False)
+        heavy_temperature = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
 
-        self._pri_time = time()
+        print("|| species_densities ||_2 = ", np.linalg.norm(species_densities) )
+        print("|| efield ||_2 = ", np.linalg.norm(efield) )
+        print("||heavy_temperature||_2 = ", np.linalg.norm(heavy_temperature) )
 
-        self.seconds +=self._pri_time
-        self.snap  += self._pri_time
-        self.iter+=1
-    
-    def reset(self):
-        self.seconds=0
-        self.snap=0
-        self._pri_time =0
-        self.iter =0
+    def solve(self):
+        pass
 
-def min_mean_max(a, comm: MPI.Comm):
-    return (comm.allreduce(a, MPI.MIN) , comm.allreduce(a, MPI.SUM)/comm.Get_size(), comm.allreduce(a, MPI.MAX))
+    def push(self, interface):
+        electron_temperature =  np.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
+        electron_temperature[:] = 1.
 
 
-try:
-    df    = pd.read_csv("ionization_rates.csv")
-    Te    = np.array(df["Te[K]"]) 
-    r_arr = np.array(df["Arr[m3/s]"])
-    r_csc = np.array(df["CSC_Maxwellian[m3/s]"])
-    r_arr = scipy.interpolate.interp1d(Te, r_arr,bounds_error=False, fill_value=0.0)
-    r_csc = scipy.interpolate.interp1d(Te, r_csc,bounds_error=False, fill_value=0.0)
-    print("ionization coefficient read from file ")
-except:
-    print("ionization rate coefficient file not found!!")
-    r_arr = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
-    r_csc = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
 
 # set path to C++ TPS library
 path = os.path.abspath(os.path.dirname(sys.argv[0]))
 sys.path.append(path + "/.libs")
-sys.path.append(path + "/../../boltzmann/BESolver/python")
 import libtps
-from   bte_0d3v_batched import bte_0d3v_batched as BoltzmannSolver
-
-WITH_PARLA = 1
-if WITH_PARLA:
-    try:
-        from parla import Parla
-        from parla.tasks import spawn, TaskSpace
-        from parla.devices import cpu, gpu
-    except:
-        print("Error occured during Parla import. Please make sure Parla is installed properly.")
-        sys.exit(0)
-
-
-class pp(enum.IntEnum):
-    SETUP         = 0
-    SOLVE         = 1
-    LAST          = 2
-
-class BoltzmannSolverParams():
-    sp_order      = 3           # B-spline order in v-space
-    spline_qpts   = 5           # number of Gauss-Legendre quadrature points per knot interval    
-    Nr            = 127         # number of B-splines used in radial direction
-    l_max         = 1           # spherical modes uses, 0, to l_max
-    ev_max        = 16          # v-space grid truncation (eV)
-    n_grids       = 4           # number of v-space grids
-
-    dt            = 1e-3        # [] non-dimentionalized time w.r.t. oscilation period
-    cycles        = 10             # number of max cycles to evolve
-    solver_type   = "transient" # two modes, "transient" or "steady-state"
-    atol          = 1e-10       # absolute tolerance
-    rtol          = 1e-10       # relative tolerance
-    max_iter      = 1000        # max iterations for the newton solver
-
-    ee_collisions = 0           # enable electron-electron Coulombic effects
-    use_gpu       = 1           # enable GPU use (1)-GPU solver, (0)-CPU solver
-    dev_id        = 0           # which GPU device to use only used when use_gpu=1
-
-    collisions    = ["g0","g2"] # collision string g0-elastic, g2-ionization
-    export_csv    = 1           # export the qois to csv file
-    plot_data     = 1
-    
-    Efreq         = 0.0 #[1/s]  # E-field osicllation frequency
-    verbose       = 1           # verbose output for the BTE solver
-    Te            = 0.5 #[eV]   # approximate electron temperature
-    
-    threads       = 16          # number of threads to use to assemble operators
-    grid_idx      = 0
-    
-    output_dir    = "batched_bte1"
-    out_fname     = output_dir + "/tps"
-    
-    # some useful units and conversion factors. 
-    ev_to_K       = (scipy.constants.electron_volt/scipy.constants.Boltzmann) 
-    Td_fac        = 1e-21 #[Vm^2]
-    c_gamma       = np.sqrt(2 * scipy.constants.elementary_charge / scipy.constants.electron_mass) #[(C/kg)^{1/2}]
-    me            = scipy.constants.electron_mass
-    kB            = scipy.constants.Boltzmann
-    
-class TPSINDEX():
-    """
-    simple index map to differnt fields, from the TPS arrays
-    """
-    ION_IDX = 0                         # ion      density index
-    ELE_IDX = 1                         # electron density index
-    NEU_IDX = 2                         # neutral  density index
-    
-    EF_RE_IDX = 0                       # Re(E) index
-    EF_IM_IDX = 1                       # Im(E) index
-    
-class Boltzmann0D2VBactchedSolver:
-    
-    def __init__(self, tps, comm):
-        self.tps   = tps
-        self.comm : MPI.Comm  = comm
-        self.param = BoltzmannSolverParams()
-        # overide the default params, based on the config.ini file.
-        self.parse_config_file(sys.argv[2])
-        
-        self.xp_module          = np
-        
-        boltzmann_dir           = self.param.output_dir
-        isExist = os.path.exists(boltzmann_dir)
-        if not isExist:
-           # Create a new directory because it does not exist
-           os.makedirs(boltzmann_dir)
-           #print("directory %s is created!"%(dir_name))
-           
-        profile_tt  = [None] * int(pp.LAST)
-        profile_nn  = ["setup", "solve", "last"]
-        for i in range(pp.LAST):
-            profile_tt[i] = profile_t(profile_nn[i])
-        
-        self.profile_tt = profile_tt
-        self.profile_nn = profile_nn
-
-        return
-    
-    def parse_config_file(self, fname):
-        """
-        add the configuaraion file parse code here, 
-        which overides the default BoltzmannSolverParams
-        """
-        config = configparser.ConfigParser()
-        print("[Boltzmann] reading configure file given by : ", fname)
-        config.read(fname)
-        
-        self.param.sp_order         = int(config.get("boltzmannSolver", "sp_order").split("#")[0].strip())
-        self.param.spline_qpts      = int(config.get("boltzmannSolver", "spline_qpts").split("#")[0].strip())
-        
-        self.param.Nr               = int(config.get("boltzmannSolver", "Nr").split("#")[0].strip())
-        self.param.l_max            = int(config.get("boltzmannSolver", "l_max").split("#")[0].strip())
-        self.param.n_grids          = int(config.get("boltzmannSolver", "n_grids").split("#")[0].strip())
-        self.param.dt               = float(config.get("boltzmannSolver", "dt").split("#")[0].strip())
-        self.param.cycles           = float(config.get("boltzmannSolver", "cycles").split("#")[0].strip())
-        self.param.solver_type      = str(config.get("boltzmannSolver", "solver_type").split("#")[0].strip()) 
-        self.param.atol             = float(config.get("boltzmannSolver", "atol").split("#")[0].strip())
-        self.param.rtol             = float(config.get("boltzmannSolver", "rtol").split("#")[0].strip())
-        self.param.max_iter         = int(config.get("boltzmannSolver", "max_iter").split("#")[0].strip())
-        self.param.ee_collisions    = int(config.get("boltzmannSolver", "ee_collisions").split("#")[0].strip())
-        self.param.use_gpu          = int(config.get("boltzmannSolver", "use_gpu").split("#")[0].strip())
-        #self.param.collisions       = config.get("boltzmannSolver", "collisions").split("#")[0]
-        
-        self.param.export_csv       = int(config.get("boltzmannSolver", "export_csv").split("#")[0].strip())
-        self.param.plot_data        = int(config.get("boltzmannSolver", "plot_data").split("#")[0].strip())
-        self.param.Efreq            = float(config.get("boltzmannSolver", "Efreq").split("#")[0].strip())
-        self.param.verbose          = int(config.get("boltzmannSolver", "verbose").split("#")[0].strip())
-        self.param.Te               = float(config.get("boltzmannSolver", "Te").split("#")[0].strip())
-
-        self.param.threads          = int(config.get("boltzmannSolver", "threads").split("#")[0].strip())
-        self.param.output_dir       = str(config.get("boltzmannSolver", "output_dir").split("#")[0].strip())
-        self.param.out_fname        = self.param.output_dir + "/" + str(config.get("boltzmannSolver", "output_fname").split("#")[0].strip())
-        return 
-    
-    def grid_setup(self, interface):
-        """
-        Perform the boltzmann grid setup. 
-        we generate v-space grid for each spatial point cluster in the parameter space, 
-        where, at the moment the clustering is determined based on the electron temperature
-        computed from the TPS code. 
-        """
-        
-        self.profile_tt[pp.SETUP].start()
-        
-        xp                = self.xp_module
-        Te                = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
-        Te_min, Te_max    = xp.min(Te), xp.max(Te)
-        Te_b              = xp.linspace(Te_min, Te_max, self.param.n_grids, endpoint=False)
-        dist_mat          = xp.zeros((len(Te), self.param.n_grids))
-        
-        for iter in range(50):
-            #print("clustering iteration ", iter, Te_b)
-            for i in range(self.param.n_grids):
-                dist_mat[:,i] = xp.abs(Te-Te_b[i])
-            
-            membership = xp.argmin(dist_mat, axis=1)
-            Te_b1      = np.array([np.mean(Te[xp.argwhere(membership==i)[:,0]]) for i in range(self.param.n_grids)])
-            rel_error  = np.max(np.abs(1 - Te_b1/Te_b))
-            Te_b       = Te_b1
-           
-            if rel_error < 1e-4:
-                break
-        Te_b = np.sort(Te_b)
-        print("K-means Te clusters ", Te_b)                
-        for i in range(self.param.n_grids):
-            dist_mat[:,i] = xp.abs(Te-Te_b[i])
-        
-        membership = xp.argmin(dist_mat, axis=1)
-        grid_idx_to_spatial_pts_map = list()
-        for b_idx in range(self.param.n_grids):
-            #grid_idx_to_spatial_pts_map.append(xp.argwhere(xp.logical_and(Te>= Te_b[b_idx], Te < Te_b[b_idx+1]))[:,0]) 
-            grid_idx_to_spatial_pts_map.append(xp.argwhere(membership==b_idx)[:,0]) 
-        
-        np.save("%s_gidx_to_pidx.npy"%(self.param.out_fname), np.array(grid_idx_to_spatial_pts_map, dtype=object), allow_pickle=True)
-        
-        self.grid_idx_to_npts            = xp.array([len(a) for a in grid_idx_to_spatial_pts_map], dtype=xp.int32)
-        self.grid_idx_to_spatial_idx_map = grid_idx_to_spatial_pts_map
-        
-        xp.sum(self.grid_idx_to_npts) == len(Te), "[Error] : TPS spatial points for v-space grid assignment is inconsitant"
-        lm_modes                         = [[[l,0] for l in range(self.param.l_max+1)] for grid_idx in range(self.param.n_grids)]
-        nr                               = xp.ones(self.param.n_grids, dtype=np.int32) * self.param.Nr
-        Te                               = xp.array([Te_b[b_idx]  for b_idx in range(self.param.n_grids)]) # xp.ones(self.param.n_grids) * self.param.Te 
-        vth                              = np.sqrt(2* self.param.kB * Te * self.param.ev_to_K  /self.param.me)
-        ev_max                           = (6 * vth / self.param.c_gamma)**2 
-        self.bte_solver                  = BoltzmannSolver(self.param, ev_max ,Te , nr, lm_modes, self.param.n_grids, self.param.collisions)
-
-        if self.param.verbose==1:
-            print("grid energy max (eV) \n", ev_max, flush = True)
-        
-        # compute BTE operators
-        for grid_idx in range(self.param.n_grids):
-            print("setting up grid %d"%(grid_idx), flush = True)
-            self.bte_solver.assemble_operators(grid_idx)
-        
-        self.profile_tt[pp.SETUP].stop()
-        return
-        
-    def fetch(self, interface):
-        xp                = self.xp_module
-        gidx_to_pidx_map  = self.grid_idx_to_spatial_idx_map
-        
-        heavy_temp        = xp.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
-        tps_npts          = len(heavy_temp)
-        self.tps_npts     = tps_npts
-        
-        electron_temp     = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
-        efield            = xp.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
-        species_densities = xp.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, tps_npts)
-        
-        for grid_idx in range(self.param.n_grids):
-            bte_idx           = gidx_to_pidx_map[grid_idx]
-            ni                = species_densities[TPSINDEX.ION_IDX][bte_idx]
-            ne                = species_densities[TPSINDEX.ELE_IDX][bte_idx]
-            n0                = species_densities[TPSINDEX.NEU_IDX][bte_idx]
-            Tg                = heavy_temp[bte_idx]
-            Te                = electron_temp[bte_idx]
-            
-            
-            eRe               = efield[TPSINDEX.EF_RE_IDX][bte_idx]
-            eIm               = efield[TPSINDEX.EF_IM_IDX][bte_idx]
-            eMag              = np.sqrt(eRe**2 + eIm **2)
-            eByn0             = eMag/n0/self.param.Td_fac
-        
-            if self.param.verbose == 1 :
-                print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
-                print("Efreq = %.4E [1/s]" %(self.param.Efreq))
-                print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
-                
-                print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
-                print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg), np.max(Tg)))
-                print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te), np.max(Te)))
-                
-                print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne), np.max(ne)))
-                print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni), np.max(ni)))
-                print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0), np.max(n0)))
-            
-            #self.bte_solver.set_boltzmann_parameters(grid_idx, n0, ne, ni, Tg, self.param.solver_type)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "n0", n0)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "ne", ne)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "ni", ni)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "Tg", Tg)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", eRe)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", eRe)
-            
-        return        
-
-    def solve(self):
-        """
-        perform the BTE solve, supports both stead-state solution (static E-field) 
-        and time-periodic solutions for the oscillatory E-fields
-        """
-        
-        if WITH_PARLA==1:
-            self.solve_with_parla()
-            return
-        else:
-            self.solve_seq()
-            return
-        
-    def solve_seq(self):
-        xp               = self.xp_module
-        csv_write        = self.param.export_csv
-        gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
-        
-        self.qoi         = [None for grid_idx in range(self.param.n_grids)]
-        self.ff          = [None for grid_idx in range(self.param.n_grids)]
-        
-        if csv_write ==1 : 
-            data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
-        
-        t1 = time()
-        
-        for grid_idx in range(self.param.n_grids):
-            
-            if self.grid_idx_to_npts[grid_idx] ==0:
-                continue
-            
-            if self.param.verbose==1:
-                print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
-                f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")
-                self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
-            
-            if self.param.use_gpu==1:
-                dev_id   = self.param.dev_id
-                self.bte_solver.host_to_device_setup(dev_id, grid_idx)
-                
-                with cp.cuda.Device(dev_id):
-                    eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                    eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-
-                    if self.param.Efreq == 0:
-                        ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
-                    else:
-                        ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-                        
-            else:
-                eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-            
-                if self.param.Efreq == 0:
-                    ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
-                else:
-                    ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-                            
-            self.bte_solver.set_efield_function(grid_idx, ef_t)            
-            f0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
-            try:
-                ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
-                self.qoi[grid_idx] = qoi
-                self.ff [grid_idx] = ff
-            except:
-                print("solver failed for v-space gird no %d"%(grid_idx))
-                # self.qoi.append(None)
-                # continue
-                sys.exit(0)
-            
-            if self.param.export_csv ==0 and self.param.plot_data==0:
-                continue
-            
-            ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
-            ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
-
-            if self.param.use_gpu==1:
-                self.bte_solver.device_to_host_setup(self.param.dev_id,grid_idx)
-                
-            with cp.cuda.Device(dev_id):
-                ff_r     = cp.asnumpy(ff_r)
-                for k, v in qoi.items():
-                    qoi[k] = cp.asnumpy(v)
-                    
-            if csv_write==1:
-                data_csv[gidx_to_pidx_map[grid_idx], 0]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
-                data_csv[gidx_to_pidx_map[grid_idx], 1]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
-                data_csv[gidx_to_pidx_map[grid_idx], 2]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
-                data_csv[gidx_to_pidx_map[grid_idx], 3]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
-                data_csv[gidx_to_pidx_map[grid_idx], 4]    = np.sqrt(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")**2 + self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")**2)
-                data_csv[gidx_to_pidx_map[grid_idx], 5]    = qoi["energy"]
-                data_csv[gidx_to_pidx_map[grid_idx], 6]    = qoi["mobility"]
-                data_csv[gidx_to_pidx_map[grid_idx], 7]    = qoi["diffusion"]
-                
-                for col_idx, g in enumerate(self.param.collisions):
-                    data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = qoi["rates"][col_idx]
-                    
-            plot_data    = self.param.plot_data
-            if plot_data:
-                
-                n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
-                ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
-                ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
-                Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
-                
-                eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-                eMag  = np.sqrt(eRe**2 + eIm**2)
-                
-                num_sh       = len(self.bte_solver._par_lm[grid_idx])
-                num_subplots = num_sh 
-                num_plt_cols = min(num_sh, 4)
-                num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
-                fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
-                plt_idx      =  1
-                n_pts_step   =  self.grid_idx_to_npts[grid_idx] // 20
-
-                for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
-                    plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
-                    for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
-                        fr = np.abs(ff_r[ii, lm_idx, :])
-                        plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
-                    
-                    plt.xlabel(r"energy (eV)")
-                    plt.ylabel(r"$f_%d$"%(lm[0]))
-                    plt.grid(visible=True)
-                    if lm_idx==0:
-                        plt.legend(prop={'size': 6})
-                        
-                    plt_idx +=1
-                
-                #plt_idx = num_sh
-                plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
-                plt.close()
-        
-        t2 = time()
-        print("time for boltzmann v-space solve = %.4E"%(t2- t1))
-        
-        if csv_write:
-            fname    = self.param.out_fname
-            with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
-                writer = csv.writer(f,delimiter=',')
-                # write the header
-                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
-                for col_idx, g in enumerate(self.param.collisions):
-                    header.append(str(g))
-                
-                writer.writerow(header)
-                writer.writerows(data_csv)
-
-        return
-    
-    def solve_with_parla(self):
-        csv_write        = self.param.export_csv
-        gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
-        self.qoi         = [None for grid_idx in range(self.param.n_grids)]
-        self.ff          = [None for grid_idx in range(self.param.n_grids)]
-        
-        if csv_write ==1 : 
-            data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
-        
-        
-        rank = self.comm.Get_rank()
-        npes = self.comm.Get_size()
-        
-        with Parla():
-            num_gpus         = len(gpu)
-            grid_to_device_map = lambda gidx : gidx % num_gpus
-            @spawn(placement=cpu, vcus=0)
-            async def __main__():
-                self.profile_tt[pp.SETUP].start()
-                ts_0 = TaskSpace("T")
-                for grid_idx in range(self.param.n_grids):
-                    @spawn(ts_0[grid_idx], placement=[cpu], vcus=0.0)
-                    def t0():
-                        print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
-                        f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")
-                        self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
-                        
-                        if self.param.use_gpu == 1:
-                            dev_id  = grid_to_device_map(grid_idx)
-                            self.bte_solver.host_to_device_setup(dev_id, grid_idx)
-                            xp      = cp
-
-                            with cp.cuda.Device(dev_id):
-                                eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                                eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-            
-                                if self.param.Efreq == 0:
-                                    ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
-                                else:
-                                    ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-                        else:
-                            xp = np
-                            eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                            eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-        
-                            if self.param.Efreq == 0:
-                                ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
-                            else:
-                                ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-                                    
-                        self.bte_solver.set_efield_function(grid_idx, ef_t)
-                        return
-                
-                await ts_0
-                
-                self.profile_tt[pp.SETUP].stop()
-                if self.param.use_gpu==1:
-                    p1 = [gpu(grid_to_device_map(grid_idx)) for grid_idx in range(self.param.n_grids)]
-                else:
-                    p1 = [cpu for grid_idx in range(self.param.n_grids)]
-                
-                self.profile_tt[pp.SOLVE].start()
-                ts_1 = TaskSpace("T")
-                for grid_idx in range(self.param.n_grids):
-                    @spawn(ts_1[grid_idx], placement=[p1[grid_idx]], dependencies=ts_0[grid_idx], vcus=0.0)
-                    def t1():
-                        f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
-                        print("[Boltzmann] %d / %d launching grid %d on %s"%(rank, npes, grid_idx, p1[grid_idx]))
-                        try:
-                            ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
-                            self.ff[grid_idx]  = ff
-                            self.qoi[grid_idx] = qoi
-                        except:
-                            print("solver failed for v-space gird no %d"%(grid_idx))
-                            # self.qoi.append(None)
-                            # continue
-                            sys.exit(0)
-                            
-                await ts_1
-                self.profile_tt[pp.SOLVE].stop()
-        
-        
-        t1 = min_mean_max(self.profile_tt[pp.SETUP].seconds, self.comm)
-        t2 = min_mean_max(self.profile_tt[pp.SOLVE].seconds, self.comm)
-        print("[Boltzmann] setup (min) = %.4E (s) setup (mean) = %.4E (s) setup (max) = %.4E (s)" % (t1[0],t1[1],t1[2]))
-        print("[Boltzmann] solve (min) = %.4E (s) solve (mean) = %.4E (s) solve (max) = %.4E (s)" % (t2[0],t2[1],t2[2]))        
-        if self.param.export_csv ==0 and self.param.plot_data==0:
-            return
-        
-        for grid_idx in range(self.param.n_grids):
-            dev_id = grid_idx % num_gpus
-            
-            if self.param.use_gpu==1:
-                gpu_id = cp.cuda.Device(dev_id)
-                gpu_id.use()
-            
-            ff       = self.ff[grid_idx]
-            ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
-            ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
-
-            if self.param.use_gpu==1:
-                self.bte_solver.device_to_host_setup(self.param.dev_id,grid_idx)
-                
-                qoi = self.qoi[grid_idx]    
-                with cp.cuda.Device(dev_id):
-                    ff_r     = cp.asnumpy(ff_r)
-                    for k, v in qoi.items():
-                        qoi[k] = cp.asnumpy(v)
-                    
-            if csv_write==1:
-                data_csv[gidx_to_pidx_map[grid_idx], 0]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
-                data_csv[gidx_to_pidx_map[grid_idx], 1]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
-                data_csv[gidx_to_pidx_map[grid_idx], 2]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
-                data_csv[gidx_to_pidx_map[grid_idx], 3]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
-                data_csv[gidx_to_pidx_map[grid_idx], 4]    = np.sqrt(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")**2 + self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")**2)
-                data_csv[gidx_to_pidx_map[grid_idx], 5]    = qoi["energy"]
-                data_csv[gidx_to_pidx_map[grid_idx], 6]    = qoi["mobility"]
-                data_csv[gidx_to_pidx_map[grid_idx], 7]    = qoi["diffusion"]
-                
-                for col_idx, g in enumerate(self.param.collisions):
-                    data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = qoi["rates"][col_idx]
-
-            plot_data    = self.param.plot_data
-            if plot_data:
-                
-                n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
-                ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
-                ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
-                Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
-                
-                eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-                eMag  = np.sqrt(eRe**2 + eIm**2)
-                
-                num_sh       = len(self.bte_solver._par_lm[grid_idx])
-                num_subplots = num_sh 
-                num_plt_cols = min(num_sh, 4)
-                num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
-                fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
-                plt_idx      =  1
-                n_pts_step   =  self.grid_idx_to_npts[grid_idx] // 20
-
-                for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
-                    plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
-                    for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
-                        fr = np.abs(ff_r[ii, lm_idx, :])
-                        plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
-                    
-                    plt.xlabel(r"energy (eV)")
-                    plt.ylabel(r"$f_%d$"%(lm[0]))
-                    plt.grid(visible=True)
-                    if lm_idx==0:
-                        plt.legend(prop={'size': 6})
-                        
-                    plt_idx +=1
-                
-                #plt_idx = num_sh
-                plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
-                plt.close()
-        
-        if csv_write:
-            fname    = self.param.out_fname
-            with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
-                writer = csv.writer(f,delimiter=',')
-                # write the header
-                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
-                for col_idx, g in enumerate(self.param.collisions):
-                    header.append(str(g))
-                
-                writer.writerow(header)
-                writer.writerows(data_csv)
-       
-    def push(self, interface):
-        xp                = self.xp_module
-        Te_bte            = xp.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
-        rate_bte          = xp.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False).reshape((2, self.tps_npts))
-        Te_tps            = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
-        
-        species_densities = xp.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, self.tps_npts)
-        ni                = species_densities[TPSINDEX.ION_IDX]
-        n0                = species_densities[TPSINDEX.NEU_IDX]
-        ne                = species_densities[TPSINDEX.ELE_IDX]
-        
-        rate_tps_arr      = r_arr(Te_tps)
-        rate_tps_csc      = r_csc(Te_tps)
-        
-        rr_bte            = xp.zeros_like(rate_tps_arr) 
-        gidx_to_pidx_map  = self.grid_idx_to_spatial_idx_map
-        
-        for grid_idx in range(self.param.n_grids):
-            Te_bte[gidx_to_pidx_map[grid_idx]]        = (self.qoi[grid_idx]["energy"]/1.5) * self.param.ev_to_K
-            rr                                        = self.qoi[grid_idx]["rates"]
-            # here rr should be in the same ordering as the collision model prescribed to the Boltzmann solver. 
-            rr_bte[gidx_to_pidx_map[grid_idx]] = rr[1]
-        
-        rr_bte[rr_bte<0] = 0.0 
-        s0  = rate_tps_arr * n0 * ni
-        s1  = rate_tps_csc * n0 * ni
-        
-        s2  = rr_bte       * n0 * ni
-        
-        # tau = 1e-2
-        # idx = s2 > tau
-        rate_bte[0][:]   =  0.0
-        rate_bte[1][:]   =  0.0
-        rate_bte[0]      = rr_bte
-        rate_bte[1]      = xp.abs(s2-s1)/xp.max(s2)
-        
-        return 
-        
-
-
-
 
 comm = MPI.COMM_WORLD
 # TPS solver
@@ -690,36 +42,26 @@ def push(self, interface):
 tps.chooseSolver()
 tps.initialize()
 
-boltzmann = Boltzmann0D2VBactchedSolver(tps, comm)
+boltzmann = BoltzmannMockSolver()
 
 interface = libtps.Tps2Boltzmann(tps)
 tps.initInterface(interface)
 
-coords = np.array(interface.HostReadSpatialCoordinates(), copy=False)
-print(coords.shape)
-
 it = 0
 max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
 print("Max Iters: ", max_iters)
 tps.solveBegin()
-tps.solveStep()
-tps.push(interface)
-boltzmann.grid_setup(interface)
-boltzmann.fetch(interface)
-boltzmann.solve()
-boltzmann.push(interface)
-tps.fetch(interface)
 
-# while it < max_iters:
-#     tps.solveStep()
-#     tps.push(interface)
-#     boltzmann.fetch(interface)
-#     boltzmann.solve()
-#     boltzmann.push(interface)
-#     tps.fetch(interface)
+while it < max_iters:
+    tps.solveStep()
+    tps.push(interface)
+    boltzmann.fetch(interface)
+    boltzmann.solve()
+    boltzmann.push(interface)
+    tps.fetch(interface)
     
-#     it = it+1
-#     print("it, ", it)
+    it = it+1
+    print("it, ", it)
 
 tps.solveEnd()
 
diff --git a/test/test_table.cpp b/test/test_table.cpp
index b5fc65ecb..3a8a95e2a 100644
--- a/test/test_table.cpp
+++ b/test/test_table.cpp
@@ -89,7 +89,7 @@ void testTableInterpolator1D(TPS::Tps &tps, int rank) {
     double xtest = refValues(k, 0);
     double fref = refValues(k, 1);
     double ftest[gpudata::MAXREACTIONS];
-    chem->computeForwardRateCoeffs(xtest, xtest, ftest);
+    chem->computeForwardRateCoeffs(xtest, xtest, k, ftest);
     double error = abs((fref - ftest[0]) / fref);
     if (error >= scalarErrorThreshold) {
       grvy_printf(GRVY_ERROR, "Rank %d - %.5E: %.5E\n", rank, xtest, abs((fref - ftest[0]) / fref));

From c337e6171dd793e6c670724a0c14ab5501fa6943 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 5 Jan 2024 10:01:23 -0600
Subject: [PATCH 15/75] Add GridFunctionReaction

---
 src/reaction.cpp | 19 +++++++++++++++++++
 src/reaction.hpp | 15 +++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/src/reaction.cpp b/src/reaction.cpp
index 448d25515..1f41acc10 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -78,3 +78,22 @@ MFEM_HOST_DEVICE double Tabulated::computeRateCoefficient(const double &T_h, con
   double temp = (isElectronInvolved) ? T_e : T_h;
   return table_->eval(temp);
 }
+
+MFEM_HOST_DEVICE GridFunctionReaction::GridFunctionReaction(const mfem::GridFunction & f, int comp):
+#ifdef _GPU_
+data( f.Read() + comp*f.FESpace()->GetNDofs() )
+#else
+data( f.HostRead() + comp*f.FESpace()->GetNDofs() )
+#endif
+{
+  assert( f.Size() >= (comp+1)*f.FESpace()->GetNDofs() );
+}
+
+MFEM_HOST_DEVICE GridFunctionReaction::~GridFunctionReaction() { }
+
+MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unused]] const double &T_h, 
+                                                       [[maybe_unused]] const double &T_e,
+                                                       const int & dofindex,
+                                                       [[maybe_unused]] const bool isElectronInvolved) {
+  return data[dofindex];
+}
diff --git a/src/reaction.hpp b/src/reaction.hpp
index 4571c7161..3ff35487b 100644
--- a/src/reaction.hpp
+++ b/src/reaction.hpp
@@ -117,4 +117,19 @@ class Tabulated : public Reaction {
                                                          const bool isElectronInvolved = false);
 };
 
+class GridFunctionReaction : public Reaction {
+ private:
+  const double * data;
+
+ public:
+  MFEM_HOST_DEVICE GridFunctionReaction(const mfem::GridFunction & f, int comp);
+
+  MFEM_HOST_DEVICE virtual ~GridFunctionReaction();
+
+  MFEM_HOST_DEVICE virtual double computeRateCoefficient([[maybe_unused]] const double &T_h, 
+                                                         [[maybe_unused]] const double &T_e,
+                                                         const int & dofindex,
+                                                         [[maybe_unused]] const bool isElectronInvolved = false);
+};
+
 #endif  // REACTION_HPP_

From 3ea95f1a3c5febd3f5baea0e2675db125f870b71 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 5 Jan 2024 11:57:06 -0600
Subject: [PATCH 16/75] Preparing the data structures to support the Boltzmann
 integration

---
 src/chemistry.cpp      | 13 +++++++++++++
 src/chemistry.hpp      |  3 +++
 src/dataStructures.hpp |  3 ++-
 src/reaction.cpp       | 26 ++++++++++++++------------
 src/reaction.hpp       |  8 ++++++--
 5 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/src/chemistry.cpp b/src/chemistry.cpp
index a2900fc07..30e31d292 100644
--- a/src/chemistry.cpp
+++ b/src/chemistry.cpp
@@ -86,6 +86,9 @@ MFEM_HOST_DEVICE Chemistry::Chemistry(GasMixture *mixture, const ChemistryInput
       case TABULATED_RXN: {
         reactions_[r] = new Tabulated(inputs.reactionInputs[r].tableInput);
       } break;
+      case GRIDFUNCTION_RXN: {
+        reactions_[r] = new GridFunctionReaction(inputs.reactionInputs[r].indexInput);
+      } break;
       default:
         printf("Unknown reactionModel.");
         assert(false);
@@ -106,6 +109,16 @@ MFEM_HOST_DEVICE Chemistry::~Chemistry() {
   }
 }
 
+void Chemistry::setGridFunctionRates(const mfem::GridFunction & f)
+{
+  for (int r = 0; r < numReactions_; r++) {
+    if (reactions_[r]->reactionModel == GRIDFUNCTION_RXN) {
+      GridFunctionReaction * rx = dynamic_cast<GridFunctionReaction*>(reactions_[r]);
+      rx->setGridFunctionData(f);
+    }
+  }
+}
+
 #if 0 
 void Chemistry::computeForwardRateCoeffs(const double &T_h, const double &T_e, Vector &kfwd) {
   kfwd.SetSize(numReactions_);
diff --git a/src/chemistry.hpp b/src/chemistry.hpp
index 653b5fe77..6b0ce6e46 100644
--- a/src/chemistry.hpp
+++ b/src/chemistry.hpp
@@ -95,6 +95,9 @@ class Chemistry {
 
   MFEM_HOST_DEVICE ~Chemistry();
 
+  // Set the grid function rates for GRIDFUNCTION_RXN reaction types
+  void setGridFunctionRates(const mfem::GridFunction & f);
+
   // return Vector of reaction rate coefficients, with the size of numReaction_.
   // WARNING(marc) I have removed "virtual" qualifier here assuming these functions will not
   // change for child classes. Correct if wrong
diff --git a/src/dataStructures.hpp b/src/dataStructures.hpp
index c3a7a6825..d881abe86 100644
--- a/src/dataStructures.hpp
+++ b/src/dataStructures.hpp
@@ -74,7 +74,7 @@ enum TransportModel { ARGON_MINIMAL, ARGON_MIXTURE, CONSTANT, LTE_TRANSPORT, MIX
 
 enum ChemistryModel { /* CANTERA, */ NUM_CHEMISTRYMODEL };
 
-enum ReactionModel { ARRHENIUS, HOFFERTLIEN, TABULATED_RXN, NUM_REACTIONMODEL };
+enum ReactionModel { ARRHENIUS, HOFFERTLIEN, TABULATED_RXN, GRIDFUNCTION_RXN, NUM_REACTIONMODEL };
 
 enum RadiationModel { NONE_RAD, NET_EMISSION, NUM_RADIATIONMODEL };
 
@@ -623,6 +623,7 @@ struct ReactionInput {
   TableInput tableInput;
   // NOTE(kevin): with gpu, this pointer is only valid on the device.
   const double *modelParams;
+  int indexInput;
 };
 
 struct ChemistryInput {
diff --git a/src/reaction.cpp b/src/reaction.cpp
index 1f41acc10..5720f0966 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -36,7 +36,7 @@ using namespace mfem;
 using namespace std;
 
 MFEM_HOST_DEVICE Arrhenius::Arrhenius(const double &A, const double &b, const double &E)
-    : Reaction(), A_(A), b_(b), E_(E) {}
+    : Reaction(ARRHENIUS), A_(A), b_(b), E_(E) {}
 
 MFEM_HOST_DEVICE double Arrhenius::computeRateCoefficient(const double &T_h, const double &T_e,
                                                           [[maybe_unused]] const int & dofindex,
@@ -47,7 +47,7 @@ MFEM_HOST_DEVICE double Arrhenius::computeRateCoefficient(const double &T_h, con
 }
 
 MFEM_HOST_DEVICE HoffertLien::HoffertLien(const double &A, const double &b, const double &E)
-    : Reaction(), A_(A), b_(b), E_(E) {}
+    : Reaction(HOFFERTLIEN), A_(A), b_(b), E_(E) {}
 
 MFEM_HOST_DEVICE double HoffertLien::computeRateCoefficient(const double &T_h, const double &T_e,
                                                             [[maybe_unused]] const int & dofindex,
@@ -58,7 +58,7 @@ MFEM_HOST_DEVICE double HoffertLien::computeRateCoefficient(const double &T_h, c
   return A_ * pow(temp, b_) * (tempFactor + 2.0) * exp(-tempFactor);
 }
 
-MFEM_HOST_DEVICE Tabulated::Tabulated(const TableInput &input) : Reaction() {
+MFEM_HOST_DEVICE Tabulated::Tabulated(const TableInput &input) : Reaction(TABULATED_RXN) {
   switch (input.order) {
     case 1: {
       table_ = new LinearTable(input);
@@ -79,18 +79,20 @@ MFEM_HOST_DEVICE double Tabulated::computeRateCoefficient(const double &T_h, con
   return table_->eval(temp);
 }
 
-MFEM_HOST_DEVICE GridFunctionReaction::GridFunctionReaction(const mfem::GridFunction & f, int comp):
-#ifdef _GPU_
-data( f.Read() + comp*f.FESpace()->GetNDofs() )
-#else
-data( f.HostRead() + comp*f.FESpace()->GetNDofs() )
-#endif
-{
-  assert( f.Size() >= (comp+1)*f.FESpace()->GetNDofs() );
-}
+MFEM_HOST_DEVICE GridFunctionReaction::GridFunctionReaction(int comp):
+Reaction(GRIDFUNCTION_RXN), data( nullptr ), comp(comp) { }
 
 MFEM_HOST_DEVICE GridFunctionReaction::~GridFunctionReaction() { }
 
+void GridFunctionReaction::setGridFunctionData(const mfem::GridFunction & f) {
+  assert( f.Size() >= (comp+1)*f.FESpace()->GetNDofs() );
+  #ifdef _GPU_
+  data = f.Read() + comp*f.FESpace()->GetNDofs();
+  #else
+  data = f.HostRead() + comp*f.FESpace()->GetNDofs();
+  #endif
+}
+
 MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unused]] const double &T_h, 
                                                        [[maybe_unused]] const double &T_e,
                                                        const int & dofindex,
diff --git a/src/reaction.hpp b/src/reaction.hpp
index 3ff35487b..aae3b5789 100644
--- a/src/reaction.hpp
+++ b/src/reaction.hpp
@@ -53,7 +53,8 @@ using namespace std;
 class Reaction {
  protected:
  public:
-  MFEM_HOST_DEVICE Reaction() {}
+  const ReactionModel reactionModel;
+  MFEM_HOST_DEVICE Reaction(ReactionModel rm): reactionModel(rm) {}
 
   MFEM_HOST_DEVICE virtual ~Reaction() {}
 
@@ -120,12 +121,15 @@ class Tabulated : public Reaction {
 class GridFunctionReaction : public Reaction {
  private:
   const double * data;
+  const int comp;
 
  public:
-  MFEM_HOST_DEVICE GridFunctionReaction(const mfem::GridFunction & f, int comp);
+  MFEM_HOST_DEVICE GridFunctionReaction(int comp);
 
   MFEM_HOST_DEVICE virtual ~GridFunctionReaction();
 
+  void setGridFunctionData(const mfem::GridFunction & f);
+
   MFEM_HOST_DEVICE virtual double computeRateCoefficient([[maybe_unused]] const double &T_h, 
                                                          [[maybe_unused]] const double &T_e,
                                                          const int & dofindex,

From 53002e6abe856482945a6be534320dcc54589418 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 5 Jan 2024 12:00:23 -0600
Subject: [PATCH 17/75] make enforcestyle

---
 src/M2ulPhyS2Boltzmann.cpp       | 20 +++++++------------
 src/chemistry.cpp                | 10 +++++-----
 src/chemistry.hpp                |  9 +++++----
 src/cycle_avg_joule_coupling.cpp |  1 -
 src/reaction.cpp                 | 34 ++++++++++++++++----------------
 src/reaction.hpp                 | 19 +++++++++---------
 src/tps2Boltzmann.cpp            | 16 ++++++++-------
 src/tps2Boltzmann.hpp            |  4 ++--
 8 files changed, 54 insertions(+), 59 deletions(-)

diff --git a/src/M2ulPhyS2Boltzmann.cpp b/src/M2ulPhyS2Boltzmann.cpp
index e3d60bf20..6d150d3e2 100644
--- a/src/M2ulPhyS2Boltzmann.cpp
+++ b/src/M2ulPhyS2Boltzmann.cpp
@@ -83,22 +83,16 @@ void M2ulPhyS::push(TPS::Tps2Boltzmann &interface) {
   delete electronTemperature;
 }
 
-void M2ulPhyS::fetch(TPS::Tps2Boltzmann &interface) { 
-
+void M2ulPhyS::fetch(TPS::Tps2Boltzmann &interface) {
   mfem::ParaViewDataCollection paraview_dc("interface", mesh);
   paraview_dc.SetPrefixPath("BoltzmannInterface");
   paraview_dc.SetCycle(0);
   paraview_dc.SetDataFormat(VTKFormat::BINARY);
   paraview_dc.SetTime(0.0);
-  paraview_dc.RegisterField("Heavy temperature",
-                            &interface.Field(TPS::Tps2Boltzmann::Index::HeavyTemperature));
-  paraview_dc.RegisterField("Electron temperature",
-                            &interface.Field(TPS::Tps2Boltzmann::Index::ElectronTemperature));
-  paraview_dc.RegisterField("Electric field",
-                            &interface.Field(TPS::Tps2Boltzmann::Index::ElectricField));
-  paraview_dc.RegisterField("Species",
-                            &interface.Field(TPS::Tps2Boltzmann::Index::SpeciesDensities));
-  paraview_dc.RegisterField("Reaction rates",
-                             &interface.Field(TPS::Tps2Boltzmann::Index::ReactionRates));
+  paraview_dc.RegisterField("Heavy temperature", &interface.Field(TPS::Tps2Boltzmann::Index::HeavyTemperature));
+  paraview_dc.RegisterField("Electron temperature", &interface.Field(TPS::Tps2Boltzmann::Index::ElectronTemperature));
+  paraview_dc.RegisterField("Electric field", &interface.Field(TPS::Tps2Boltzmann::Index::ElectricField));
+  paraview_dc.RegisterField("Species", &interface.Field(TPS::Tps2Boltzmann::Index::SpeciesDensities));
+  paraview_dc.RegisterField("Reaction rates", &interface.Field(TPS::Tps2Boltzmann::Index::ReactionRates));
   paraview_dc.Save();
- }
+}
diff --git a/src/chemistry.cpp b/src/chemistry.cpp
index 30e31d292..45eab9c9c 100644
--- a/src/chemistry.cpp
+++ b/src/chemistry.cpp
@@ -109,11 +109,10 @@ MFEM_HOST_DEVICE Chemistry::~Chemistry() {
   }
 }
 
-void Chemistry::setGridFunctionRates(const mfem::GridFunction & f)
-{
+void Chemistry::setGridFunctionRates(const mfem::GridFunction &f) {
   for (int r = 0; r < numReactions_; r++) {
     if (reactions_[r]->reactionModel == GRIDFUNCTION_RXN) {
-      GridFunctionReaction * rx = dynamic_cast<GridFunctionReaction*>(reactions_[r]);
+      GridFunctionReaction *rx = dynamic_cast<GridFunctionReaction *>(reactions_[r]);
       rx->setGridFunctionData(f);
     }
   }
@@ -138,7 +137,8 @@ void Chemistry::computeForwardRateCoeffs(const double &T_h, const double &T_e, V
 }
 #endif
 
-MFEM_HOST_DEVICE void Chemistry::computeForwardRateCoeffs(const double &T_h, const double &T_e, const int & dofindex, double *kfwd) {
+MFEM_HOST_DEVICE void Chemistry::computeForwardRateCoeffs(const double &T_h, const double &T_e, const int &dofindex,
+                                                          double *kfwd) {
   // kfwd.SetSize(numReactions_);
   for (int r = 0; r < numReactions_; r++) kfwd[r] = 0.0;
 
@@ -147,7 +147,7 @@ MFEM_HOST_DEVICE void Chemistry::computeForwardRateCoeffs(const double &T_h, con
 
   for (int r = 0; r < numReactions_; r++) {
     bool isElectronInvolved = isElectronInvolvedAt(r);
-    kfwd[r] = reactions_[r]->computeRateCoefficient(Thlim , Telim, dofindex, isElectronInvolved);
+    kfwd[r] = reactions_[r]->computeRateCoefficient(Thlim, Telim, dofindex, isElectronInvolved);
   }
 
   return;
diff --git a/src/chemistry.hpp b/src/chemistry.hpp
index 6b0ce6e46..a80e5fb2b 100644
--- a/src/chemistry.hpp
+++ b/src/chemistry.hpp
@@ -96,15 +96,16 @@ class Chemistry {
   MFEM_HOST_DEVICE ~Chemistry();
 
   // Set the grid function rates for GRIDFUNCTION_RXN reaction types
-  void setGridFunctionRates(const mfem::GridFunction & f);
+  void setGridFunctionRates(const mfem::GridFunction &f);
 
   // return Vector of reaction rate coefficients, with the size of numReaction_.
   // WARNING(marc) I have removed "virtual" qualifier here assuming these functions will not
   // change for child classes. Correct if wrong
-  //void computeForwardRateCoeffs(const double &T_h, const double &T_e, Vector &kfwd);
-  MFEM_HOST_DEVICE void computeForwardRateCoeffs(const double &T_h, const double &T_e, const int & dofindex, double *kfwd);
+  // void computeForwardRateCoeffs(const double &T_h, const double &T_e, Vector &kfwd);
+  MFEM_HOST_DEVICE void computeForwardRateCoeffs(const double &T_h, const double &T_e, const int &dofindex,
+                                                 double *kfwd);
 
-  //void computeEquilibriumConstants(const double &T_h, const double &T_e, Vector &kC);
+  // void computeEquilibriumConstants(const double &T_h, const double &T_e, Vector &kC);
   MFEM_HOST_DEVICE void computeEquilibriumConstants(const double &T_h, const double &T_e, double *kC);
 
   // return rate coefficients of (reactionIndex)-th reaction. (start from 0)
diff --git a/src/cycle_avg_joule_coupling.cpp b/src/cycle_avg_joule_coupling.cpp
index f5f5c3a5c..acf78235b 100644
--- a/src/cycle_avg_joule_coupling.cpp
+++ b/src/cycle_avg_joule_coupling.cpp
@@ -313,7 +313,6 @@ void CycleAvgJouleCoupling::interpElectricFieldFromEMToFlow() {
   efieldR_->SetFromTrueDofs(interp_vals);
   efieldR_->HostRead();
 
-
   const ParGridFunction *efield_imag_gf = qmsa_solver_->getElectricFieldimag();
   interp_em_to_flow_->Interpolate(vxyz, *efield_imag_gf, interp_vals);
   efieldI_->SetFromTrueDofs(interp_vals);
diff --git a/src/reaction.cpp b/src/reaction.cpp
index 5720f0966..e2a5c952d 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -39,7 +39,7 @@ MFEM_HOST_DEVICE Arrhenius::Arrhenius(const double &A, const double &b, const do
     : Reaction(ARRHENIUS), A_(A), b_(b), E_(E) {}
 
 MFEM_HOST_DEVICE double Arrhenius::computeRateCoefficient(const double &T_h, const double &T_e,
-                                                          [[maybe_unused]] const int & dofindex,
+                                                          [[maybe_unused]] const int &dofindex,
                                                           const bool isElectronInvolved) {
   double temp = (isElectronInvolved) ? T_e : T_h;
 
@@ -50,7 +50,7 @@ MFEM_HOST_DEVICE HoffertLien::HoffertLien(const double &A, const double &b, cons
     : Reaction(HOFFERTLIEN), A_(A), b_(b), E_(E) {}
 
 MFEM_HOST_DEVICE double HoffertLien::computeRateCoefficient(const double &T_h, const double &T_e,
-                                                            [[maybe_unused]] const int & dofindex,
+                                                            [[maybe_unused]] const int &dofindex,
                                                             const bool isElectronInvolved) {
   double temp = (isElectronInvolved) ? T_e : T_h;
   double tempFactor = E_ / BOLTZMANNCONSTANT / temp;
@@ -73,29 +73,29 @@ MFEM_HOST_DEVICE Tabulated::Tabulated(const TableInput &input) : Reaction(TABULA
 MFEM_HOST_DEVICE Tabulated::~Tabulated() { delete table_; }
 
 MFEM_HOST_DEVICE double Tabulated::computeRateCoefficient(const double &T_h, const double &T_e,
-                                                          [[maybe_unused]] const int & dofindex,
+                                                          [[maybe_unused]] const int &dofindex,
                                                           const bool isElectronInvolved) {
   double temp = (isElectronInvolved) ? T_e : T_h;
   return table_->eval(temp);
 }
 
-MFEM_HOST_DEVICE GridFunctionReaction::GridFunctionReaction(int comp):
-Reaction(GRIDFUNCTION_RXN), data( nullptr ), comp(comp) { }
+MFEM_HOST_DEVICE GridFunctionReaction::GridFunctionReaction(int comp)
+    : Reaction(GRIDFUNCTION_RXN), data(nullptr), comp(comp) {}
 
-MFEM_HOST_DEVICE GridFunctionReaction::~GridFunctionReaction() { }
+MFEM_HOST_DEVICE GridFunctionReaction::~GridFunctionReaction() {}
 
-void GridFunctionReaction::setGridFunctionData(const mfem::GridFunction & f) {
-  assert( f.Size() >= (comp+1)*f.FESpace()->GetNDofs() );
-  #ifdef _GPU_
-  data = f.Read() + comp*f.FESpace()->GetNDofs();
-  #else
-  data = f.HostRead() + comp*f.FESpace()->GetNDofs();
-  #endif
+void GridFunctionReaction::setGridFunctionData(const mfem::GridFunction &f) {
+  assert(f.Size() >= (comp + 1) * f.FESpace()->GetNDofs());
+#ifdef _GPU_
+  data = f.Read() + comp * f.FESpace()->GetNDofs();
+#else
+  data = f.HostRead() + comp * f.FESpace()->GetNDofs();
+#endif
 }
 
-MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unused]] const double &T_h, 
-                                                       [[maybe_unused]] const double &T_e,
-                                                       const int & dofindex,
-                                                       [[maybe_unused]] const bool isElectronInvolved) {
+MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unused]] const double &T_h,
+                                                                     [[maybe_unused]] const double &T_e,
+                                                                     const int &dofindex,
+                                                                     [[maybe_unused]] const bool isElectronInvolved) {
   return data[dofindex];
 }
diff --git a/src/reaction.hpp b/src/reaction.hpp
index aae3b5789..b8c666219 100644
--- a/src/reaction.hpp
+++ b/src/reaction.hpp
@@ -54,12 +54,12 @@ class Reaction {
  protected:
  public:
   const ReactionModel reactionModel;
-  MFEM_HOST_DEVICE Reaction(ReactionModel rm): reactionModel(rm) {}
+  MFEM_HOST_DEVICE Reaction(ReactionModel rm) : reactionModel(rm) {}
 
   MFEM_HOST_DEVICE virtual ~Reaction() {}
 
   MFEM_HOST_DEVICE virtual double computeRateCoefficient(const double &T_h, const double &T_e,
-                                                         [[maybe_unused]] const int & dofindex,
+                                                         [[maybe_unused]] const int &dofindex,
                                                          const bool isElectronInvolved = false) {
     printf("computeRateCoefficient not implemented");
     return 0;
@@ -79,7 +79,7 @@ class Arrhenius : public Reaction {
   MFEM_HOST_DEVICE virtual ~Arrhenius() {}
 
   MFEM_HOST_DEVICE virtual double computeRateCoefficient(const double &T_h, const double &T_e,
-                                                         [[maybe_unused]] const int & dofindex,
+                                                         [[maybe_unused]] const int &dofindex,
                                                          const bool isElectronInvolved = false);
 };
 
@@ -100,7 +100,7 @@ class HoffertLien : public Reaction {
   MFEM_HOST_DEVICE virtual ~HoffertLien() {}
 
   MFEM_HOST_DEVICE virtual double computeRateCoefficient(const double &T_h, const double &T_e,
-                                                         [[maybe_unused]] const int & dofindex,
+                                                         [[maybe_unused]] const int &dofindex,
                                                          const bool isElectronInvolved = false);
 };
 
@@ -114,13 +114,13 @@ class Tabulated : public Reaction {
   MFEM_HOST_DEVICE virtual ~Tabulated();
 
   MFEM_HOST_DEVICE virtual double computeRateCoefficient(const double &T_h, const double &T_e,
-                                                         [[maybe_unused]] const int & dofindex,
+                                                         [[maybe_unused]] const int &dofindex,
                                                          const bool isElectronInvolved = false);
 };
 
 class GridFunctionReaction : public Reaction {
  private:
-  const double * data;
+  const double *data;
   const int comp;
 
  public:
@@ -128,11 +128,10 @@ class GridFunctionReaction : public Reaction {
 
   MFEM_HOST_DEVICE virtual ~GridFunctionReaction();
 
-  void setGridFunctionData(const mfem::GridFunction & f);
+  void setGridFunctionData(const mfem::GridFunction &f);
 
-  MFEM_HOST_DEVICE virtual double computeRateCoefficient([[maybe_unused]] const double &T_h, 
-                                                         [[maybe_unused]] const double &T_e,
-                                                         const int & dofindex,
+  MFEM_HOST_DEVICE virtual double computeRateCoefficient([[maybe_unused]] const double &T_h,
+                                                         [[maybe_unused]] const double &T_e, const int &dofindex,
                                                          [[maybe_unused]] const bool isElectronInvolved = false);
 };
 
diff --git a/src/tps2Boltzmann.cpp b/src/tps2Boltzmann.cpp
index a6b5800f7..8cb35d0c8 100644
--- a/src/tps2Boltzmann.cpp
+++ b/src/tps2Boltzmann.cpp
@@ -41,9 +41,10 @@
 #include <pybind11/stl.h>
 #endif
 
+#include <math.h>
+
 #include <cstddef>
 #include <cstdlib>
-#include <math.h>
 
 namespace TPS {
 
@@ -73,8 +74,8 @@ class CPUData {
   size_t stride_;
 };
 
-void idenity_fun(const Vector & x, Vector & out) {
-  for ( int i(0); i < x.Size(); ++i ) out[i] = x[i];
+void idenity_fun(const Vector &x, Vector &out) {
+  for (int i(0); i < x.Size(); ++i) out[i] = x[i];
 }
 
 Tps2Boltzmann::Tps2Boltzmann(Tps *tps) : NIndexes(7), tps_(tps), all_fes_(nullptr) {
@@ -89,7 +90,7 @@ Tps2Boltzmann::Tps2Boltzmann(Tps *tps) : NIndexes(7), tps_(tps), all_fes_(nullpt
   assert(basis_type_ == 0 || basis_type_ == 1);
 
   tps->getRequiredInput("em/current_frequency", EfieldAngularFreq_);
-  EfieldAngularFreq_ *= 2.*M_PI;
+  EfieldAngularFreq_ *= 2. * M_PI;
 
   offsets.SetSize(NIndexes + 1);
   ncomps.SetSize(NIndexes + 1);
@@ -277,9 +278,10 @@ void tps2bolzmann(py::module &m) {
            [](TPS::Tps2Boltzmann &interface, TPS::Tps2Boltzmann::Index index) {
              return std::unique_ptr<TPS::CPUData>(new TPS::CPUData(interface.Field(index), false));
            })
-      .def("HostReadWrite", [](TPS::Tps2Boltzmann &interface, TPS::Tps2Boltzmann::Index index) {
-        return std::unique_ptr<TPS::CPUData>(new TPS::CPUData(interface.Field(index), true));
-      })
+      .def("HostReadWrite",
+           [](TPS::Tps2Boltzmann &interface, TPS::Tps2Boltzmann::Index index) {
+             return std::unique_ptr<TPS::CPUData>(new TPS::CPUData(interface.Field(index), true));
+           })
       .def("EfieldAngularFreq", &TPS::Tps2Boltzmann::EfieldAngularFreq)
       .def("Nspecies", &TPS::Tps2Boltzmann::Nspecies)
       .def("NeFiledComps", &TPS::Tps2Boltzmann::NeFieldComps)
diff --git a/src/tps2Boltzmann.hpp b/src/tps2Boltzmann.hpp
index 4895c523b..91cbd1a54 100644
--- a/src/tps2Boltzmann.hpp
+++ b/src/tps2Boltzmann.hpp
@@ -99,8 +99,8 @@ class Tps2Boltzmann {
   const mfem::ParFiniteElementSpace &NativeFes(Index index) const { return *(list_native_fes_[index]); }
   mfem::ParFiniteElementSpace &NativeFes(Index index) { return *(list_native_fes_[index]); }
 
-  const mfem::ParGridFunction & SpatialCoordinates() const { return *spatial_coordinates_; }
-  mfem::ParGridFunction & SpatialCoordinates() { return *spatial_coordinates_; }
+  const mfem::ParGridFunction &SpatialCoordinates() const { return *spatial_coordinates_; }
+  mfem::ParGridFunction &SpatialCoordinates() { return *spatial_coordinates_; }
 
   const mfem::ParGridFunction &Field(Index index) const { return *(fields_[index]); }
   mfem::ParGridFunction &Field(Index index) { return *(fields_[index]); }

From 248aa45b952eaf3277a0a175353d975c8bfdcab8 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 5 Jan 2024 12:02:12 -0600
Subject: [PATCH 18/75] make style

---
 src/chemistry.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chemistry.cpp b/src/chemistry.cpp
index 45eab9c9c..10fa0190b 100644
--- a/src/chemistry.cpp
+++ b/src/chemistry.cpp
@@ -118,7 +118,7 @@ void Chemistry::setGridFunctionRates(const mfem::GridFunction &f) {
   }
 }
 
-#if 0 
+#if 0
 void Chemistry::computeForwardRateCoeffs(const double &T_h, const double &T_e, Vector &kfwd) {
   kfwd.SetSize(numReactions_);
 

From e45626218c2a85b3f8db1ee83467bf405932e759 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 5 Jan 2024 23:03:50 -0600
Subject: [PATCH 19/75] Complete interface

---
 src/M2ulPhyS2Boltzmann.cpp | 15 +++--------
 src/chemistry.cpp          |  2 +-
 src/chemistry.hpp          |  2 +-
 src/reaction.cpp           |  9 ++++---
 src/reaction.hpp           |  3 ++-
 src/tps2Boltzmann.cpp      | 55 ++++++++++++++++++++++++++++++++++++--
 src/tps2Boltzmann.hpp      |  7 +++++
 7 files changed, 73 insertions(+), 20 deletions(-)

diff --git a/src/M2ulPhyS2Boltzmann.cpp b/src/M2ulPhyS2Boltzmann.cpp
index 6d150d3e2..bf1a55801 100644
--- a/src/M2ulPhyS2Boltzmann.cpp
+++ b/src/M2ulPhyS2Boltzmann.cpp
@@ -84,15 +84,8 @@ void M2ulPhyS::push(TPS::Tps2Boltzmann &interface) {
 }
 
 void M2ulPhyS::fetch(TPS::Tps2Boltzmann &interface) {
-  mfem::ParaViewDataCollection paraview_dc("interface", mesh);
-  paraview_dc.SetPrefixPath("BoltzmannInterface");
-  paraview_dc.SetCycle(0);
-  paraview_dc.SetDataFormat(VTKFormat::BINARY);
-  paraview_dc.SetTime(0.0);
-  paraview_dc.RegisterField("Heavy temperature", &interface.Field(TPS::Tps2Boltzmann::Index::HeavyTemperature));
-  paraview_dc.RegisterField("Electron temperature", &interface.Field(TPS::Tps2Boltzmann::Index::ElectronTemperature));
-  paraview_dc.RegisterField("Electric field", &interface.Field(TPS::Tps2Boltzmann::Index::ElectricField));
-  paraview_dc.RegisterField("Species", &interface.Field(TPS::Tps2Boltzmann::Index::SpeciesDensities));
-  paraview_dc.RegisterField("Reaction rates", &interface.Field(TPS::Tps2Boltzmann::Index::ReactionRates));
-  paraview_dc.Save();
+  mfem::ParFiniteElementSpace * reaction_rates_fes(&(interface.NativeFes(TPS::Tps2Boltzmann::Index::ReactionRates)));
+  std::shared_ptr<mfem::ParGridFunction> reaction_rates(new mfem::ParGridFunction( reaction_rates_fes ) );
+  interface.interpolateToNativeFES(*reaction_rates, TPS::Tps2Boltzmann::Index::ReactionRates);
+  chemistry_->setGridFunctionRates(reaction_rates);
 }
diff --git a/src/chemistry.cpp b/src/chemistry.cpp
index 10fa0190b..f233d2ed5 100644
--- a/src/chemistry.cpp
+++ b/src/chemistry.cpp
@@ -109,7 +109,7 @@ MFEM_HOST_DEVICE Chemistry::~Chemistry() {
   }
 }
 
-void Chemistry::setGridFunctionRates(const mfem::GridFunction &f) {
+void Chemistry::setGridFunctionRates(std::shared_ptr<mfem::ParGridFunction> &f) {
   for (int r = 0; r < numReactions_; r++) {
     if (reactions_[r]->reactionModel == GRIDFUNCTION_RXN) {
       GridFunctionReaction *rx = dynamic_cast<GridFunctionReaction *>(reactions_[r]);
diff --git a/src/chemistry.hpp b/src/chemistry.hpp
index a80e5fb2b..0463f0f22 100644
--- a/src/chemistry.hpp
+++ b/src/chemistry.hpp
@@ -96,7 +96,7 @@ class Chemistry {
   MFEM_HOST_DEVICE ~Chemistry();
 
   // Set the grid function rates for GRIDFUNCTION_RXN reaction types
-  void setGridFunctionRates(const mfem::GridFunction &f);
+  void setGridFunctionRates(std::shared_ptr<mfem::ParGridFunction> &f);
 
   // return Vector of reaction rate coefficients, with the size of numReaction_.
   // WARNING(marc) I have removed "virtual" qualifier here assuming these functions will not
diff --git a/src/reaction.cpp b/src/reaction.cpp
index e2a5c952d..a8a984ef8 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -84,12 +84,13 @@ MFEM_HOST_DEVICE GridFunctionReaction::GridFunctionReaction(int comp)
 
 MFEM_HOST_DEVICE GridFunctionReaction::~GridFunctionReaction() {}
 
-void GridFunctionReaction::setGridFunctionData(const mfem::GridFunction &f) {
-  assert(f.Size() >= (comp + 1) * f.FESpace()->GetNDofs());
+void GridFunctionReaction::setGridFunctionData(std::shared_ptr<mfem::ParGridFunction> &f) {
+  f_ = f;
+  assert(f->Size() >= (comp + 1) * f->FESpace()->GetNDofs());
 #ifdef _GPU_
-  data = f.Read() + comp * f.FESpace()->GetNDofs();
+  data = f_->Read() + comp * f_->FESpace()->GetNDofs();
 #else
-  data = f.HostRead() + comp * f.FESpace()->GetNDofs();
+  data = f_->HostRead() + comp * f_->FESpace()->GetNDofs();
 #endif
 }
 
diff --git a/src/reaction.hpp b/src/reaction.hpp
index b8c666219..daf0406ed 100644
--- a/src/reaction.hpp
+++ b/src/reaction.hpp
@@ -120,6 +120,7 @@ class Tabulated : public Reaction {
 
 class GridFunctionReaction : public Reaction {
  private:
+  std::shared_ptr<mfem::GridFunction> f_;
   const double *data;
   const int comp;
 
@@ -128,7 +129,7 @@ class GridFunctionReaction : public Reaction {
 
   MFEM_HOST_DEVICE virtual ~GridFunctionReaction();
 
-  void setGridFunctionData(const mfem::GridFunction &f);
+  void setGridFunctionData(std::shared_ptr<mfem::ParGridFunction> &f);
 
   MFEM_HOST_DEVICE virtual double computeRateCoefficient([[maybe_unused]] const double &T_h,
                                                          [[maybe_unused]] const double &T_e, const int &dofindex,
diff --git a/src/tps2Boltzmann.cpp b/src/tps2Boltzmann.cpp
index 8cb35d0c8..ea554f571 100644
--- a/src/tps2Boltzmann.cpp
+++ b/src/tps2Boltzmann.cpp
@@ -78,7 +78,12 @@ void idenity_fun(const Vector &x, Vector &out) {
   for (int i(0); i < x.Size(); ++i) out[i] = x[i];
 }
 
-Tps2Boltzmann::Tps2Boltzmann(Tps *tps) : NIndexes(7), tps_(tps), all_fes_(nullptr) {
+Tps2Boltzmann::Tps2Boltzmann(Tps *tps) : 
+  NIndexes(7),
+  tps_(tps),
+  all_fes_(nullptr),
+  save_to_paraview_dc(false),
+  paraview_dc(nullptr) {
   // Assert we have a couple solver;
   assert(tps->isFlowEMCoupled());
 
@@ -92,6 +97,8 @@ Tps2Boltzmann::Tps2Boltzmann(Tps *tps) : NIndexes(7), tps_(tps), all_fes_(nullpt
   tps->getRequiredInput("em/current_frequency", EfieldAngularFreq_);
   EfieldAngularFreq_ *= 2. * M_PI;
 
+  tps->getInput("boltzmannInterface/save_to_paraview", save_to_paraview_dc);
+
   offsets.SetSize(NIndexes + 1);
   ncomps.SetSize(NIndexes + 1);
 }
@@ -170,12 +177,28 @@ void Tps2Boltzmann::init(M2ulPhyS *flowSolver) {
   scalar_interpolator_->SetAssemblyLevel(assembly_level);
   scalar_interpolator_->Assemble();
 
+  scalar_interpolator_to_nativeFES_= new mfem::ParDiscreteLinearOperator(scalar_fes_, scalar_native_fes_);
+  scalar_interpolator_to_nativeFES_->AddDomainInterpolator(new mfem::IdentityInterpolator());
+  scalar_interpolator_to_nativeFES_->SetAssemblyLevel(assembly_level);
+  scalar_interpolator_to_nativeFES_->Assemble();
+
   // Spatial coordinates
   spatial_coord_fes_ = new mfem::ParFiniteElementSpace(pmesh, fec_, pmesh->Dimension(), mfem::Ordering::byNODES);
   spatial_coordinates_ = new mfem::ParGridFunction(spatial_coord_fes_);
   mfem::VectorFunctionCoefficient coord_fun(pmesh->Dimension(),
                                             std::function<void(const Vector &, Vector &)>(idenity_fun));
   spatial_coordinates_->ProjectCoefficient(coord_fun);
+
+  if(save_to_paraview_dc) {
+      paraview_dc = new mfem::ParaViewDataCollection("interface", pmesh);
+      paraview_dc->SetPrefixPath("BoltzmannInterface");
+      paraview_dc->SetDataFormat(VTKFormat::BINARY);
+      paraview_dc->RegisterField("Heavy temperature", &(this->Field(TPS::Tps2Boltzmann::Index::HeavyTemperature)));
+      paraview_dc->RegisterField("Electron temperature", &(this->Field(TPS::Tps2Boltzmann::Index::ElectronTemperature)));
+      paraview_dc->RegisterField("Electric field", &(this->Field(TPS::Tps2Boltzmann::Index::ElectricField)));
+      paraview_dc->RegisterField("Species", &(this->Field(TPS::Tps2Boltzmann::Index::SpeciesDensities)));
+      paraview_dc->RegisterField("Reaction rates", &(this->Field(TPS::Tps2Boltzmann::Index::ReactionRates)));
+  }
 }
 
 void Tps2Boltzmann::interpolateFromNativeFES(const ParGridFunction &input, Tps2Boltzmann::Index index) {
@@ -193,6 +216,29 @@ void Tps2Boltzmann::interpolateFromNativeFES(const ParGridFunction &input, Tps2B
   }
 }
 
+void Tps2Boltzmann::interpolateToNativeFES(ParGridFunction &output, Index index) {
+  if (ncomps[index] == 1) {
+    scalar_interpolator_to_nativeFES_->Mult(*(fields_[index]), output);
+  } else {
+    const int loc_size_native = list_native_fes_[index]->GetNDofs();
+    const int loc_size = list_fes_[index]->GetNDofs();
+    for (int icomp(0); icomp < ncomps[index]; ++icomp) {
+      mfem::Vector view_output(output, icomp * loc_size_native,
+                                    loc_size_native);
+      mfem::Vector view_field(*(fields_[index]), icomp * loc_size, loc_size);
+      scalar_interpolator_to_nativeFES_->Mult(view_field, view_output);
+    }
+  }
+}
+
+void Tps2Boltzmann::saveDataCollection(int cycle, double time) {
+  if ( paraview_dc) {
+    paraview_dc->SetCycle(cycle);
+    paraview_dc->SetTime(time);
+    paraview_dc->Save();
+  }
+}
+
 Tps2Boltzmann::~Tps2Boltzmann() {
   // Delete views
   for (std::size_t i(0); i < NIndexes + 1; ++i) delete fields_[i];
@@ -200,6 +246,7 @@ Tps2Boltzmann::~Tps2Boltzmann() {
   delete[] fields_;
 
   // Delete interpolators
+  delete scalar_interpolator_to_nativeFES_;
   delete scalar_interpolator_;
 
   // Delete view Native Finite Element Spaces
@@ -285,7 +332,11 @@ void tps2bolzmann(py::module &m) {
       .def("EfieldAngularFreq", &TPS::Tps2Boltzmann::EfieldAngularFreq)
       .def("Nspecies", &TPS::Tps2Boltzmann::Nspecies)
       .def("NeFiledComps", &TPS::Tps2Boltzmann::NeFieldComps)
-      .def("nComponents", &TPS::Tps2Boltzmann::nComponents);
+      .def("nComponents", &TPS::Tps2Boltzmann::nComponents)
+      .def("saveDataCollection",
+           &TPS::Tps2Boltzmann::saveDataCollection,
+           "Save the data collection in Paraview format",
+           py::arg("cycle"), py::arg("time") );
 }
 }  // namespace tps_wrappers
 #endif
diff --git a/src/tps2Boltzmann.hpp b/src/tps2Boltzmann.hpp
index 91cbd1a54..c4138b755 100644
--- a/src/tps2Boltzmann.hpp
+++ b/src/tps2Boltzmann.hpp
@@ -106,6 +106,7 @@ class Tps2Boltzmann {
   mfem::ParGridFunction &Field(Index index) { return *(fields_[index]); }
 
   void interpolateFromNativeFES(const ParGridFunction &input, Index index);
+  void interpolateToNativeFES(ParGridFunction &output, Index index);
 
   //! Get the angular Frequency \omega of the electrical field:
   //! E(t) = Er*cos(\omega t) + Ei*sin(\omega t)
@@ -114,6 +115,8 @@ class Tps2Boltzmann {
   int NeFieldComps() const { return nEfieldComps_; }
   int nComponents(Index index) const { return ncomps[index]; }
 
+  void saveDataCollection(int cycle, double time);
+
   ~Tps2Boltzmann();
 
  private:
@@ -149,12 +152,16 @@ class Tps2Boltzmann {
 
   //! Linear interpolator between native TPS fec to Interface fec
   mfem::ParDiscreteLinearOperator *scalar_interpolator_;
+  mfem::ParDiscreteLinearOperator *scalar_interpolator_to_nativeFES_;
 
   //! array of fields see *Index for how to address this
   mfem::ParGridFunction **fields_;
   mfem::ParGridFunction *spatial_coordinates_;
 
   double EfieldAngularFreq_;
+
+  bool save_to_paraview_dc;
+  mfem::ParaViewDataCollection * paraview_dc;
 };
 }  // namespace TPS
 

From baddac5de2835fc578fe5c274b3fc8d9e1ca6479 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 5 Jan 2024 23:04:58 -0600
Subject: [PATCH 20/75] make enforcestyle

---
 src/M2ulPhyS2Boltzmann.cpp |  4 ++--
 src/tps2Boltzmann.cpp      | 39 ++++++++++++++++----------------------
 src/tps2Boltzmann.hpp      |  2 +-
 3 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/src/M2ulPhyS2Boltzmann.cpp b/src/M2ulPhyS2Boltzmann.cpp
index bf1a55801..4fa594e5e 100644
--- a/src/M2ulPhyS2Boltzmann.cpp
+++ b/src/M2ulPhyS2Boltzmann.cpp
@@ -84,8 +84,8 @@ void M2ulPhyS::push(TPS::Tps2Boltzmann &interface) {
 }
 
 void M2ulPhyS::fetch(TPS::Tps2Boltzmann &interface) {
-  mfem::ParFiniteElementSpace * reaction_rates_fes(&(interface.NativeFes(TPS::Tps2Boltzmann::Index::ReactionRates)));
-  std::shared_ptr<mfem::ParGridFunction> reaction_rates(new mfem::ParGridFunction( reaction_rates_fes ) );
+  mfem::ParFiniteElementSpace *reaction_rates_fes(&(interface.NativeFes(TPS::Tps2Boltzmann::Index::ReactionRates)));
+  std::shared_ptr<mfem::ParGridFunction> reaction_rates(new mfem::ParGridFunction(reaction_rates_fes));
   interface.interpolateToNativeFES(*reaction_rates, TPS::Tps2Boltzmann::Index::ReactionRates);
   chemistry_->setGridFunctionRates(reaction_rates);
 }
diff --git a/src/tps2Boltzmann.cpp b/src/tps2Boltzmann.cpp
index ea554f571..504c4560f 100644
--- a/src/tps2Boltzmann.cpp
+++ b/src/tps2Boltzmann.cpp
@@ -78,12 +78,8 @@ void idenity_fun(const Vector &x, Vector &out) {
   for (int i(0); i < x.Size(); ++i) out[i] = x[i];
 }
 
-Tps2Boltzmann::Tps2Boltzmann(Tps *tps) : 
-  NIndexes(7),
-  tps_(tps),
-  all_fes_(nullptr),
-  save_to_paraview_dc(false),
-  paraview_dc(nullptr) {
+Tps2Boltzmann::Tps2Boltzmann(Tps *tps)
+    : NIndexes(7), tps_(tps), all_fes_(nullptr), save_to_paraview_dc(false), paraview_dc(nullptr) {
   // Assert we have a couple solver;
   assert(tps->isFlowEMCoupled());
 
@@ -177,7 +173,7 @@ void Tps2Boltzmann::init(M2ulPhyS *flowSolver) {
   scalar_interpolator_->SetAssemblyLevel(assembly_level);
   scalar_interpolator_->Assemble();
 
-  scalar_interpolator_to_nativeFES_= new mfem::ParDiscreteLinearOperator(scalar_fes_, scalar_native_fes_);
+  scalar_interpolator_to_nativeFES_ = new mfem::ParDiscreteLinearOperator(scalar_fes_, scalar_native_fes_);
   scalar_interpolator_to_nativeFES_->AddDomainInterpolator(new mfem::IdentityInterpolator());
   scalar_interpolator_to_nativeFES_->SetAssemblyLevel(assembly_level);
   scalar_interpolator_to_nativeFES_->Assemble();
@@ -189,15 +185,15 @@ void Tps2Boltzmann::init(M2ulPhyS *flowSolver) {
                                             std::function<void(const Vector &, Vector &)>(idenity_fun));
   spatial_coordinates_->ProjectCoefficient(coord_fun);
 
-  if(save_to_paraview_dc) {
-      paraview_dc = new mfem::ParaViewDataCollection("interface", pmesh);
-      paraview_dc->SetPrefixPath("BoltzmannInterface");
-      paraview_dc->SetDataFormat(VTKFormat::BINARY);
-      paraview_dc->RegisterField("Heavy temperature", &(this->Field(TPS::Tps2Boltzmann::Index::HeavyTemperature)));
-      paraview_dc->RegisterField("Electron temperature", &(this->Field(TPS::Tps2Boltzmann::Index::ElectronTemperature)));
-      paraview_dc->RegisterField("Electric field", &(this->Field(TPS::Tps2Boltzmann::Index::ElectricField)));
-      paraview_dc->RegisterField("Species", &(this->Field(TPS::Tps2Boltzmann::Index::SpeciesDensities)));
-      paraview_dc->RegisterField("Reaction rates", &(this->Field(TPS::Tps2Boltzmann::Index::ReactionRates)));
+  if (save_to_paraview_dc) {
+    paraview_dc = new mfem::ParaViewDataCollection("interface", pmesh);
+    paraview_dc->SetPrefixPath("BoltzmannInterface");
+    paraview_dc->SetDataFormat(VTKFormat::BINARY);
+    paraview_dc->RegisterField("Heavy temperature", &(this->Field(TPS::Tps2Boltzmann::Index::HeavyTemperature)));
+    paraview_dc->RegisterField("Electron temperature", &(this->Field(TPS::Tps2Boltzmann::Index::ElectronTemperature)));
+    paraview_dc->RegisterField("Electric field", &(this->Field(TPS::Tps2Boltzmann::Index::ElectricField)));
+    paraview_dc->RegisterField("Species", &(this->Field(TPS::Tps2Boltzmann::Index::SpeciesDensities)));
+    paraview_dc->RegisterField("Reaction rates", &(this->Field(TPS::Tps2Boltzmann::Index::ReactionRates)));
   }
 }
 
@@ -223,8 +219,7 @@ void Tps2Boltzmann::interpolateToNativeFES(ParGridFunction &output, Index index)
     const int loc_size_native = list_native_fes_[index]->GetNDofs();
     const int loc_size = list_fes_[index]->GetNDofs();
     for (int icomp(0); icomp < ncomps[index]; ++icomp) {
-      mfem::Vector view_output(output, icomp * loc_size_native,
-                                    loc_size_native);
+      mfem::Vector view_output(output, icomp * loc_size_native, loc_size_native);
       mfem::Vector view_field(*(fields_[index]), icomp * loc_size, loc_size);
       scalar_interpolator_to_nativeFES_->Mult(view_field, view_output);
     }
@@ -232,7 +227,7 @@ void Tps2Boltzmann::interpolateToNativeFES(ParGridFunction &output, Index index)
 }
 
 void Tps2Boltzmann::saveDataCollection(int cycle, double time) {
-  if ( paraview_dc) {
+  if (paraview_dc) {
     paraview_dc->SetCycle(cycle);
     paraview_dc->SetTime(time);
     paraview_dc->Save();
@@ -333,10 +328,8 @@ void tps2bolzmann(py::module &m) {
       .def("Nspecies", &TPS::Tps2Boltzmann::Nspecies)
       .def("NeFiledComps", &TPS::Tps2Boltzmann::NeFieldComps)
       .def("nComponents", &TPS::Tps2Boltzmann::nComponents)
-      .def("saveDataCollection",
-           &TPS::Tps2Boltzmann::saveDataCollection,
-           "Save the data collection in Paraview format",
-           py::arg("cycle"), py::arg("time") );
+      .def("saveDataCollection", &TPS::Tps2Boltzmann::saveDataCollection, "Save the data collection in Paraview format",
+           py::arg("cycle"), py::arg("time"));
 }
 }  // namespace tps_wrappers
 #endif
diff --git a/src/tps2Boltzmann.hpp b/src/tps2Boltzmann.hpp
index c4138b755..71649d8d6 100644
--- a/src/tps2Boltzmann.hpp
+++ b/src/tps2Boltzmann.hpp
@@ -161,7 +161,7 @@ class Tps2Boltzmann {
   double EfieldAngularFreq_;
 
   bool save_to_paraview_dc;
-  mfem::ParaViewDataCollection * paraview_dc;
+  mfem::ParaViewDataCollection *paraview_dc;
 };
 }  // namespace TPS
 

From 804da189125753dabc182771a3fecd66b141b3cb Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Sat, 6 Jan 2024 21:45:04 -0600
Subject: [PATCH 21/75] Small edits

---
 src/M2ulPhyS.cpp | 5 +++++
 test/vpath.sh    | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/M2ulPhyS.cpp b/src/M2ulPhyS.cpp
index f529d9019..d858477ec 100644
--- a/src/M2ulPhyS.cpp
+++ b/src/M2ulPhyS.cpp
@@ -3319,6 +3319,11 @@ void M2ulPhyS::parseReactionInputs() {
       config.reactionModels[r - 1] = TABULATED_RXN;
       std::string inputPath(basepath + "/tabulated");
       readTable(inputPath, config.chemistryInput.reactionInputs[r - 1].tableInput);
+    } else if (model == "bte") {
+      config.reactionModels[r - 1] = GRIDFUNCTION_RXN;
+      int index;
+      tpsP->getRequiredInput((basepath + "index").c_str(), index);
+      config.chemistryInput.reactionInputs[r - 1].indexInput = index;
     } else {
       grvy_printf(GRVY_ERROR, "\nUnknown reaction_model -> %s", model.c_str());
       exit(ERROR);
diff --git a/test/vpath.sh b/test/vpath.sh
index a98808e8c..af1e5eead 100755
--- a/test/vpath.sh
+++ b/test/vpath.sh
@@ -30,7 +30,8 @@ if [ ! -d ref_solns ];then
 fi
 
 # necessary binaries
-binaries="bats die.sh soln_differ count_gpus.sh sniff_mpirun.sh ../src/tps.py ../src/tps-time-loop.py ../test/test_tps_splitcomm.py"
+binaries="bats die.sh soln_differ count_gpus.sh sniff_mpirun.sh "
+binaries+="../src/tps.py ../src/tps-time-loop.py ../src/tps-bte_0d3v.py ../test/test_tps_splitcomm.py"
 for binary in $binaries; do
     if [ ! -x $binary ];then
         if [ -x $testDir/$binary ];then

From 1bb3216091a915aa65c689e2348a1ddb5cdd9eca Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Sun, 7 Jan 2024 09:54:50 -0600
Subject: [PATCH 22/75] Let GridFunctionReaction return 0 if the gridfunction
 is not set; use Arrhenius rates in the Mock Boltzmann solver

---
 src/reaction.cpp     |  5 ++++-
 src/tps-time-loop.py | 35 +++++++++++++++++++++++------------
 test/vpath.sh        |  2 +-
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/reaction.cpp b/src/reaction.cpp
index a8a984ef8..ae322c934 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -98,5 +98,8 @@ MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unu
                                                                      [[maybe_unused]] const double &T_e,
                                                                      const int &dofindex,
                                                                      [[maybe_unused]] const bool isElectronInvolved) {
-  return data[dofindex];
+  if(data)
+    return data[dofindex];
+  else
+    return 0.;
 }
diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index 9bbc719d0..ce5a398a6 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -5,25 +5,36 @@
 
 from mpi4py import MPI
 
-class BoltzmannMockSolver:
+class ArrheniusSolver:
     def __init__(self):
-        pass
+        self.UNIVERSALGASCONSTANT = 8.3144598;  # J * mol^(-1) * K^(-1)
+        self.species_densities = None
+        self.efield = None
+        self.heavy_temperature = None
+        self.reaction_rates = [None, None]
+        #Reaction 1: 'Ar + E => Ar.+1 + 2 E', 
+        #Reaction 2: 'Ar.+1 + 2 E => Ar + E'
+        self.A = [74072.331348, 5.66683445516e-20]
+        self.b = [1.511, 0.368]
+        self.E = [1176329.772504, -377725.908714] # [J/mol]
 
     def fetch(self, interface):
-        species_densities = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False)
-        efield = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False)
-        heavy_temperature = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+        self.species_densities = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False)
+        self.efield = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False)
+        self.heavy_temperature = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+
 
-        print("|| species_densities ||_2 = ", np.linalg.norm(species_densities) )
-        print("|| efield ||_2 = ", np.linalg.norm(efield) )
-        print("||heavy_temperature||_2 = ", np.linalg.norm(heavy_temperature) )
 
     def solve(self):
-        pass
+        #A_ * pow(temp, b_) * exp(-E_ / UNIVERSALGASCONSTANT / temp);
+        self.reaction_rates = [A * np.pow(self.heavy_temperature, b) * 
+                               np.exp(-E/(self.UNIVERSALGASCONSTANT * self.heavy_temperature))
+                               for A,b,E in zip(self.A, self.b, self.E) ]
 
     def push(self, interface):
-        electron_temperature =  np.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
-        electron_temperature[:] = 1.
+        rates =  np.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False)
+        rates[0:self.heavy_temperature.shape[0]] = self.reaction_rates[0]
+        rates[self.heavy_temperature.shape[0]:] = self.reaction_rates[1]
 
 
 
@@ -42,7 +53,7 @@ def push(self, interface):
 tps.chooseSolver()
 tps.initialize()
 
-boltzmann = BoltzmannMockSolver()
+boltzmann = ArrheniusSolver()
 
 interface = libtps.Tps2Boltzmann(tps)
 tps.initInterface(interface)
diff --git a/test/vpath.sh b/test/vpath.sh
index af1e5eead..26d070bc1 100755
--- a/test/vpath.sh
+++ b/test/vpath.sh
@@ -31,7 +31,7 @@ fi
 
 # necessary binaries
 binaries="bats die.sh soln_differ count_gpus.sh sniff_mpirun.sh "
-binaries+="../src/tps.py ../src/tps-time-loop.py ../src/tps-bte_0d3v.py ../test/test_tps_splitcomm.py"
+binaries+="../src/tps.py ../src/tps-time-loop.py ../cdsrc/tps-bte_0d3v.py ../test/test_tps_splitcomm.py"
 for binary in $binaries; do
     if [ ! -x $binary ];then
         if [ -x $testDir/$binary ];then

From 89c3b54fd741a01bb8dc3eeaa03d9e8d0faf95c7 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 8 Jan 2024 10:34:31 -0600
Subject: [PATCH 23/75] Few extra tweaks, including computing the number of
 reactions involving BTE

---
 src/M2ulPhyS.cpp                     |  2 +-
 src/tps-time-loop.py                 | 14 ++++++++++----
 src/tps2Boltzmann.cpp                | 24 +++++++++++++++++++++---
 src/tps2Boltzmann.hpp                |  3 +++
 test/inputs/coupled-3d-boltzmann.ini |  2 +-
 5 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/src/M2ulPhyS.cpp b/src/M2ulPhyS.cpp
index d858477ec..e441b9f08 100644
--- a/src/M2ulPhyS.cpp
+++ b/src/M2ulPhyS.cpp
@@ -3322,7 +3322,7 @@ void M2ulPhyS::parseReactionInputs() {
     } else if (model == "bte") {
       config.reactionModels[r - 1] = GRIDFUNCTION_RXN;
       int index;
-      tpsP->getRequiredInput((basepath + "index").c_str(), index);
+      tpsP->getRequiredInput((basepath + "bte/index").c_str(), index);
       config.chemistryInput.reactionInputs[r - 1].indexInput = index;
     } else {
       grvy_printf(GRVY_ERROR, "\nUnknown reaction_model -> %s", model.c_str());
diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index ce5a398a6..3038ac043 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -19,6 +19,9 @@ def __init__(self):
         self.E = [1176329.772504, -377725.908714] # [J/mol]
 
     def fetch(self, interface):
+        n_reactions =interface.nComponents(libtps.t2bIndex.ReactionRates)
+        for r in range(n_reactions):
+            print("Reaction ", r+1, ": ", interface.getReactionEquation(r))
         self.species_densities = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False)
         self.efield = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False)
         self.heavy_temperature = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
@@ -27,14 +30,16 @@ def fetch(self, interface):
 
     def solve(self):
         #A_ * pow(temp, b_) * exp(-E_ / UNIVERSALGASCONSTANT / temp);
-        self.reaction_rates = [A * np.pow(self.heavy_temperature, b) * 
+        self.reaction_rates = [A * np.power(self.heavy_temperature, b) * 
                                np.exp(-E/(self.UNIVERSALGASCONSTANT * self.heavy_temperature))
                                for A,b,E in zip(self.A, self.b, self.E) ]
 
     def push(self, interface):
-        rates =  np.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False)
-        rates[0:self.heavy_temperature.shape[0]] = self.reaction_rates[0]
-        rates[self.heavy_temperature.shape[0]:] = self.reaction_rates[1]
+        n_reactions =interface.nComponents(libtps.t2bIndex.ReactionRates)
+        if n_reactions >= 2:
+            rates =  np.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False)
+            rates[0:self.heavy_temperature.shape[0]] = self.reaction_rates[0]
+            rates[self.heavy_temperature.shape[0]:] = self.reaction_rates[1]
 
 
 
@@ -69,6 +74,7 @@ def push(self, interface):
     boltzmann.fetch(interface)
     boltzmann.solve()
     boltzmann.push(interface)
+    interface.saveDataCollection(cycle=it, time=it)
     tps.fetch(interface)
     
     it = it+1
diff --git a/src/tps2Boltzmann.cpp b/src/tps2Boltzmann.cpp
index 504c4560f..69eebf8f1 100644
--- a/src/tps2Boltzmann.cpp
+++ b/src/tps2Boltzmann.cpp
@@ -84,8 +84,7 @@ Tps2Boltzmann::Tps2Boltzmann(Tps *tps)
   assert(tps->isFlowEMCoupled());
 
   tps->getRequiredInput("species/numSpecies", nspecies_);
-  // TODO(Umberto): Get the number of reactions for the solver
-  tps->getRequiredInput("boltzmannInterface/nreactions", nreactions_);
+  nreactions_ = _countBTEReactions();
   tps->getRequiredInput("boltzmannInterface/order", order_);
   tps->getRequiredInput("boltzmannInterface/basisType", basis_type_);
   assert(basis_type_ == 0 || basis_type_ == 1);
@@ -99,6 +98,24 @@ Tps2Boltzmann::Tps2Boltzmann(Tps *tps)
   ncomps.SetSize(NIndexes + 1);
 }
 
+int Tps2Boltzmann::_countBTEReactions() {
+  int total_reactions(0);
+  int bte_reactions(0);
+  tps_->getRequiredInput("reactions/number_of_reactions", total_reactions);
+  reaction_eqs_.reserve(total_reactions);
+  for ( int r(0); r<total_reactions; ++r) {
+    std::string basepath("reactions/reaction" + std::to_string(r+1));
+    std::string equation, model;
+    tps_->getRequiredInput((basepath + "/equation").c_str(), equation);
+    tps_->getRequiredInput((basepath + "/model").c_str(), model);
+    if ( model == "bte" ) {
+      ++bte_reactions;
+      reaction_eqs_.push_back(equation);
+    }
+  }
+  return bte_reactions;
+}
+
 void Tps2Boltzmann::init(M2ulPhyS *flowSolver) {
   std::cout << "Tps2Boltzmann::init is called" << std::endl;
   mfem::ParMesh *pmesh(flowSolver->GetMesh());
@@ -329,7 +346,8 @@ void tps2bolzmann(py::module &m) {
       .def("NeFiledComps", &TPS::Tps2Boltzmann::NeFieldComps)
       .def("nComponents", &TPS::Tps2Boltzmann::nComponents)
       .def("saveDataCollection", &TPS::Tps2Boltzmann::saveDataCollection, "Save the data collection in Paraview format",
-           py::arg("cycle"), py::arg("time"));
+           py::arg("cycle"), py::arg("time"))
+      .def("getReactionEquation", &TPS::Tps2Boltzmann::getReactionEquation, "Return the equation of the reaction", py::arg("index"));
 }
 }  // namespace tps_wrappers
 #endif
diff --git a/src/tps2Boltzmann.hpp b/src/tps2Boltzmann.hpp
index 71649d8d6..82d4e8ee7 100644
--- a/src/tps2Boltzmann.hpp
+++ b/src/tps2Boltzmann.hpp
@@ -114,12 +114,14 @@ class Tps2Boltzmann {
   int Nspecies() const { return nspecies_; }
   int NeFieldComps() const { return nEfieldComps_; }
   int nComponents(Index index) const { return ncomps[index]; }
+  std::string getReactionEquation(int index) const { return reaction_eqs_[index]; }
 
   void saveDataCollection(int cycle, double time);
 
   ~Tps2Boltzmann();
 
  private:
+  int _countBTEReactions();
   Tps *tps_;
 
   int nspecies_;
@@ -162,6 +164,7 @@ class Tps2Boltzmann {
 
   bool save_to_paraview_dc;
   mfem::ParaViewDataCollection *paraview_dc;
+  std::vector<std::string> reaction_eqs_;
 };
 }  // namespace TPS
 
diff --git a/test/inputs/coupled-3d-boltzmann.ini b/test/inputs/coupled-3d-boltzmann.ini
index 9764096b2..980e62cde 100644
--- a/test/inputs/coupled-3d-boltzmann.ini
+++ b/test/inputs/coupled-3d-boltzmann.ini
@@ -141,6 +141,6 @@ permeability = 1.25663706e-6     # m * kg / s^2 / A^2
 preconditioner_background_sigma = 0.01
 
 [boltzmannInterface]
-nreactions = 1
+save_to_paraview = true
 order = 0
 basisType = 0
\ No newline at end of file

From 062a1eda649171fdfbb7a0d77b0b4bd2966831be Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 8 Jan 2024 10:36:25 -0600
Subject: [PATCH 24/75] make enforcestyle

---
 src/reaction.cpp      | 2 +-
 src/tps2Boltzmann.cpp | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/reaction.cpp b/src/reaction.cpp
index ae322c934..215f55015 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -98,7 +98,7 @@ MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unu
                                                                      [[maybe_unused]] const double &T_e,
                                                                      const int &dofindex,
                                                                      [[maybe_unused]] const bool isElectronInvolved) {
-  if(data)
+  if (data)
     return data[dofindex];
   else
     return 0.;
diff --git a/src/tps2Boltzmann.cpp b/src/tps2Boltzmann.cpp
index 69eebf8f1..9c13517e0 100644
--- a/src/tps2Boltzmann.cpp
+++ b/src/tps2Boltzmann.cpp
@@ -103,12 +103,12 @@ int Tps2Boltzmann::_countBTEReactions() {
   int bte_reactions(0);
   tps_->getRequiredInput("reactions/number_of_reactions", total_reactions);
   reaction_eqs_.reserve(total_reactions);
-  for ( int r(0); r<total_reactions; ++r) {
-    std::string basepath("reactions/reaction" + std::to_string(r+1));
+  for (int r(0); r < total_reactions; ++r) {
+    std::string basepath("reactions/reaction" + std::to_string(r + 1));
     std::string equation, model;
     tps_->getRequiredInput((basepath + "/equation").c_str(), equation);
     tps_->getRequiredInput((basepath + "/model").c_str(), model);
-    if ( model == "bte" ) {
+    if (model == "bte") {
       ++bte_reactions;
       reaction_eqs_.push_back(equation);
     }
@@ -347,7 +347,8 @@ void tps2bolzmann(py::module &m) {
       .def("nComponents", &TPS::Tps2Boltzmann::nComponents)
       .def("saveDataCollection", &TPS::Tps2Boltzmann::saveDataCollection, "Save the data collection in Paraview format",
            py::arg("cycle"), py::arg("time"))
-      .def("getReactionEquation", &TPS::Tps2Boltzmann::getReactionEquation, "Return the equation of the reaction", py::arg("index"));
+      .def("getReactionEquation", &TPS::Tps2Boltzmann::getReactionEquation, "Return the equation of the reaction",
+           py::arg("index"));
 }
 }  // namespace tps_wrappers
 #endif

From c94674c15ed192615841009fbe3092b1706eeac2 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 8 Jan 2024 12:11:07 -0600
Subject: [PATCH 25/75] Bugfixes

---
 src/M2ulPhyS.cpp      |  4 ++--
 src/reaction.cpp      | 16 ++++++++++------
 src/reaction.hpp      |  1 +
 src/tps-time-loop.py  |  3 +++
 src/tps2Boltzmann.cpp |  2 +-
 5 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/M2ulPhyS.cpp b/src/M2ulPhyS.cpp
index e441b9f08..99c56f144 100644
--- a/src/M2ulPhyS.cpp
+++ b/src/M2ulPhyS.cpp
@@ -3322,7 +3322,7 @@ void M2ulPhyS::parseReactionInputs() {
     } else if (model == "bte") {
       config.reactionModels[r - 1] = GRIDFUNCTION_RXN;
       int index;
-      tpsP->getRequiredInput((basepath + "bte/index").c_str(), index);
+      tpsP->getRequiredInput((basepath + "/bte/index").c_str(), index);
       config.chemistryInput.reactionInputs[r - 1].indexInput = index;
     } else {
       grvy_printf(GRVY_ERROR, "\nUnknown reaction_model -> %s", model.c_str());
@@ -3442,7 +3442,7 @@ void M2ulPhyS::parseReactionInputs() {
             config.equilibriumConstantParams[p + r * gpudata::MAXCHEMPARAMS];
       }
 
-      if (config.reactionModels[r] != TABULATED_RXN) {
+      if (config.reactionModels[r] == ARRHENIUS || config.reactionModels[r] == HOFFERTLIEN) {
         assert(rxn_param_idx < config.rxnModelParamsHost.size());
         config.chemistryInput.reactionInputs[r].modelParams = config.rxnModelParamsHost[rxn_param_idx].Read();
         rxn_param_idx += 1;
diff --git a/src/reaction.cpp b/src/reaction.cpp
index 215f55015..6ca9355f6 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -80,17 +80,19 @@ MFEM_HOST_DEVICE double Tabulated::computeRateCoefficient(const double &T_h, con
 }
 
 MFEM_HOST_DEVICE GridFunctionReaction::GridFunctionReaction(int comp)
-    : Reaction(GRIDFUNCTION_RXN), data(nullptr), comp(comp) {}
+    : Reaction(GRIDFUNCTION_RXN), data(nullptr), comp(comp), size_(0) {}
 
 MFEM_HOST_DEVICE GridFunctionReaction::~GridFunctionReaction() {}
 
 void GridFunctionReaction::setGridFunctionData(std::shared_ptr<mfem::ParGridFunction> &f) {
   f_ = f;
-  assert(f->Size() >= (comp + 1) * f->FESpace()->GetNDofs());
+  size_ = f->FESpace()->GetNDofs();
+  assert(comp < f->FESpace()->GetVDim() );
+  assert(f->FESpace()->GetOrdering() == mfem::Ordering::byNodes);
 #ifdef _GPU_
-  data = f_->Read() + comp * f_->FESpace()->GetNDofs();
+  data = f_->Read() + comp * size_;
 #else
-  data = f_->HostRead() + comp * f_->FESpace()->GetNDofs();
+  data = f_->HostRead() + comp * size_;
 #endif
 }
 
@@ -98,8 +100,10 @@ MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unu
                                                                      [[maybe_unused]] const double &T_e,
                                                                      const int &dofindex,
                                                                      [[maybe_unused]] const bool isElectronInvolved) {
-  if (data)
+  if (data) {
+    assert(dofindex < size_)
     return data[dofindex];
-  else
+  }
+  else 
     return 0.;
 }
diff --git a/src/reaction.hpp b/src/reaction.hpp
index daf0406ed..623312ca0 100644
--- a/src/reaction.hpp
+++ b/src/reaction.hpp
@@ -123,6 +123,7 @@ class GridFunctionReaction : public Reaction {
   std::shared_ptr<mfem::GridFunction> f_;
   const double *data;
   const int comp;
+  int size_;
 
  public:
   MFEM_HOST_DEVICE GridFunctionReaction(int comp);
diff --git a/src/tps-time-loop.py b/src/tps-time-loop.py
index 3038ac043..af2efc6c8 100755
--- a/src/tps-time-loop.py
+++ b/src/tps-time-loop.py
@@ -26,6 +26,9 @@ def fetch(self, interface):
         self.efield = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False)
         self.heavy_temperature = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
 
+        efieldAngularFreq = interface.EfieldAngularFreq()
+        print("Electric field angular frequency: ", efieldAngularFreq)
+
 
 
     def solve(self):
diff --git a/src/tps2Boltzmann.cpp b/src/tps2Boltzmann.cpp
index 9c13517e0..b2cceba63 100644
--- a/src/tps2Boltzmann.cpp
+++ b/src/tps2Boltzmann.cpp
@@ -92,7 +92,7 @@ Tps2Boltzmann::Tps2Boltzmann(Tps *tps)
   tps->getRequiredInput("em/current_frequency", EfieldAngularFreq_);
   EfieldAngularFreq_ *= 2. * M_PI;
 
-  tps->getInput("boltzmannInterface/save_to_paraview", save_to_paraview_dc);
+  save_to_paraview_dc = tps->getInput("boltzmannInterface/save_to_paraview", false);
 
   offsets.SetSize(NIndexes + 1);
   ncomps.SetSize(NIndexes + 1);

From e65671a2ca95d25f897c90262ac2b13c22f6efcb Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 8 Jan 2024 12:18:55 -0600
Subject: [PATCH 26/75] Fix compilation typos

---
 src/reaction.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/reaction.cpp b/src/reaction.cpp
index 6ca9355f6..533ee4b5e 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -88,7 +88,7 @@ void GridFunctionReaction::setGridFunctionData(std::shared_ptr<mfem::ParGridFunc
   f_ = f;
   size_ = f->FESpace()->GetNDofs();
   assert(comp < f->FESpace()->GetVDim() );
-  assert(f->FESpace()->GetOrdering() == mfem::Ordering::byNodes);
+  assert(f->FESpace()->GetOrdering() == mfem::Ordering::byNODES);
 #ifdef _GPU_
   data = f_->Read() + comp * size_;
 #else
@@ -101,7 +101,7 @@ MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unu
                                                                      const int &dofindex,
                                                                      [[maybe_unused]] const bool isElectronInvolved) {
   if (data) {
-    assert(dofindex < size_)
+    assert(dofindex < size_);
     return data[dofindex];
   }
   else 

From 5ac56172e3b27df5023191cf31a8b81712152af6 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 8 Jan 2024 12:20:56 -0600
Subject: [PATCH 27/75] make enforcestyle

---
 src/reaction.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/reaction.cpp b/src/reaction.cpp
index 533ee4b5e..7ae59707a 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -87,7 +87,7 @@ MFEM_HOST_DEVICE GridFunctionReaction::~GridFunctionReaction() {}
 void GridFunctionReaction::setGridFunctionData(std::shared_ptr<mfem::ParGridFunction> &f) {
   f_ = f;
   size_ = f->FESpace()->GetNDofs();
-  assert(comp < f->FESpace()->GetVDim() );
+  assert(comp < f->FESpace()->GetVDim());
   assert(f->FESpace()->GetOrdering() == mfem::Ordering::byNODES);
 #ifdef _GPU_
   data = f_->Read() + comp * size_;
@@ -103,7 +103,6 @@ MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unu
   if (data) {
     assert(dofindex < size_);
     return data[dofindex];
-  }
-  else 
+  } else
     return 0.;
 }

From bb2e7c8f0bdc47517113a546c5353edcdc068cde Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 8 Jan 2024 12:22:25 -0600
Subject: [PATCH 28/75] make style

---
 src/reaction.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/reaction.cpp b/src/reaction.cpp
index 7ae59707a..ac8affe82 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -103,6 +103,7 @@ MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unu
   if (data) {
     assert(dofindex < size_);
     return data[dofindex];
-  } else
+  } else {
     return 0.;
+  }
 }

From b15431237ac963e3dc834f83d23406a6b7682060 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 8 Jan 2024 14:57:02 -0600
Subject: [PATCH 29/75] Avoid compilation error on Tioga

---
 src/reaction.cpp | 4 ++--
 src/reaction.hpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/reaction.cpp b/src/reaction.cpp
index ac8affe82..ed2a0748f 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -79,10 +79,10 @@ MFEM_HOST_DEVICE double Tabulated::computeRateCoefficient(const double &T_h, con
   return table_->eval(temp);
 }
 
-MFEM_HOST_DEVICE GridFunctionReaction::GridFunctionReaction(int comp)
+GridFunctionReaction::GridFunctionReaction(int comp)
     : Reaction(GRIDFUNCTION_RXN), data(nullptr), comp(comp), size_(0) {}
 
-MFEM_HOST_DEVICE GridFunctionReaction::~GridFunctionReaction() {}
+GridFunctionReaction::~GridFunctionReaction() {}
 
 void GridFunctionReaction::setGridFunctionData(std::shared_ptr<mfem::ParGridFunction> &f) {
   f_ = f;
diff --git a/src/reaction.hpp b/src/reaction.hpp
index 623312ca0..3b3c27e73 100644
--- a/src/reaction.hpp
+++ b/src/reaction.hpp
@@ -126,9 +126,9 @@ class GridFunctionReaction : public Reaction {
   int size_;
 
  public:
-  MFEM_HOST_DEVICE GridFunctionReaction(int comp);
+  GridFunctionReaction(int comp);
 
-  MFEM_HOST_DEVICE virtual ~GridFunctionReaction();
+  virtual ~GridFunctionReaction();
 
   void setGridFunctionData(std::shared_ptr<mfem::ParGridFunction> &f);
 

From 1771a7968fd5f3da8cec8e89f9013f74852bdc84 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 8 Jan 2024 15:02:55 -0600
Subject: [PATCH 30/75] Avoid compilation error on Tioga

---
 src/chemistry.cpp | 2 +-
 src/chemistry.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/chemistry.cpp b/src/chemistry.cpp
index f233d2ed5..9f64284de 100644
--- a/src/chemistry.cpp
+++ b/src/chemistry.cpp
@@ -37,7 +37,7 @@ using namespace std;
 
 Chemistry::Chemistry(GasMixture *mixture, RunConfiguration &config) : Chemistry(mixture, config.chemistryInput) {}
 
-MFEM_HOST_DEVICE Chemistry::Chemistry(GasMixture *mixture, const ChemistryInput &inputs) : mixture_(mixture) {
+Chemistry::Chemistry(GasMixture *mixture, const ChemistryInput &inputs) : mixture_(mixture) {
   numEquations_ = mixture->GetNumEquations();
   numSpecies_ = mixture->GetNumSpecies();
   numActiveSpecies_ = mixture->GetNumActiveSpecies();
diff --git a/src/chemistry.hpp b/src/chemistry.hpp
index 0463f0f22..4c190a3f6 100644
--- a/src/chemistry.hpp
+++ b/src/chemistry.hpp
@@ -91,7 +91,7 @@ class Chemistry {
 
  public:
   Chemistry(GasMixture *mixture, RunConfiguration &config);
-  MFEM_HOST_DEVICE Chemistry(GasMixture *mixture, const ChemistryInput &inputs);
+  Chemistry(GasMixture *mixture, const ChemistryInput &inputs);
 
   MFEM_HOST_DEVICE ~Chemistry();
 

From e7d37558b9c29fc8c72a9a887373688f66b8e11d Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 8 Jan 2024 15:13:57 -0600
Subject: [PATCH 31/75] Restore MFEM_HOST_DEVICE in the
 constructor/destructor of Reaction/Chemistry

---
 src/chemistry.cpp | 2 +-
 src/chemistry.hpp | 2 +-
 src/reaction.cpp  | 4 ++--
 src/reaction.hpp  | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/chemistry.cpp b/src/chemistry.cpp
index 9f64284de..f233d2ed5 100644
--- a/src/chemistry.cpp
+++ b/src/chemistry.cpp
@@ -37,7 +37,7 @@ using namespace std;
 
 Chemistry::Chemistry(GasMixture *mixture, RunConfiguration &config) : Chemistry(mixture, config.chemistryInput) {}
 
-Chemistry::Chemistry(GasMixture *mixture, const ChemistryInput &inputs) : mixture_(mixture) {
+MFEM_HOST_DEVICE Chemistry::Chemistry(GasMixture *mixture, const ChemistryInput &inputs) : mixture_(mixture) {
   numEquations_ = mixture->GetNumEquations();
   numSpecies_ = mixture->GetNumSpecies();
   numActiveSpecies_ = mixture->GetNumActiveSpecies();
diff --git a/src/chemistry.hpp b/src/chemistry.hpp
index 4c190a3f6..0463f0f22 100644
--- a/src/chemistry.hpp
+++ b/src/chemistry.hpp
@@ -91,7 +91,7 @@ class Chemistry {
 
  public:
   Chemistry(GasMixture *mixture, RunConfiguration &config);
-  Chemistry(GasMixture *mixture, const ChemistryInput &inputs);
+  MFEM_HOST_DEVICE Chemistry(GasMixture *mixture, const ChemistryInput &inputs);
 
   MFEM_HOST_DEVICE ~Chemistry();
 
diff --git a/src/reaction.cpp b/src/reaction.cpp
index ed2a0748f..ac8affe82 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -79,10 +79,10 @@ MFEM_HOST_DEVICE double Tabulated::computeRateCoefficient(const double &T_h, con
   return table_->eval(temp);
 }
 
-GridFunctionReaction::GridFunctionReaction(int comp)
+MFEM_HOST_DEVICE GridFunctionReaction::GridFunctionReaction(int comp)
     : Reaction(GRIDFUNCTION_RXN), data(nullptr), comp(comp), size_(0) {}
 
-GridFunctionReaction::~GridFunctionReaction() {}
+MFEM_HOST_DEVICE GridFunctionReaction::~GridFunctionReaction() {}
 
 void GridFunctionReaction::setGridFunctionData(std::shared_ptr<mfem::ParGridFunction> &f) {
   f_ = f;
diff --git a/src/reaction.hpp b/src/reaction.hpp
index 3b3c27e73..623312ca0 100644
--- a/src/reaction.hpp
+++ b/src/reaction.hpp
@@ -126,9 +126,9 @@ class GridFunctionReaction : public Reaction {
   int size_;
 
  public:
-  GridFunctionReaction(int comp);
+  MFEM_HOST_DEVICE GridFunctionReaction(int comp);
 
-  virtual ~GridFunctionReaction();
+  MFEM_HOST_DEVICE virtual ~GridFunctionReaction();
 
   void setGridFunctionData(std::shared_ptr<mfem::ParGridFunction> &f);
 

From 65e892d900172a60d8d3499db7da62eebfea8715 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 8 Jan 2024 16:40:40 -0600
Subject: [PATCH 32/75] Fix incorrect allocation and setting of external rates
 data on device

---
 src/M2ulPhyS.hpp           |  3 +++
 src/M2ulPhyS2Boltzmann.cpp | 13 ++++++++++---
 src/chemistry.cpp          | 13 +++++++++++--
 src/chemistry.hpp          |  3 ++-
 src/gpu_constructor.cpp    | 11 +++++++++++
 src/gpu_constructor.hpp    |  4 ++++
 src/reaction.cpp           | 24 ++++++++++++++----------
 src/reaction.hpp           |  9 +++++----
 8 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/src/M2ulPhyS.hpp b/src/M2ulPhyS.hpp
index 934131487..345687302 100644
--- a/src/M2ulPhyS.hpp
+++ b/src/M2ulPhyS.hpp
@@ -250,6 +250,9 @@ class M2ulPhyS : public TPS::Solver {
   // ParNonlinearForm *gradUp_A;
   GradNonLinearForm *gradUp_A;
 
+  // Auxiliary grid function to store external reaction rates
+  std::unique_ptr<ParGridFunction> externalReactionRates;
+
   // Average handler
   Averaging *average;
 
diff --git a/src/M2ulPhyS2Boltzmann.cpp b/src/M2ulPhyS2Boltzmann.cpp
index 4fa594e5e..e54dfdb01 100644
--- a/src/M2ulPhyS2Boltzmann.cpp
+++ b/src/M2ulPhyS2Boltzmann.cpp
@@ -85,7 +85,14 @@ void M2ulPhyS::push(TPS::Tps2Boltzmann &interface) {
 
 void M2ulPhyS::fetch(TPS::Tps2Boltzmann &interface) {
   mfem::ParFiniteElementSpace *reaction_rates_fes(&(interface.NativeFes(TPS::Tps2Boltzmann::Index::ReactionRates)));
-  std::shared_ptr<mfem::ParGridFunction> reaction_rates(new mfem::ParGridFunction(reaction_rates_fes));
-  interface.interpolateToNativeFES(*reaction_rates, TPS::Tps2Boltzmann::Index::ReactionRates);
-  chemistry_->setGridFunctionRates(reaction_rates);
+  externalReactionRates.reset(new mfem::ParGridFunction(reaction_rates_fes));
+  interface.interpolateToNativeFES(*externalReactionRates, TPS::Tps2Boltzmann::Index::ReactionRates);
+  #if _GPU_
+    const double * data(externalReactionRates->Read() );
+    int size(externalReactionRates->FESpace()->GetNDofs() );
+    assert(externalReactionRates->FESpace()->GetOrdering() == mfem::Ordering::byNODES);
+    gpu::deviceSetChemistryReactionData<<<1, 1>>>(data, size, chemistry_);
+  #else
+    chemistry_->setGridFunctionRates(*externalReactionRates);
+  #endif
 }
diff --git a/src/chemistry.cpp b/src/chemistry.cpp
index f233d2ed5..57704f345 100644
--- a/src/chemistry.cpp
+++ b/src/chemistry.cpp
@@ -109,11 +109,20 @@ MFEM_HOST_DEVICE Chemistry::~Chemistry() {
   }
 }
 
-void Chemistry::setGridFunctionRates(std::shared_ptr<mfem::ParGridFunction> &f) {
+MFEM_HOST_DEVICE void Chemistry::setRates(const double * data, int size) {
   for (int r = 0; r < numReactions_; r++) {
     if (reactions_[r]->reactionModel == GRIDFUNCTION_RXN) {
       GridFunctionReaction *rx = dynamic_cast<GridFunctionReaction *>(reactions_[r]);
-      rx->setGridFunctionData(f);
+      rx->setData(data, size);
+    }
+  }
+}
+
+void Chemistry::setGridFunctionRates(mfem::GridFunction &f) {
+  for (int r = 0; r < numReactions_; r++) {
+    if (reactions_[r]->reactionModel == GRIDFUNCTION_RXN) {
+      GridFunctionReaction *rx = dynamic_cast<GridFunctionReaction *>(reactions_[r]);
+      rx->setGridFunction(f);
     }
   }
 }
diff --git a/src/chemistry.hpp b/src/chemistry.hpp
index 0463f0f22..906d249af 100644
--- a/src/chemistry.hpp
+++ b/src/chemistry.hpp
@@ -96,7 +96,8 @@ class Chemistry {
   MFEM_HOST_DEVICE ~Chemistry();
 
   // Set the grid function rates for GRIDFUNCTION_RXN reaction types
-  void setGridFunctionRates(std::shared_ptr<mfem::ParGridFunction> &f);
+  void setGridFunctionRates(mfem::GridFunction &f);
+  MFEM_HOST_DEVICE void setRates(const double * data, int size);
 
   // return Vector of reaction rate coefficients, with the size of numReaction_.
   // WARNING(marc) I have removed "virtual" qualifier here assuming these functions will not
diff --git a/src/gpu_constructor.cpp b/src/gpu_constructor.cpp
index c6143c51f..af49d98b6 100644
--- a/src/gpu_constructor.cpp
+++ b/src/gpu_constructor.cpp
@@ -121,5 +121,16 @@ __global__ void freeDeviceRadiation(Radiation *radiation) {
   if (radiation != NULL) radiation->~Radiation();
 }
 
+//---------------------------------------------------
+// And finally devise setters
+//---------------------------------------------------
+__global__ void deviceSetGridFunctionReactionData(const double * data, int size, GridFunctionReaction * reaction) {
+  reaction->setData(ata, size);
+}
+
+__global__ void deviceSetChemistryReactionData(const double * data, int size, Chemistry * chem) {
+  chem->setRates(data, size);
+}
+
 #endif  // cuda or hip
 }  // namespace gpu
diff --git a/src/gpu_constructor.hpp b/src/gpu_constructor.hpp
index ca4695383..64e98d34d 100644
--- a/src/gpu_constructor.hpp
+++ b/src/gpu_constructor.hpp
@@ -160,6 +160,10 @@ __global__ void freeDeviceChemistry(Chemistry *chem);
 //! Explicit call to Radiation destructor on the device
 __global__ void freeDeviceRadiation(Radiation *radiation);
 
+//! Set the data to a GridFunctionReaction
+__global__ void deviceSetGridFunctionReactionData(const double * data, int size, GridFunctionReaction * reaction);
+__global__ void deviceSetChemistryReactionData(const double * data, int size, Chemistry * chem);
+
 #endif  // cuda or hip
 }  // namespace gpu
 
diff --git a/src/reaction.cpp b/src/reaction.cpp
index ac8affe82..5abdd1512 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -80,19 +80,23 @@ MFEM_HOST_DEVICE double Tabulated::computeRateCoefficient(const double &T_h, con
 }
 
 MFEM_HOST_DEVICE GridFunctionReaction::GridFunctionReaction(int comp)
-    : Reaction(GRIDFUNCTION_RXN), data(nullptr), comp(comp), size_(0) {}
+    : Reaction(GRIDFUNCTION_RXN), data_(nullptr), comp_(comp), size_(0) {}
 
 MFEM_HOST_DEVICE GridFunctionReaction::~GridFunctionReaction() {}
 
-void GridFunctionReaction::setGridFunctionData(std::shared_ptr<mfem::ParGridFunction> &f) {
-  f_ = f;
-  size_ = f->FESpace()->GetNDofs();
-  assert(comp < f->FESpace()->GetVDim());
-  assert(f->FESpace()->GetOrdering() == mfem::Ordering::byNODES);
+MFEM_HOST_DEVICE void GridFunctionReaction::setData(const double * data, int size) {
+  data_ = data + comp_ * size_;
+  size_ = size;
+}
+
+void GridFunctionReaction::setGridFunction(const mfem::GridFunction & f) {
+  size_ = f.FESpace()->GetNDofs();
+  assert(comp_ < f.FESpace()->GetVDim());
+  assert(f.FESpace()->GetOrdering() == mfem::Ordering::byNODES);
 #ifdef _GPU_
-  data = f_->Read() + comp * size_;
+  data_ = f.Read() + comp_ * size_;
 #else
-  data = f_->HostRead() + comp * size_;
+  data_ = f.HostRead() + comp_ * size_;
 #endif
 }
 
@@ -100,9 +104,9 @@ MFEM_HOST_DEVICE double GridFunctionReaction::computeRateCoefficient([[maybe_unu
                                                                      [[maybe_unused]] const double &T_e,
                                                                      const int &dofindex,
                                                                      [[maybe_unused]] const bool isElectronInvolved) {
-  if (data) {
+  if (data_) {
     assert(dofindex < size_);
-    return data[dofindex];
+    return data_[dofindex];
   } else {
     return 0.;
   }
diff --git a/src/reaction.hpp b/src/reaction.hpp
index 623312ca0..c563e0dd8 100644
--- a/src/reaction.hpp
+++ b/src/reaction.hpp
@@ -120,9 +120,8 @@ class Tabulated : public Reaction {
 
 class GridFunctionReaction : public Reaction {
  private:
-  std::shared_ptr<mfem::GridFunction> f_;
-  const double *data;
-  const int comp;
+  const double *data_;
+  const int comp_;
   int size_;
 
  public:
@@ -130,7 +129,9 @@ class GridFunctionReaction : public Reaction {
 
   MFEM_HOST_DEVICE virtual ~GridFunctionReaction();
 
-  void setGridFunctionData(std::shared_ptr<mfem::ParGridFunction> &f);
+  void setGridFunction(const mfem::GridFunction & f);
+
+  MFEM_HOST_DEVICE void setData(const double * data, int size);
 
   MFEM_HOST_DEVICE virtual double computeRateCoefficient([[maybe_unused]] const double &T_h,
                                                          [[maybe_unused]] const double &T_e, const int &dofindex,

From 332e4e721ff7ecaf14062da9fec04899608c429f Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Tue, 9 Jan 2024 09:07:01 -0600
Subject: [PATCH 33/75] Fix compilation on device. No dynamic_cast

---
 src/chemistry.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chemistry.cpp b/src/chemistry.cpp
index 57704f345..fed7512d3 100644
--- a/src/chemistry.cpp
+++ b/src/chemistry.cpp
@@ -112,7 +112,7 @@ MFEM_HOST_DEVICE Chemistry::~Chemistry() {
 MFEM_HOST_DEVICE void Chemistry::setRates(const double * data, int size) {
   for (int r = 0; r < numReactions_; r++) {
     if (reactions_[r]->reactionModel == GRIDFUNCTION_RXN) {
-      GridFunctionReaction *rx = dynamic_cast<GridFunctionReaction *>(reactions_[r]);
+      GridFunctionReaction *rx = (GridFunctionReaction *)(reactions_[r]);
       rx->setData(data, size);
     }
   }

From 88856bc97aeab58995327990656f22d32e3cc3ff Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Tue, 9 Jan 2024 10:36:10 -0600
Subject: [PATCH 34/75] Fix variable name typo

---
 src/gpu_constructor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gpu_constructor.cpp b/src/gpu_constructor.cpp
index af49d98b6..6dbb5de1e 100644
--- a/src/gpu_constructor.cpp
+++ b/src/gpu_constructor.cpp
@@ -125,7 +125,7 @@ __global__ void freeDeviceRadiation(Radiation *radiation) {
 // And finally devise setters
 //---------------------------------------------------
 __global__ void deviceSetGridFunctionReactionData(const double * data, int size, GridFunctionReaction * reaction) {
-  reaction->setData(ata, size);
+  reaction->setData(data, size);
 }
 
 __global__ void deviceSetChemistryReactionData(const double * data, int size, Chemistry * chem) {

From d5733d5b303edeb914c13c2fa010ad6f56184edf Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Tue, 9 Jan 2024 16:28:02 -0600
Subject: [PATCH 35/75] Bugfix for gpu-cpu build

---
 src/M2ulPhyS2Boltzmann.cpp | 2 +-
 src/reaction.cpp           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/M2ulPhyS2Boltzmann.cpp b/src/M2ulPhyS2Boltzmann.cpp
index e54dfdb01..60c9f450b 100644
--- a/src/M2ulPhyS2Boltzmann.cpp
+++ b/src/M2ulPhyS2Boltzmann.cpp
@@ -87,7 +87,7 @@ void M2ulPhyS::fetch(TPS::Tps2Boltzmann &interface) {
   mfem::ParFiniteElementSpace *reaction_rates_fes(&(interface.NativeFes(TPS::Tps2Boltzmann::Index::ReactionRates)));
   externalReactionRates.reset(new mfem::ParGridFunction(reaction_rates_fes));
   interface.interpolateToNativeFES(*externalReactionRates, TPS::Tps2Boltzmann::Index::ReactionRates);
-  #if _GPU_
+  #if defined(_CUDA_) || defined(_HIP_)
     const double * data(externalReactionRates->Read() );
     int size(externalReactionRates->FESpace()->GetNDofs() );
     assert(externalReactionRates->FESpace()->GetOrdering() == mfem::Ordering::byNODES);
diff --git a/src/reaction.cpp b/src/reaction.cpp
index 5abdd1512..18efb6072 100644
--- a/src/reaction.cpp
+++ b/src/reaction.cpp
@@ -93,7 +93,7 @@ void GridFunctionReaction::setGridFunction(const mfem::GridFunction & f) {
   size_ = f.FESpace()->GetNDofs();
   assert(comp_ < f.FESpace()->GetVDim());
   assert(f.FESpace()->GetOrdering() == mfem::Ordering::byNODES);
-#ifdef _GPU_
+#if defined(_CUDA_) || defined(_HIP_)
   data_ = f.Read() + comp_ * size_;
 #else
   data_ = f.HostRead() + comp_ * size_;

From 7d5e17031474a8f9ca49821b3a6de050fd4fd70c Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Wed, 24 Jan 2024 08:59:23 -0600
Subject: [PATCH 36/75] code clean up for 2w-coupling, grid setup moved to a
 new function.

---
 src/tps-bte_0d3v.py | 595 +++++++++++++++++++++++---------------------
 1 file changed, 310 insertions(+), 285 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index ca6f2670f..732bd41a6 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -145,10 +145,8 @@ def __init__(self, tps, comm):
         self.comm : MPI.Comm  = comm
         self.param = BoltzmannSolverParams()
         # overide the default params, based on the config.ini file.
-        self.parse_config_file(sys.argv[2])
-        
+        self.__parse_config_file__(sys.argv[2])
         self.xp_module          = np
-        
         boltzmann_dir           = self.param.output_dir
         isExist = os.path.exists(boltzmann_dir)
         if not isExist:
@@ -163,10 +161,13 @@ def __init__(self, tps, comm):
         
         self.profile_tt = profile_tt
         self.profile_nn = profile_nn
+        
+        # how to map each grid to the GPU devices on the node
+        self.gidx_to_device_map = lambda gidx, num_grids : gidx % 2
 
         return
     
-    def parse_config_file(self, fname):
+    def __parse_config_file__(self, fname):
         """
         add the configuaraion file parse code here, 
         which overides the default BoltzmannSolverParams
@@ -209,7 +210,7 @@ def grid_setup(self, interface):
         where, at the moment the clustering is determined based on the electron temperature
         computed from the TPS code. 
         """
-        
+        assert self.xp_module==np, "grid setup only supported in CPU"
         self.profile_tt[pp.SETUP].start()
         
         xp                = self.xp_module
@@ -261,48 +262,113 @@ def grid_setup(self, interface):
         for grid_idx in range(self.param.n_grids):
             print("setting up grid %d"%(grid_idx), flush = True)
             self.bte_solver.assemble_operators(grid_idx)
+            
+        n_grids              = self.param.n_grids
+        gidx_to_device_map   = self.gidx_to_device_map
+
+        for grid_idx in range(n_grids):
+            assert self.grid_idx_to_npts[grid_idx] > 0
+
+            print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
+            f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
+            
+            print("setting grid %d to device %d"%(grid_idx, gidx_to_device_map(grid_idx, n_grids)))
+            self.bte_solver.host_to_device_setup(gidx_to_device_map(grid_idx, n_grids), grid_idx)
+            self.xp_module = cp
         
         self.profile_tt[pp.SETUP].stop()
         return
+
+    def __efield_setup__(self):
+        
+        """
+        Here we set the E-field for 1-way coupling
+        """
+        
+        use_gpu   = self.param.use_gpu
+        n_grids   = self.param.n_grids
+        xp        = self.xp_module
+        if(use_gpu==1):
+            def Et(t, grid_idx):
+                dev_id = self.gidx_to_device_map(grid_idx, n_grids)
+                with cp.cuda.Device(dev_id):
+                    eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                    eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+
+                    if self.param.Efreq == 0:
+                        return xp.sqrt(eRe_d**2 + eIm_d**2)
+                    else:
+                        return eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
+        else:
+            def Et(t, grid_idx):
+                eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+
+                if self.param.Efreq == 0:
+                    return xp.sqrt(eRe_d**2 + eIm_d**2)
+                else:
+                    return eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
+                            
+        for grid_idx in range(n_grids):
+            et = lambda t, gid=grid_idx: Et(t, gid)
+            self.bte_solver.set_efield_function(grid_idx, et)   
+        
+        return
         
     def fetch(self, interface):
-        xp                = self.xp_module
         gidx_to_pidx_map  = self.grid_idx_to_spatial_idx_map
         
-        heavy_temp        = xp.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+        heavy_temp        = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
         tps_npts          = len(heavy_temp)
         self.tps_npts     = tps_npts
         
-        electron_temp     = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
-        efield            = xp.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
-        species_densities = xp.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, tps_npts)
+        electron_temp     = np.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
+        efield            = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
+        species_densities = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, tps_npts)
+        n_grids           = self.param.n_grids 
+        use_gpu           = self.param.use_gpu
         
-        for grid_idx in range(self.param.n_grids):
+        for grid_idx in range(n_grids):
             bte_idx           = gidx_to_pidx_map[grid_idx]
+            dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
+            
             ni                = species_densities[TPSINDEX.ION_IDX][bte_idx]
             ne                = species_densities[TPSINDEX.ELE_IDX][bte_idx]
             n0                = species_densities[TPSINDEX.NEU_IDX][bte_idx]
             Tg                = heavy_temp[bte_idx]
             Te                = electron_temp[bte_idx]
-            
-            
             eRe               = efield[TPSINDEX.EF_RE_IDX][bte_idx]
             eIm               = efield[TPSINDEX.EF_IM_IDX][bte_idx]
+            
             eMag              = np.sqrt(eRe**2 + eIm **2)
             eByn0             = eMag/n0/self.param.Td_fac
-        
+            
             if self.param.verbose == 1 :
                 print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
                 print("Efreq = %.4E [1/s]" %(self.param.Efreq))
                 print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
                 
                 print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
-                print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg), np.max(Tg)))
-                print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te), np.max(Te)))
+                print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
+                print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te)   , np.max(Te)))
                 
-                print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne), np.max(ne)))
-                print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni), np.max(ni)))
-                print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0), np.max(n0)))
+                print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
+                print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni)   , np.max(ni)))
+                print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)   , np.max(n0)))
+            
+            if (use_gpu == 1):
+                with cp.cuda.Device(dev_id):
+                    ne        = cp.array(ne)
+                    ni        = cp.array(ni)
+                    n0        = cp.array(n0)
+                    Tg        = cp.array(Tg)
+                    Te        = cp.array(Te)
+                    eRe       = cp.array(eRe)
+                    eIm       = cp.array(eIm)
+                     
+                    eMag      = cp.sqrt(eRe**2 + eIm **2)
+                    eByn0     = eMag/n0/self.param.Td_fac
             
             #self.bte_solver.set_boltzmann_parameters(grid_idx, n0, ne, ni, Tg, self.param.solver_type)
             self.bte_solver.set_boltzmann_parameter(grid_idx, "n0", n0)
@@ -321,323 +387,282 @@ def solve(self):
         """
         
         if WITH_PARLA==1:
-            self.solve_with_parla()
+            self.solve_w_parla()
             return
         else:
-            self.solve_seq()
+            self.solve_wo_parla()
             return
         
-    def solve_seq(self):
-        xp               = self.xp_module
-        csv_write        = self.param.export_csv
-        gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
+    def solve_wo_parla(self):
+        xp                      = self.xp_module
+        csv_write               = self.param.export_csv
+        plot_data               = self.param.plot_data
+        gidx_to_pidx_map        = self.grid_idx_to_spatial_idx_map
+        use_gpu                 = self.param.use_gpu
+        dev_id                  = self.param.dev_id
+        verbose                 = self.param.verbose
+        n_grids                 = self.param.n_grids
+        gidx_to_device_map      = self.gidx_to_device_map
         
         self.qoi         = [None for grid_idx in range(self.param.n_grids)]
         self.ff          = [None for grid_idx in range(self.param.n_grids)]
         
-        if csv_write ==1 : 
+        if csv_write: 
             data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
         
-        t1 = time()
+        self.__efield_setup__()
         
-        for grid_idx in range(self.param.n_grids):
-            
-            if self.grid_idx_to_npts[grid_idx] ==0:
-                continue
-            
-            if self.param.verbose==1:
-                print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
-                f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")
-                self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
-            
-            if self.param.use_gpu==1:
-                dev_id   = self.param.dev_id
-                self.bte_solver.host_to_device_setup(dev_id, grid_idx)
-                
-                with cp.cuda.Device(dev_id):
-                    eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                    eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-
+        t1 = time()
+        for grid_idx in range(n_grids):
+            dev_id   = gidx_to_device_map(grid_idx, n_grids)
+            if (use_gpu==0):
+                try:
+                    eRe_d    = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                    eIm_d    = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
                     if self.param.Efreq == 0:
                         ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
                     else:
                         ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-                        
+                    
+                    self.bte_solver.set_efield_function(grid_idx, ef_t)
+                    
+                    f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
+                    ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+                    self.qoi[grid_idx] = qoi
+                    self.ff [grid_idx] = ff
+                except:
+                    print("solver failed for v-space gird no %d"%(grid_idx))
+                    sys.exit(0)
             else:
-                eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-            
-                if self.param.Efreq == 0:
-                    ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
-                else:
-                    ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-                            
-            self.bte_solver.set_efield_function(grid_idx, ef_t)            
-            f0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
-            try:
-                ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
-                self.qoi[grid_idx] = qoi
-                self.ff [grid_idx] = ff
-            except:
-                print("solver failed for v-space gird no %d"%(grid_idx))
-                # self.qoi.append(None)
-                # continue
-                sys.exit(0)
-            
-            if self.param.export_csv ==0 and self.param.plot_data==0:
-                continue
-            
-            ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
-            ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
-
-            if self.param.use_gpu==1:
-                self.bte_solver.device_to_host_setup(self.param.dev_id,grid_idx)
-                
-            with cp.cuda.Device(dev_id):
-                ff_r     = cp.asnumpy(ff_r)
-                for k, v in qoi.items():
-                    qoi[k] = cp.asnumpy(v)
+                with xp.cuda.Device(dev_id):
+                    f0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
+                    ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+                    self.qoi[grid_idx] = qoi
+                    self.ff [grid_idx] = ff
+                    # try:
+                    #     f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
+                    #     ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+                    #     self.qoi[grid_idx] = qoi
+                    #     self.ff [grid_idx] = ff
+                    # except:
+                    #     print("solver failed for v-space gird no %d"%(grid_idx))
+                    #     sys.exit(0)
                     
-            if csv_write==1:
-                data_csv[gidx_to_pidx_map[grid_idx], 0]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
-                data_csv[gidx_to_pidx_map[grid_idx], 1]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
-                data_csv[gidx_to_pidx_map[grid_idx], 2]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
-                data_csv[gidx_to_pidx_map[grid_idx], 3]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
-                data_csv[gidx_to_pidx_map[grid_idx], 4]    = np.sqrt(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")**2 + self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")**2)
-                data_csv[gidx_to_pidx_map[grid_idx], 5]    = qoi["energy"]
-                data_csv[gidx_to_pidx_map[grid_idx], 6]    = qoi["mobility"]
-                data_csv[gidx_to_pidx_map[grid_idx], 7]    = qoi["diffusion"]
+        t2 = time()
+        print("time for boltzmann v-space solve = %.4E"%(t2- t1))
+        
+        if (self.param.export_csv ==1 or self.param.plot_data==1):
+            for grid_idx in range(n_grids):
+                dev_id   = gidx_to_device_map(grid_idx, n_grids)
+                ff       = self.ff[grid_idx]
+                qoi      = self.qoi[grid_idx]
                 
-                for col_idx, g in enumerate(self.param.collisions):
-                    data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = qoi["rates"][col_idx]
-                    
-            plot_data    = self.param.plot_data
-            if plot_data:
+                def asnumpy(a):
+                    if cp.get_array_module(a)==cp:
+                        with cp.cuda.Device(dev_id):
+                            return cp.asnumpy(a)
+                    else:
+                        return a
                 
-                n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
-                ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
-                ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
-                Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                ff_cpu   = asnumpy(ff)
+                ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
+                ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff_cpu)
                 
-                eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+                n0    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "n0"))
+                ne    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ne"))
+                ni    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ni"))
+                Tg    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg"))
+                eRe   = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe"))
+                eIm   = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm"))
                 eMag  = np.sqrt(eRe**2 + eIm**2)
                 
-                num_sh       = len(self.bte_solver._par_lm[grid_idx])
-                num_subplots = num_sh 
-                num_plt_cols = min(num_sh, 4)
-                num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
-                fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
-                plt_idx      =  1
-                n_pts_step   =  self.grid_idx_to_npts[grid_idx] // 20
-
-                for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
-                    plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
-                    for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
-                        fr = np.abs(ff_r[ii, lm_idx, :])
-                        plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
+                if csv_write:
+                    data_csv[gidx_to_pidx_map[grid_idx], 0]    = n0
+                    data_csv[gidx_to_pidx_map[grid_idx], 1]    = ne
+                    data_csv[gidx_to_pidx_map[grid_idx], 2]    = ni
+                    data_csv[gidx_to_pidx_map[grid_idx], 3]    = Tg
+                    data_csv[gidx_to_pidx_map[grid_idx], 4]    = eMag
+                    data_csv[gidx_to_pidx_map[grid_idx], 5]    = asnumpy(qoi["energy"])
+                    data_csv[gidx_to_pidx_map[grid_idx], 6]    = asnumpy(qoi["mobility"])
+                    data_csv[gidx_to_pidx_map[grid_idx], 7]    = asnumpy(qoi["diffusion"])
                     
-                    plt.xlabel(r"energy (eV)")
-                    plt.ylabel(r"$f_%d$"%(lm[0]))
-                    plt.grid(visible=True)
-                    if lm_idx==0:
-                        plt.legend(prop={'size': 6})
+                    for col_idx, g in enumerate(self.param.collisions):
+                        data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = asnumpy(qoi["rates"][col_idx])
+
+                if plot_data:
+                    num_sh       = len(self.bte_solver._par_lm[grid_idx])
+                    num_subplots = num_sh 
+                    num_plt_cols = min(num_sh, 4)
+                    num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
+                    fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
+                    plt_idx      =  1
+                    n_pts_step   =  self.grid_idx_to_npts[grid_idx] // 20
+
+                    for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
+                        plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
+                        for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
+                            fr = np.abs(ff_r[ii, lm_idx, :])
+                            plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
                         
-                    plt_idx +=1
-                
-                #plt_idx = num_sh
-                plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
-                plt.close()
-        
-        t2 = time()
-        print("time for boltzmann v-space solve = %.4E"%(t2- t1))
-        
-        if csv_write:
-            fname    = self.param.out_fname
-            with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
-                writer = csv.writer(f,delimiter=',')
-                # write the header
-                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
-                for col_idx, g in enumerate(self.param.collisions):
-                    header.append(str(g))
-                
-                writer.writerow(header)
-                writer.writerows(data_csv)
+                        plt.xlabel(r"energy (eV)")
+                        plt.ylabel(r"$f_%d$"%(lm[0]))
+                        plt.grid(visible=True)
+                        if lm_idx==0:
+                            plt.legend(prop={'size': 6})
+                            
+                        plt_idx +=1
+                    
+                    plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
+                    plt.close()
+            
+            if csv_write:
+                fname    = self.param.out_fname
+                with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
+                    writer = csv.writer(f,delimiter=',')
+                    # write the header
+                    header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
+                    for col_idx, g in enumerate(self.param.collisions):
+                        header.append(str(g))
+                    
+                    writer.writerow(header)
+                    writer.writerows(data_csv)
 
         return
     
-    def solve_with_parla(self):
-        csv_write        = self.param.export_csv
-        gidx_to_pidx_map = self.grid_idx_to_spatial_idx_map
+    def solve_w_parla(self):
+        rank                    = self.comm.Get_rank()
+        npes                    = self.comm.Get_size()
+        xp                      = self.xp_module
+        csv_write               = self.param.export_csv
+        plot_data               = self.param.plot_data
+        gidx_to_pidx_map        = self.grid_idx_to_spatial_idx_map
+        use_gpu                 = self.param.use_gpu
+        dev_id                  = self.param.dev_id
+        verbose                 = self.param.verbose
+        n_grids                 = self.param.n_grids
+        gidx_to_device_map      = self.gidx_to_device_map
+        
         self.qoi         = [None for grid_idx in range(self.param.n_grids)]
         self.ff          = [None for grid_idx in range(self.param.n_grids)]
         
-        if csv_write ==1 : 
+        if csv_write: 
             data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
         
-        
-        rank = self.comm.Get_rank()
-        npes = self.comm.Get_size()
+        self.profile_tt[pp.SETUP].start()
+        self.__efield_setup__()
+        self.profile_tt[pp.SETUP].stop()
         
         with Parla():
             num_gpus         = len(gpu)
-            grid_to_device_map = lambda gidx : gidx % num_gpus
+            if (use_gpu==1):
+                parla_placement = [gpu(gidx_to_device_map(grid_idx,n_grids)) for grid_idx in range(n_grids)]
+            else:
+                parla_placement = [cpu for grid_idx in range(n_grids)]
+            print(parla_placement)
+            
             @spawn(placement=cpu, vcus=0)
             async def __main__():
-                self.profile_tt[pp.SETUP].start()
-                ts_0 = TaskSpace("T")
-                for grid_idx in range(self.param.n_grids):
-                    @spawn(ts_0[grid_idx], placement=[cpu], vcus=0.0)
-                    def t0():
-                        print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
-                        f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")
-                        self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
-                        
-                        if self.param.use_gpu == 1:
-                            dev_id  = grid_to_device_map(grid_idx)
-                            self.bte_solver.host_to_device_setup(dev_id, grid_idx)
-                            xp      = cp
-
-                            with cp.cuda.Device(dev_id):
-                                eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                                eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-            
-                                if self.param.Efreq == 0:
-                                    ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
-                                else:
-                                    ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-                        else:
-                            xp = np
-                            eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                            eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-        
-                            if self.param.Efreq == 0:
-                                ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
-                            else:
-                                ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-                                    
-                        self.bte_solver.set_efield_function(grid_idx, ef_t)
-                        return
-                
-                await ts_0
-                
-                self.profile_tt[pp.SETUP].stop()
-                if self.param.use_gpu==1:
-                    p1 = [gpu(grid_to_device_map(grid_idx)) for grid_idx in range(self.param.n_grids)]
-                else:
-                    p1 = [cpu for grid_idx in range(self.param.n_grids)]
-                
                 self.profile_tt[pp.SOLVE].start()
                 ts_1 = TaskSpace("T")
                 for grid_idx in range(self.param.n_grids):
-                    @spawn(ts_1[grid_idx], placement=[p1[grid_idx]], dependencies=ts_0[grid_idx], vcus=0.0)
+                    @spawn(ts_1[grid_idx], placement=[parla_placement[grid_idx]], vcus=0.0)
                     def t1():
                         f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
-                        print("[Boltzmann] %d / %d launching grid %d on %s"%(rank, npes, grid_idx, p1[grid_idx]))
+                        print("[Boltzmann] %d / %d launching grid %d on %s"%(rank, npes, grid_idx, parla_placement[grid_idx]))
                         try:
                             ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
                             self.ff[grid_idx]  = ff
                             self.qoi[grid_idx] = qoi
                         except:
                             print("solver failed for v-space gird no %d"%(grid_idx))
-                            # self.qoi.append(None)
-                            # continue
-                            sys.exit(0)
+                            sys.exit(-1)
                             
                 await ts_1
                 self.profile_tt[pp.SOLVE].stop()
         
-        
         t1 = min_mean_max(self.profile_tt[pp.SETUP].seconds, self.comm)
         t2 = min_mean_max(self.profile_tt[pp.SOLVE].seconds, self.comm)
         print("[Boltzmann] setup (min) = %.4E (s) setup (mean) = %.4E (s) setup (max) = %.4E (s)" % (t1[0],t1[1],t1[2]))
-        print("[Boltzmann] solve (min) = %.4E (s) solve (mean) = %.4E (s) solve (max) = %.4E (s)" % (t2[0],t2[1],t2[2]))        
-        if self.param.export_csv ==0 and self.param.plot_data==0:
-            return
+        print("[Boltzmann] solve (min) = %.4E (s) solve (mean) = %.4E (s) solve (max) = %.4E (s)" % (t2[0],t2[1],t2[2]))
         
-        for grid_idx in range(self.param.n_grids):
-            dev_id = grid_idx % num_gpus
-            
-            if self.param.use_gpu==1:
-                gpu_id = cp.cuda.Device(dev_id)
-                gpu_id.use()
-            
-            ff       = self.ff[grid_idx]
-            ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
-            ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff)
-
-            if self.param.use_gpu==1:
-                self.bte_solver.device_to_host_setup(self.param.dev_id,grid_idx)
+        if (self.param.export_csv ==1 or self.param.plot_data==1):
+            for grid_idx in range(n_grids):
+                dev_id   = gidx_to_device_map(grid_idx, n_grids)
+                ff       = self.ff[grid_idx]
+                qoi      = self.qoi[grid_idx]
                 
-                qoi = self.qoi[grid_idx]    
-                with cp.cuda.Device(dev_id):
-                    ff_r     = cp.asnumpy(ff_r)
-                    for k, v in qoi.items():
-                        qoi[k] = cp.asnumpy(v)
-                    
-            if csv_write==1:
-                data_csv[gidx_to_pidx_map[grid_idx], 0]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
-                data_csv[gidx_to_pidx_map[grid_idx], 1]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
-                data_csv[gidx_to_pidx_map[grid_idx], 2]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
-                data_csv[gidx_to_pidx_map[grid_idx], 3]    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
-                data_csv[gidx_to_pidx_map[grid_idx], 4]    = np.sqrt(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")**2 + self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")**2)
-                data_csv[gidx_to_pidx_map[grid_idx], 5]    = qoi["energy"]
-                data_csv[gidx_to_pidx_map[grid_idx], 6]    = qoi["mobility"]
-                data_csv[gidx_to_pidx_map[grid_idx], 7]    = qoi["diffusion"]
-                
-                for col_idx, g in enumerate(self.param.collisions):
-                    data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = qoi["rates"][col_idx]
-
-            plot_data    = self.param.plot_data
-            if plot_data:
+                def asnumpy(a):
+                    if cp.get_array_module(a)==cp:
+                        with cp.cuda.Device(dev_id):
+                            return cp.asnumpy(a)
+                    else:
+                        return a
                 
-                n0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "n0")
-                ne    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ne")
-                ni    = self.bte_solver.get_boltzmann_parameter(grid_idx, "ni")
-                Tg    = self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg")
+                ff_cpu   = asnumpy(ff)
+                ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
+                ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff_cpu)
                 
-                eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+                n0    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "n0"))
+                ne    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ne"))
+                ni    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ni"))
+                Tg    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg"))
+                eRe   = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe"))
+                eIm   = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm"))
                 eMag  = np.sqrt(eRe**2 + eIm**2)
                 
-                num_sh       = len(self.bte_solver._par_lm[grid_idx])
-                num_subplots = num_sh 
-                num_plt_cols = min(num_sh, 4)
-                num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
-                fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
-                plt_idx      =  1
-                n_pts_step   =  self.grid_idx_to_npts[grid_idx] // 20
-
-                for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
-                    plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
-                    for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
-                        fr = np.abs(ff_r[ii, lm_idx, :])
-                        plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
+                if csv_write:
+                    data_csv[gidx_to_pidx_map[grid_idx], 0]    = n0
+                    data_csv[gidx_to_pidx_map[grid_idx], 1]    = ne
+                    data_csv[gidx_to_pidx_map[grid_idx], 2]    = ni
+                    data_csv[gidx_to_pidx_map[grid_idx], 3]    = Tg
+                    data_csv[gidx_to_pidx_map[grid_idx], 4]    = eMag
+                    data_csv[gidx_to_pidx_map[grid_idx], 5]    = asnumpy(qoi["energy"])
+                    data_csv[gidx_to_pidx_map[grid_idx], 6]    = asnumpy(qoi["mobility"])
+                    data_csv[gidx_to_pidx_map[grid_idx], 7]    = asnumpy(qoi["diffusion"])
                     
-                    plt.xlabel(r"energy (eV)")
-                    plt.ylabel(r"$f_%d$"%(lm[0]))
-                    plt.grid(visible=True)
-                    if lm_idx==0:
-                        plt.legend(prop={'size': 6})
+                    for col_idx, g in enumerate(self.param.collisions):
+                        data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = asnumpy(qoi["rates"][col_idx])
+
+                if plot_data:
+                    num_sh       = len(self.bte_solver._par_lm[grid_idx])
+                    num_subplots = num_sh 
+                    num_plt_cols = min(num_sh, 4)
+                    num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
+                    fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
+                    plt_idx      =  1
+                    n_pts_step   =  self.grid_idx_to_npts[grid_idx] // 20
+
+                    for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
+                        plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
+                        for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
+                            fr = np.abs(ff_r[ii, lm_idx, :])
+                            plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
                         
-                    plt_idx +=1
-                
-                #plt_idx = num_sh
-                plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
-                plt.close()
-        
-        if csv_write:
-            fname    = self.param.out_fname
-            with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
-                writer = csv.writer(f,delimiter=',')
-                # write the header
-                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
-                for col_idx, g in enumerate(self.param.collisions):
-                    header.append(str(g))
-                
-                writer.writerow(header)
-                writer.writerows(data_csv)
-       
+                        plt.xlabel(r"energy (eV)")
+                        plt.ylabel(r"$f_%d$"%(lm[0]))
+                        plt.grid(visible=True)
+                        if lm_idx==0:
+                            plt.legend(prop={'size': 6})
+                            
+                        plt_idx +=1
+                    
+                    plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
+                    plt.close()
+            
+            if csv_write:
+                fname    = self.param.out_fname
+                with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
+                    writer = csv.writer(f,delimiter=',')
+                    # write the header
+                    header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
+                    for col_idx, g in enumerate(self.param.collisions):
+                        header.append(str(g))
+                    
+                    writer.writerow(header)
+                    writer.writerows(data_csv)
+
+        return
+    
     def push(self, interface):
         xp                = self.xp_module
         Te_bte            = xp.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
@@ -661,18 +686,18 @@ def push(self, interface):
             # here rr should be in the same ordering as the collision model prescribed to the Boltzmann solver. 
             rr_bte[gidx_to_pidx_map[grid_idx]] = rr[1]
         
-        rr_bte[rr_bte<0] = 0.0 
-        s0  = rate_tps_arr * n0 * ni
-        s1  = rate_tps_csc * n0 * ni
+        # rr_bte[rr_bte<0] = 0.0 
+        # s0  = rate_tps_arr * n0 * ni
+        # s1  = rate_tps_csc * n0 * ni
         
-        s2  = rr_bte       * n0 * ni
+        # s2  = rr_bte       * n0 * ni
         
-        # tau = 1e-2
-        # idx = s2 > tau
-        rate_bte[0][:]   =  0.0
-        rate_bte[1][:]   =  0.0
-        rate_bte[0]      = rr_bte
-        rate_bte[1]      = xp.abs(s2-s1)/xp.max(s2)
+        # # tau = 1e-2
+        # # idx = s2 > tau
+        # rate_bte[0][:]   =  0.0
+        # rate_bte[1][:]   =  0.0
+        # rate_bte[0]      = rr_bte
+        # rate_bte[1]      = xp.abs(s2-s1)/xp.max(s2)
         
         return 
         
@@ -707,8 +732,8 @@ def push(self, interface):
 boltzmann.grid_setup(interface)
 boltzmann.fetch(interface)
 boltzmann.solve()
-boltzmann.push(interface)
-tps.fetch(interface)
+# boltzmann.push(interface)
+# tps.fetch(interface)
 
 # while it < max_iters:
 #     tps.solveStep()

From 11e0733209bac159f9b3ce4f20785e3481b87508 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Sun, 28 Jan 2024 22:26:39 -0600
Subject: [PATCH 37/75] basic tps+ BTE coupling, main loop moved to Parla + MPI

---
 src/tps-bte_0d3v.py | 524 ++++++++++++++++++++++----------------------
 1 file changed, 265 insertions(+), 259 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 732bd41a6..021167d93 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -162,9 +162,12 @@ def __init__(self, tps, comm):
         self.profile_tt = profile_tt
         self.profile_nn = profile_nn
         
+        num_gpus_per_node = 1 
+        if self.param.use_gpu==1:
+            num_gpus_per_node = cp.cuda.runtime.getDeviceCount()
+        
         # how to map each grid to the GPU devices on the node
-        self.gidx_to_device_map = lambda gidx, num_grids : gidx % 2
-
+        self.gidx_to_device_map = lambda gidx, num_grids : gidx % num_gpus_per_node
         return
     
     def __parse_config_file__(self, fname):
@@ -270,129 +273,12 @@ def grid_setup(self, interface):
             assert self.grid_idx_to_npts[grid_idx] > 0
 
             print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
-            f0 = self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian")
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "f0", f0)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "f_mw", self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian"))
             
-            print("setting grid %d to device %d"%(grid_idx, gidx_to_device_map(grid_idx, n_grids)))
-            self.bte_solver.host_to_device_setup(gidx_to_device_map(grid_idx, n_grids), grid_idx)
-            self.xp_module = cp
         
         self.profile_tt[pp.SETUP].stop()
         return
 
-    def __efield_setup__(self):
-        
-        """
-        Here we set the E-field for 1-way coupling
-        """
-        
-        use_gpu   = self.param.use_gpu
-        n_grids   = self.param.n_grids
-        xp        = self.xp_module
-        if(use_gpu==1):
-            def Et(t, grid_idx):
-                dev_id = self.gidx_to_device_map(grid_idx, n_grids)
-                with cp.cuda.Device(dev_id):
-                    eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                    eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-
-                    if self.param.Efreq == 0:
-                        return xp.sqrt(eRe_d**2 + eIm_d**2)
-                    else:
-                        return eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-        else:
-            def Et(t, grid_idx):
-                eRe_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                eIm_d     = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-
-                if self.param.Efreq == 0:
-                    return xp.sqrt(eRe_d**2 + eIm_d**2)
-                else:
-                    return eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-                            
-        for grid_idx in range(n_grids):
-            et = lambda t, gid=grid_idx: Et(t, gid)
-            self.bte_solver.set_efield_function(grid_idx, et)   
-        
-        return
-        
-    def fetch(self, interface):
-        gidx_to_pidx_map  = self.grid_idx_to_spatial_idx_map
-        
-        heavy_temp        = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
-        tps_npts          = len(heavy_temp)
-        self.tps_npts     = tps_npts
-        
-        electron_temp     = np.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
-        efield            = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
-        species_densities = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, tps_npts)
-        n_grids           = self.param.n_grids 
-        use_gpu           = self.param.use_gpu
-        
-        for grid_idx in range(n_grids):
-            bte_idx           = gidx_to_pidx_map[grid_idx]
-            dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
-            
-            ni                = species_densities[TPSINDEX.ION_IDX][bte_idx]
-            ne                = species_densities[TPSINDEX.ELE_IDX][bte_idx]
-            n0                = species_densities[TPSINDEX.NEU_IDX][bte_idx]
-            Tg                = heavy_temp[bte_idx]
-            Te                = electron_temp[bte_idx]
-            eRe               = efield[TPSINDEX.EF_RE_IDX][bte_idx]
-            eIm               = efield[TPSINDEX.EF_IM_IDX][bte_idx]
-            
-            eMag              = np.sqrt(eRe**2 + eIm **2)
-            eByn0             = eMag/n0/self.param.Td_fac
-            
-            if self.param.verbose == 1 :
-                print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
-                print("Efreq = %.4E [1/s]" %(self.param.Efreq))
-                print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
-                
-                print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
-                print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
-                print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te)   , np.max(Te)))
-                
-                print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
-                print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni)   , np.max(ni)))
-                print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)   , np.max(n0)))
-            
-            if (use_gpu == 1):
-                with cp.cuda.Device(dev_id):
-                    ne        = cp.array(ne)
-                    ni        = cp.array(ni)
-                    n0        = cp.array(n0)
-                    Tg        = cp.array(Tg)
-                    Te        = cp.array(Te)
-                    eRe       = cp.array(eRe)
-                    eIm       = cp.array(eIm)
-                     
-                    eMag      = cp.sqrt(eRe**2 + eIm **2)
-                    eByn0     = eMag/n0/self.param.Td_fac
-            
-            #self.bte_solver.set_boltzmann_parameters(grid_idx, n0, ne, ni, Tg, self.param.solver_type)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "n0", n0)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "ne", ne)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "ni", ni)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "Tg", Tg)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", eRe)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", eRe)
-            
-        return        
-
-    def solve(self):
-        """
-        perform the BTE solve, supports both stead-state solution (static E-field) 
-        and time-periodic solutions for the oscillatory E-fields
-        """
-        
-        if WITH_PARLA==1:
-            self.solve_w_parla()
-            return
-        else:
-            self.solve_wo_parla()
-            return
-        
     def solve_wo_parla(self):
         xp                      = self.xp_module
         csv_write               = self.param.export_csv
@@ -410,43 +296,28 @@ def solve_wo_parla(self):
         if csv_write: 
             data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
         
-        self.__efield_setup__()
-        
         t1 = time()
         for grid_idx in range(n_grids):
             dev_id   = gidx_to_device_map(grid_idx, n_grids)
             if (use_gpu==0):
                 try:
-                    eRe_d    = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
-                    eIm_d    = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
-                    if self.param.Efreq == 0:
-                        ef_t = lambda t : xp.sqrt(eRe_d**2 + eIm_d**2)
-                    else:
-                        ef_t = lambda t : eRe_d * xp.cos(2 * xp.pi * self.param.Efreq * t) + eIm_d * xp.sin(2 * xp.pi * self.param.Efreq * t)
-                    
-                    self.bte_solver.set_efield_function(grid_idx, ef_t)
-                    
-                    f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
+                    f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
                     ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
                     self.qoi[grid_idx] = qoi
                     self.ff [grid_idx] = ff
                 except:
                     print("solver failed for v-space gird no %d"%(grid_idx))
-                    sys.exit(0)
+                    sys.exit(-1)
             else:
                 with xp.cuda.Device(dev_id):
-                    f0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
-                    ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
-                    self.qoi[grid_idx] = qoi
-                    self.ff [grid_idx] = ff
-                    # try:
-                    #     f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
-                    #     ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
-                    #     self.qoi[grid_idx] = qoi
-                    #     self.ff [grid_idx] = ff
-                    # except:
-                    #     print("solver failed for v-space gird no %d"%(grid_idx))
-                    #     sys.exit(0)
+                    try:
+                        f0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                        ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+                        self.qoi[grid_idx] = qoi
+                        self.ff [grid_idx] = ff
+                    except:
+                        print("solver failed for v-space gird no %d"%(grid_idx))
+                        sys.exit(-1)
                     
         t2 = time()
         print("time for boltzmann v-space solve = %.4E"%(t2- t1))
@@ -529,7 +400,157 @@ def asnumpy(a):
 
         return
     
-    def solve_w_parla(self):
+    async def fetch(self, interface):
+        gidx_to_pidx_map        = self.grid_idx_to_spatial_idx_map
+        heavy_temp              = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+        tps_npts                = len(heavy_temp)
+        self.tps_npts           = tps_npts
+        electron_temp           = np.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
+        efield                  = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
+        species_densities       = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, tps_npts)
+        
+        n_grids                 = self.param.n_grids 
+        use_gpu                 = self.param.use_gpu
+        n_grids                 = self.param.n_grids
+        
+        gidx_to_device_map      = self.gidx_to_device_map
+        
+        for grid_idx in range(n_grids):
+            bte_idx           = gidx_to_pidx_map[grid_idx]
+            dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
+            
+            ni                = species_densities[TPSINDEX.ION_IDX][bte_idx]
+            ne                = species_densities[TPSINDEX.ELE_IDX][bte_idx]
+            n0                = species_densities[TPSINDEX.NEU_IDX][bte_idx]
+            Tg                = heavy_temp[bte_idx]
+            Te                = electron_temp[bte_idx]
+            eRe               = efield[TPSINDEX.EF_RE_IDX][bte_idx]
+            eIm               = efield[TPSINDEX.EF_IM_IDX][bte_idx]
+            
+            eMag              = np.sqrt(eRe**2 + eIm **2)
+            eByn0             = eMag/n0/self.param.Td_fac
+            
+            if self.param.verbose == 1 :
+                print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
+                print("Efreq = %.4E [1/s]" %(self.param.Efreq))
+                print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
+                
+                print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
+                print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
+                print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te)   , np.max(Te)))
+                
+                print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
+                print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni)   , np.max(ni)))
+                print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)   , np.max(n0)))
+            
+            
+            if (use_gpu == 1):
+                with cp.cuda.Device(dev_id):
+                    ne        = cp.array(ne)
+                    ni        = cp.array(ni)
+                    n0        = cp.array(n0)
+                    Tg        = cp.array(Tg)
+                    Te        = cp.array(Te)
+                    eRe       = cp.array(eRe)
+                    eIm       = cp.array(eIm)
+                    
+                    eMag      = cp.sqrt(eRe**2 + eIm **2)
+                    eByn0     = eMag/n0/self.param.Td_fac
+            
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "ne" , ne)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "ni" , ni)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "Tg" , Tg)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", eRe)
+            self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", eIm)
+            self.bte_solver.set_boltzmann_parameter(grid_idx,  "E" , eMag)
+        
+            
+        return        
+
+    async def solve_init(self):
+        rank                    = self.comm.Get_rank()
+        npes                    = self.comm.Get_size()
+        n_grids                 = self.param.n_grids
+        gidx_to_device_map      = self.gidx_to_device_map
+        
+        ts = TaskSpace("T")
+        for grid_idx in range(self.param.n_grids):
+            @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
+            def t1():
+                dev_id = gidx_to_device_map(grid_idx, n_grids)
+                print("[%d/%d] setting grid %d to device %d"%(rank, npes, grid_idx, dev_id))
+                self.bte_solver.host_to_device_setup(dev_id, grid_idx)
+            
+        await ts
+        
+        def ts_op_setup(grid_idx):
+            xp                                      = self.xp_module 
+            f_mw                                    = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+            n_pts                                   = f_mw.shape[1]
+            Qmat                                    = self.bte_solver._op_qmat[grid_idx]
+            INr                                     = xp.eye(Qmat.shape[1])
+            self.bte_solver._op_imat_vx[grid_idx]   = xp.einsum("i,jk->ijk",xp.ones(n_pts), INr)
+            
+        if(self.param.use_gpu==1):
+            self.xp_module = cp
+            ts = TaskSpace("T")
+            
+            for grid_idx in range(self.param.n_grids):
+                dev_id = gidx_to_device_map(grid_idx, n_grids)
+                @spawn(ts[grid_idx], placement=[gpu(dev_id)], vcus=0.0)
+                def t1():
+                    ts_op_setup(grid_idx)
+                    f_mw = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw)
+                    
+            
+            await ts
+        else:
+            self.xp_module = np
+            ts = TaskSpace("T")
+            for grid_idx in range(self.param.n_grids):
+                @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
+                def t1():
+                    ts_op_setup(grid_idx)
+                    f_mw = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw)
+            
+            await ts
+        
+        return
+            
+    async def solve_step(self, time, delta_t):
+        """
+        perform a single timestep in 0d-BTE
+        """
+        rank                    = self.comm.Get_rank()
+        npes                    = self.comm.Get_size()
+        # xp                      = self.xp_module
+        # csv_write               = self.param.export_csv
+        # plot_data               = self.param.plot_data
+        # gidx_to_pidx_map        = self.grid_idx_to_spatial_idx_map
+        # use_gpu                 = self.param.use_gpu
+        # dev_id                  = self.param.dev_id
+        # verbose                 = self.param.verbose
+        n_grids                 = self.param.n_grids
+        gidx_to_device_map      = self.gidx_to_device_map
+        
+        ts = TaskSpace("T")
+        for grid_idx in range(n_grids):
+            @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+            def t1():
+                u0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
+                v     = self.bte_solver.step(grid_idx, u0, self.param.atol, self.param.rtol, self.param.max_iter, time, delta_t)
+                self.bte_solver.set_boltzmann_parameter(grid_idx, "u1", v)
+        await ts
+        
+        return 
+    
+    async def solve(self):
+        """
+        Can be used to compute steady-state or cycle averaged BTE solutions
+        """
         rank                    = self.comm.Get_rank()
         npes                    = self.comm.Get_size()
         xp                      = self.xp_module
@@ -542,43 +563,36 @@ def solve_w_parla(self):
         n_grids                 = self.param.n_grids
         gidx_to_device_map      = self.gidx_to_device_map
         
-        self.qoi         = [None for grid_idx in range(self.param.n_grids)]
-        self.ff          = [None for grid_idx in range(self.param.n_grids)]
+        self.qoi                = [None for grid_idx in range(self.param.n_grids)]
+        self.ff                 = [None for grid_idx in range(self.param.n_grids)]
+        num_gpus                = len(gpu)
         
+        if (use_gpu==1):
+            parla_placement = [gpu(gidx_to_device_map(grid_idx,n_grids)) for grid_idx in range(n_grids)]
+        else:
+            parla_placement = [cpu for grid_idx in range(n_grids)]
+
         if csv_write: 
             data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
-        
-        self.profile_tt[pp.SETUP].start()
-        self.__efield_setup__()
-        self.profile_tt[pp.SETUP].stop()
-        
-        with Parla():
-            num_gpus         = len(gpu)
-            if (use_gpu==1):
-                parla_placement = [gpu(gidx_to_device_map(grid_idx,n_grids)) for grid_idx in range(n_grids)]
-            else:
-                parla_placement = [cpu for grid_idx in range(n_grids)]
-            print(parla_placement)
             
-            @spawn(placement=cpu, vcus=0)
-            async def __main__():
-                self.profile_tt[pp.SOLVE].start()
-                ts_1 = TaskSpace("T")
-                for grid_idx in range(self.param.n_grids):
-                    @spawn(ts_1[grid_idx], placement=[parla_placement[grid_idx]], vcus=0.0)
-                    def t1():
-                        f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f0")
-                        print("[Boltzmann] %d / %d launching grid %d on %s"%(rank, npes, grid_idx, parla_placement[grid_idx]))
-                        try:
-                            ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
-                            self.ff[grid_idx]  = ff
-                            self.qoi[grid_idx] = qoi
-                        except:
-                            print("solver failed for v-space gird no %d"%(grid_idx))
-                            sys.exit(-1)
-                            
-                await ts_1
-                self.profile_tt[pp.SOLVE].stop()
+        self.profile_tt[pp.SOLVE].start()
+        ts = TaskSpace("T")
+        for grid_idx in range(self.param.n_grids):
+            @spawn(ts[grid_idx], placement=[parla_placement[grid_idx]], vcus=0.0)
+            def t1():
+                try:
+                    print("[Boltzmann] %d / %d launching grid %d on %s"%(rank, npes, grid_idx, parla_placement[grid_idx]))
+                    f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                    ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+                    self.ff[grid_idx]  = ff
+                    self.qoi[grid_idx] = qoi
+                except:
+                    print("solver failed for v-space gird no %d"%(grid_idx))
+                    sys.exit(-1)
+                    
+        await ts
+        self.profile_tt[pp.SOLVE].stop()
+        
         
         t1 = min_mean_max(self.profile_tt[pp.SETUP].seconds, self.comm)
         t2 = min_mean_max(self.profile_tt[pp.SOLVE].seconds, self.comm)
@@ -663,90 +677,82 @@ def asnumpy(a):
 
         return
     
-    def push(self, interface):
-        xp                = self.xp_module
-        Te_bte            = xp.array(interface.HostWrite(libtps.t2bIndex.ElectronTemperature), copy=False)
-        rate_bte          = xp.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False).reshape((2, self.tps_npts))
-        Te_tps            = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
-        
-        species_densities = xp.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, self.tps_npts)
-        ni                = species_densities[TPSINDEX.ION_IDX]
-        n0                = species_densities[TPSINDEX.NEU_IDX]
-        ne                = species_densities[TPSINDEX.ELE_IDX]
-        
-        rate_tps_arr      = r_arr(Te_tps)
-        rate_tps_csc      = r_csc(Te_tps)
-        
-        rr_bte            = xp.zeros_like(rate_tps_arr) 
-        gidx_to_pidx_map  = self.grid_idx_to_spatial_idx_map
-        
-        for grid_idx in range(self.param.n_grids):
-            Te_bte[gidx_to_pidx_map[grid_idx]]        = (self.qoi[grid_idx]["energy"]/1.5) * self.param.ev_to_K
-            rr                                        = self.qoi[grid_idx]["rates"]
-            # here rr should be in the same ordering as the collision model prescribed to the Boltzmann solver. 
-            rr_bte[gidx_to_pidx_map[grid_idx]] = rr[1]
-        
-        # rr_bte[rr_bte<0] = 0.0 
-        # s0  = rate_tps_arr * n0 * ni
-        # s1  = rate_tps_csc * n0 * ni
+    async def push(self, interface):
+        xp                      = self.xp_module
+        n_grids                 = self.param.n_grids
+        gidx_to_device_map      = self.gidx_to_device_map
+        gidx_to_pidx_map        = self.grid_idx_to_spatial_idx_map
         
-        # s2  = rr_bte       * n0 * ni
+        heavy_temp  = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+        tps_npts    = len(heavy_temp)
         
-        # # tau = 1e-2
-        # # idx = s2 > tau
-        # rate_bte[0][:]   =  0.0
-        # rate_bte[1][:]   =  0.0
-        # rate_bte[0]      = rr_bte
-        # rate_bte[1]      = xp.abs(s2-s1)/xp.max(s2)
+        n_reactions = interface.nComponents(libtps.t2bIndex.ReactionRates)
+        rates       = np.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False).reshape((n_reactions, tps_npts))
         
+        if(n_reactions>0):
+            ts = TaskSpace("T")
+            for grid_idx in range(n_grids):
+                @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+                def t1():
+                    qA       = boltzmann.bte_solver._op_diag_dg[grid_idx]
+                    u0       = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
+                    
+                    h_curr   = xp.dot(qA, u0)
+                    h_curr   = boltzmann.bte_solver.normalized_distribution(grid_idx, h_curr)
+                    qoi      = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
+                    
+                    rates[0][gidx_to_pidx_map[grid_idx]] = xp.asnumpy(qoi["rates"][1])
+                    
+                    
+            await ts
         return 
         
-
-
-
-
-comm = MPI.COMM_WORLD
-# TPS solver
-tps = libtps.Tps(comm)
-
-tps.parseCommandLineArgs(sys.argv)
-tps.parseInput()
-tps.chooseDevices()
-tps.chooseSolver()
-tps.initialize()
-
-boltzmann = Boltzmann0D2VBactchedSolver(tps, comm)
-
-interface = libtps.Tps2Boltzmann(tps)
-tps.initInterface(interface)
-
-coords = np.array(interface.HostReadSpatialCoordinates(), copy=False)
-print(coords.shape)
-
-it = 0
-max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
-print("Max Iters: ", max_iters)
-tps.solveBegin()
-tps.solveStep()
-tps.push(interface)
-boltzmann.grid_setup(interface)
-boltzmann.fetch(interface)
-boltzmann.solve()
-# boltzmann.push(interface)
-# tps.fetch(interface)
-
-# while it < max_iters:
-#     tps.solveStep()
-#     tps.push(interface)
-#     boltzmann.fetch(interface)
-#     boltzmann.solve()
-#     boltzmann.push(interface)
-#     tps.fetch(interface)
+if __name__=="__main__":
+    comm = MPI.COMM_WORLD
     
-#     it = it+1
-#     print("it, ", it)
-
-tps.solveEnd()
-
+    with Parla():
+        # TPS solver
+        tps = libtps.Tps(comm)
+        tps.parseCommandLineArgs(sys.argv)
+        tps.parseInput()
+        tps.chooseDevices()
+        tps.chooseSolver()
+        tps.initialize()
+
+        boltzmann = Boltzmann0D2VBactchedSolver(tps, comm)
+        interface = libtps.Tps2Boltzmann(tps)
+        tps.initInterface(interface)
+
+        #coords = np.array(interface.HostReadSpatialCoordinates(), copy=False)
+        tps.solveBegin()
+        tps.push(interface)
+        boltzmann.grid_setup(interface)
+        
+        max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
+        iter      = 0
+        tt        = 0
+        dt        = 1e-3 /boltzmann.param.Efreq
+    
+        @spawn(placement=cpu, vcus=0)
+        async def __main__():
+            await boltzmann.solve_init()    
+            xp = boltzmann.bte_solver.xp_module
+    
+            
+            while (iter<max_iters):
+                tps.solveStep()
+                
+                tps.push(interface)
+                await boltzmann.fetch(interface)
+                
+                await boltzmann.solve_step(tt, dt)
+                for grid_idx in range(boltzmann.param.n_grids):
+                    u1 = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1")
+                    boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u0", u1)
+                
+                await boltzmann.push(interface)
+                tps.fetch(interface)
+        
 
-sys.exit (tps.getStatus())
+    tps.solveEnd()
+    sys.exit (tps.getStatus())
\ No newline at end of file

From 5bdcaf7327243897db6a6fd3d3e2389cda8d2a66 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Mon, 29 Jan 2024 09:34:59 -0600
Subject: [PATCH 38/75] timing results for BTE and TPS steps added.

---
 src/tps-bte_0d3v.py | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 021167d93..f485e809e 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -430,18 +430,18 @@ async def fetch(self, interface):
             eMag              = np.sqrt(eRe**2 + eIm **2)
             eByn0             = eMag/n0/self.param.Td_fac
             
-            if self.param.verbose == 1 :
-                print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
-                print("Efreq = %.4E [1/s]" %(self.param.Efreq))
-                print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
+            # if self.param.verbose == 1 :
+            #     print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
+            #     print("Efreq = %.4E [1/s]" %(self.param.Efreq))
+            #     print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
                 
-                print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
-                print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
-                print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te)   , np.max(Te)))
+            #     print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
+            #     print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
+            #     print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te)   , np.max(Te)))
                 
-                print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
-                print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni)   , np.max(ni)))
-                print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)   , np.max(n0)))
+            #     print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
+            #     print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni)   , np.max(ni)))
+            #     print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)   , np.max(n0)))
             
             
             if (use_gpu == 1):
@@ -728,30 +728,40 @@ def t1():
         tps.push(interface)
         boltzmann.grid_setup(interface)
         
-        max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
-        iter      = 0
-        tt        = 0
-        dt        = 1e-3 /boltzmann.param.Efreq
-    
         @spawn(placement=cpu, vcus=0)
         async def __main__():
             await boltzmann.solve_init()    
             xp = boltzmann.bte_solver.xp_module
-    
+
+            max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
+            iter      = 0
+            tt        = 0
+            tau       = (1/boltzmann.param.Efreq)
+            dt        = 5e-3 * tau
             
             while (iter<max_iters):
+                t1 = time()
                 tps.solveStep()
+                t2 = time()
+                
+                t1 = min_mean_max(t2-t1, comm)
+                print("[TPS] simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tt/tau, t1[0],t1[1],t1[2]))
                 
                 tps.push(interface)
                 await boltzmann.fetch(interface)
                 
+                t1 = time()
                 await boltzmann.solve_step(tt, dt)
                 for grid_idx in range(boltzmann.param.n_grids):
                     u1 = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1")
                     boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u0", u1)
+                t2 = time()
+                t1 = min_mean_max(t2-t1, comm)
+                print("[BTE] simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tt/tau, t1[0],t1[1],t1[2]))
                 
                 await boltzmann.push(interface)
                 tps.fetch(interface)
+                tt += dt
         
 
     tps.solveEnd()

From d7fbc5aae106cd33493cb296d4abf85bc554ed27 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 29 Jan 2024 09:57:46 -0600
Subject: [PATCH 39/75] Export timestep and currentTime

---
 src/M2ulPhyS2Boltzmann.cpp | 3 +++
 src/tps2Boltzmann.cpp      | 2 ++
 src/tps2Boltzmann.hpp      | 8 ++++++++
 3 files changed, 13 insertions(+)

diff --git a/src/M2ulPhyS2Boltzmann.cpp b/src/M2ulPhyS2Boltzmann.cpp
index 60c9f450b..a8aba27b6 100644
--- a/src/M2ulPhyS2Boltzmann.cpp
+++ b/src/M2ulPhyS2Boltzmann.cpp
@@ -78,6 +78,9 @@ void M2ulPhyS::push(TPS::Tps2Boltzmann &interface) {
   interface.interpolateFromNativeFES(*heavyTemperature, TPS::Tps2Boltzmann::Index::HeavyTemperature);
   interface.interpolateFromNativeFES(*electronTemperature, TPS::Tps2Boltzmann::Index::ElectronTemperature);
 
+  interface.setTimeStep(this->dt);
+  interface.setCurrentTime(this->time);
+
   delete species;
   delete heavyTemperature;
   delete electronTemperature;
diff --git a/src/tps2Boltzmann.cpp b/src/tps2Boltzmann.cpp
index b2cceba63..76f97c28c 100644
--- a/src/tps2Boltzmann.cpp
+++ b/src/tps2Boltzmann.cpp
@@ -342,6 +342,8 @@ void tps2bolzmann(py::module &m) {
              return std::unique_ptr<TPS::CPUData>(new TPS::CPUData(interface.Field(index), true));
            })
       .def("EfieldAngularFreq", &TPS::Tps2Boltzmann::EfieldAngularFreq)
+      .def("timeStep", &TPS::Tps2Boltzmann::timeStep)
+      .def("currentTime", &TPS::Tps2Boltzmann::currentTime)
       .def("Nspecies", &TPS::Tps2Boltzmann::Nspecies)
       .def("NeFiledComps", &TPS::Tps2Boltzmann::NeFieldComps)
       .def("nComponents", &TPS::Tps2Boltzmann::nComponents)
diff --git a/src/tps2Boltzmann.hpp b/src/tps2Boltzmann.hpp
index 82d4e8ee7..301d9ada5 100644
--- a/src/tps2Boltzmann.hpp
+++ b/src/tps2Boltzmann.hpp
@@ -116,6 +116,12 @@ class Tps2Boltzmann {
   int nComponents(Index index) const { return ncomps[index]; }
   std::string getReactionEquation(int index) const { return reaction_eqs_[index]; }
 
+  void setTimeStep(double dt) { timestep_=dt; }
+  void setCurrentTime(double time) { currentTime_=time; }
+
+  double timeStep() const { return timestep_; }
+  double currentTime() const { return currentTime_; }
+
   void saveDataCollection(int cycle, double time);
 
   ~Tps2Boltzmann();
@@ -161,6 +167,8 @@ class Tps2Boltzmann {
   mfem::ParGridFunction *spatial_coordinates_;
 
   double EfieldAngularFreq_;
+  double timestep_;
+  double currentTime_;
 
   bool save_to_paraview_dc;
   mfem::ParaViewDataCollection *paraview_dc;

From ce6c857badfc1b116cbfb72dfade6a3eab8a7219 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 29 Jan 2024 10:34:39 -0600
Subject: [PATCH 40/75] Make style

---
 src/tps2Boltzmann.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tps2Boltzmann.hpp b/src/tps2Boltzmann.hpp
index 301d9ada5..a0d041492 100644
--- a/src/tps2Boltzmann.hpp
+++ b/src/tps2Boltzmann.hpp
@@ -116,8 +116,8 @@ class Tps2Boltzmann {
   int nComponents(Index index) const { return ncomps[index]; }
   std::string getReactionEquation(int index) const { return reaction_eqs_[index]; }
 
-  void setTimeStep(double dt) { timestep_=dt; }
-  void setCurrentTime(double time) { currentTime_=time; }
+  void setTimeStep(double dt) { timestep_ = dt; }
+  void setCurrentTime(double time) { currentTime_ = time; }
 
   double timeStep() const { return timestep_; }
   double currentTime() const { return currentTime_; }

From b6f246083c52382856fdbba2459f7f69cc463b22 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Tue, 6 Feb 2024 10:22:26 -0600
Subject: [PATCH 41/75] basic tps + 0d bte 2-way coupling with parla

---
 src/tps-bte_0d3v.py | 131 +++++++++++++++++++++++++++++++-------------
 1 file changed, 93 insertions(+), 38 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index f485e809e..123cff0a8 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -12,7 +12,7 @@
 import enum
 import pandas as pd
 import scipy.interpolate
-
+import scipy.cluster
 class profile_t:
     def __init__(self,name):
         self.name = name
@@ -217,32 +217,43 @@ def grid_setup(self, interface):
         self.profile_tt[pp.SETUP].start()
         
         xp                = self.xp_module
+        n_grids           = self.param.n_grids
         Te                = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
-        Te_min, Te_max    = xp.min(Te), xp.max(Te)
-        Te_b              = xp.linspace(Te_min, Te_max, self.param.n_grids, endpoint=False)
-        dist_mat          = xp.zeros((len(Te), self.param.n_grids))
-        
-        for iter in range(50):
-            #print("clustering iteration ", iter, Te_b)
-            for i in range(self.param.n_grids):
-                dist_mat[:,i] = xp.abs(Te-Te_b[i])
+        # Te_min, Te_max    = xp.min(Te), xp.max(Te)
+        # Te_b              = xp.linspace(Te_min, Te_max, self.param.n_grids, endpoint=False)
+        # dist_mat          = xp.zeros((len(Te), self.param.n_grids))
+        
+        # #scipy.cluster.vq.kmeans(scipy.cluster.vq.whiten(Te), Te_b, ) 
+        
+        # for iter in range(50):
+        #     #print("clustering iteration ", iter, Te_b)
+        #     for i in range(self.param.n_grids):
+        #         dist_mat[:,i] = xp.abs(Te-Te_b[i])
             
-            membership = xp.argmin(dist_mat, axis=1)
-            Te_b1      = np.array([np.mean(Te[xp.argwhere(membership==i)[:,0]]) for i in range(self.param.n_grids)])
-            rel_error  = np.max(np.abs(1 - Te_b1/Te_b))
-            Te_b       = Te_b1
+        #     membership = xp.argmin(dist_mat, axis=1)
+        #     Te_b1      = np.array([np.mean(Te[xp.argwhere(membership==i)[:,0]]) for i in range(self.param.n_grids)])
+        #     rel_error  = np.max(np.abs(1 - Te_b1/Te_b))
+        #     Te_b       = Te_b1
            
-            if rel_error < 1e-4:
-                break
-        Te_b = np.sort(Te_b)
+        #     if rel_error < 1e-4:
+        #         break
+        # Te_b = np.sort(Te_b)
+        # print("K-means Te clusters ", Te_b)                
+        # for i in range(self.param.n_grids):
+        #     dist_mat[:,i] = xp.abs(Te-Te_b[i])
+        
+        Tew           = scipy.cluster.vq.whiten(Te)
+        Tecw          = scipy.cluster.vq.kmeans(Tew, np.linspace(np.min(Tew), np.max(Tew), n_grids), iter=1000, thresh=1e-8)[0]
+        Te_b          = Tecw * np.std(Te, axis=0)
+        dist_mat      = xp.zeros((len(Te),n_grids))
+        
         print("K-means Te clusters ", Te_b)                
         for i in range(self.param.n_grids):
-            dist_mat[:,i] = xp.abs(Te-Te_b[i])
+            dist_mat[:,i] = xp.abs(Tew-Tecw[i])
         
         membership = xp.argmin(dist_mat, axis=1)
         grid_idx_to_spatial_pts_map = list()
         for b_idx in range(self.param.n_grids):
-            #grid_idx_to_spatial_pts_map.append(xp.argwhere(xp.logical_and(Te>= Te_b[b_idx], Te < Te_b[b_idx+1]))[:,0]) 
             grid_idx_to_spatial_pts_map.append(xp.argwhere(membership==b_idx)[:,0]) 
         
         np.save("%s_gidx_to_pidx.npy"%(self.param.out_fname), np.array(grid_idx_to_spatial_pts_map, dtype=object), allow_pickle=True)
@@ -256,7 +267,7 @@ def grid_setup(self, interface):
         Te                               = xp.array([Te_b[b_idx]  for b_idx in range(self.param.n_grids)]) # xp.ones(self.param.n_grids) * self.param.Te 
         vth                              = np.sqrt(2* self.param.kB * Te * self.param.ev_to_K  /self.param.me)
         ev_max                           = (6 * vth / self.param.c_gamma)**2 
-        self.bte_solver                  = BoltzmannSolver(self.param, ev_max ,Te , nr, lm_modes, self.param.n_grids, self.param.collisions)
+        self.bte_solver                  = BoltzmannSolver(self.param, ev_max , Te , nr, lm_modes, self.param.n_grids, self.param.collisions)
 
         if self.param.verbose==1:
             print("grid energy max (eV) \n", ev_max, flush = True)
@@ -409,6 +420,15 @@ async def fetch(self, interface):
         efield                  = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
         species_densities       = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, tps_npts)
         
+        # np.save("n0.npy", species_densities[TPSINDEX.NEU_IDX])
+        # np.save("ne.npy", species_densities[TPSINDEX.ELE_IDX])
+        # np.save("ni.npy", species_densities[TPSINDEX.ION_IDX])
+        
+        # np.save("Te.npy", heavy_temp)
+        # np.save("Tg.npy", heavy_temp)
+        # np.save("E.npy" , np.sqrt(efield[0]**2 + efield[1]**2))
+        # sys.exit(-1)
+        
         n_grids                 = self.param.n_grids 
         use_gpu                 = self.param.use_gpu
         n_grids                 = self.param.n_grids
@@ -430,18 +450,18 @@ async def fetch(self, interface):
             eMag              = np.sqrt(eRe**2 + eIm **2)
             eByn0             = eMag/n0/self.param.Td_fac
             
-            # if self.param.verbose == 1 :
-            #     print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
-            #     print("Efreq = %.4E [1/s]" %(self.param.Efreq))
-            #     print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
+            if self.param.verbose == 1 :
+                print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
+                print("Efreq = %.4E [1/s]" %(self.param.Efreq))
+                print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
                 
-            #     print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
+                print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
             #     print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
             #     print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te)   , np.max(Te)))
                 
             #     print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
-            #     print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni)   , np.max(ni)))
-            #     print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)   , np.max(n0)))
+                print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni)   , np.max(ni)))
+                print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)   , np.max(n0)))
             
             
             if (use_gpu == 1):
@@ -695,7 +715,7 @@ async def push(self, interface):
                 @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
                 def t1():
                     qA       = boltzmann.bte_solver._op_diag_dg[grid_idx]
-                    u0       = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
+                    u0       = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
                     
                     h_curr   = xp.dot(qA, u0)
                     h_curr   = boltzmann.bte_solver.normalized_distribution(grid_idx, h_curr)
@@ -705,6 +725,7 @@ def t1():
                     
                     
             await ts
+            rates = rates.reshape((-1))
         return 
         
 if __name__=="__main__":
@@ -735,10 +756,15 @@ async def __main__():
 
             max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
             iter      = 0
-            tt        = 0
+            tt        = 0#interface.currentTime()
             tau       = (1/boltzmann.param.Efreq)
-            dt        = 5e-3 * tau
+            dt_tps    = interface.timeStep()
+            dt_bte    = boltzmann.param.dt * (dt_tps)
+            bte_steps = int(dt_tps/dt_bte)
+            n_grids   = boltzmann.param.n_grids
             
+            cycle_freq = int(xp.ceil(tau/dt_tps))
+            gidx_to_device_map = boltzmann.gidx_to_device_map
             while (iter<max_iters):
                 t1 = time()
                 tps.solveStep()
@@ -750,18 +776,47 @@ async def __main__():
                 tps.push(interface)
                 await boltzmann.fetch(interface)
                 
-                t1 = time()
-                await boltzmann.solve_step(tt, dt)
-                for grid_idx in range(boltzmann.param.n_grids):
-                    u1 = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1")
-                    boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u0", u1)
-                t2 = time()
-                t1 = min_mean_max(t2-t1, comm)
-                print("[BTE] simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tt/tau, t1[0],t1[1],t1[2]))
+                tt_bte       = tt
+                u_avg        = [0 for i in range(n_grids)] 
+                cycle_f1     = (0.5 * dt_bte/ dt_tps)
+                
+                for bte_step_idx in range(bte_steps):
+                
+                    ts = TaskSpace("T")
+                    for grid_idx in range(n_grids):
+                        @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+                        def t1():
+                            u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
+                    await ts
+                    
+                    t1 = time()
+                    await boltzmann.solve_step(tt_bte, dt_bte)
+                    t2 = time()
+                    t1 = min_mean_max(t2-t1, comm)
+                    print("[BTE] simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tt_bte/tau, t1[0],t1[1],t1[2]))
+                    
+                    ts = TaskSpace("T")
+                    for grid_idx in range(n_grids):
+                        @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+                        def t1():
+                            u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1")
+                            boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u0", boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1"))
+                    await ts
+                
+                    tt_bte += dt_bte
                 
+                ts = TaskSpace("T")
+                for grid_idx in range(n_grids):
+                    @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+                    def t1():
+                        boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", u_avg[grid_idx])
+                await ts
                 await boltzmann.push(interface)
+                if (iter%cycle_freq==0):
+                    interface.saveDataCollection(cycle=(iter//cycle_freq), time=tt/tau)
                 tps.fetch(interface)
-                tt += dt
+                tt += dt_tps
+                iter+=1
         
 
     tps.solveEnd()

From 97612079ed17395899d7dc60f4c86066bf1b2612 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Wed, 14 Feb 2024 13:52:15 -0600
Subject: [PATCH 42/75] hierarchical clustering added for the bte solve, active
 grid select based on ionization threshold

---
 src/tps-bte_0d3v.py | 407 ++++++++++++++++++++++++++++++--------------
 1 file changed, 278 insertions(+), 129 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 123cff0a8..1d8e29095 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -94,6 +94,7 @@ class BoltzmannSolverParams():
     l_max         = 1           # spherical modes uses, 0, to l_max
     ev_max        = 16          # v-space grid truncation (eV)
     n_grids       = 4           # number of v-space grids
+    n_sub_clusters= 300         # number of sub-clusters
 
     dt            = 1e-3        # [] non-dimentionalized time w.r.t. oscilation period
     cycles        = 10             # number of max cycles to evolve
@@ -126,6 +127,10 @@ class BoltzmannSolverParams():
     c_gamma       = np.sqrt(2 * scipy.constants.elementary_charge / scipy.constants.electron_mass) #[(C/kg)^{1/2}]
     me            = scipy.constants.electron_mass
     kB            = scipy.constants.Boltzmann
+    N_Avo         = scipy.constants.Avogadro
+    
+    n0            = 3.22e22 #[m^{-3}]
+    
     
 class TPSINDEX():
     """
@@ -138,6 +143,11 @@ class TPSINDEX():
     EF_RE_IDX = 0                       # Re(E) index
     EF_IM_IDX = 1                       # Im(E) index
     
+    # in future we need to setup this methodically
+    # here key denotes the idx running from 0, nreactions-1
+    # value denotes the reaction index in the qoi array
+    RR_IDX   = {0:1}                    
+    
 class Boltzmann0D2VBactchedSolver:
     
     def __init__(self, tps, comm):
@@ -216,34 +226,12 @@ def grid_setup(self, interface):
         assert self.xp_module==np, "grid setup only supported in CPU"
         self.profile_tt[pp.SETUP].start()
         
-        xp                = self.xp_module
-        n_grids           = self.param.n_grids
-        Te                = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
-        # Te_min, Te_max    = xp.min(Te), xp.max(Te)
-        # Te_b              = xp.linspace(Te_min, Te_max, self.param.n_grids, endpoint=False)
-        # dist_mat          = xp.zeros((len(Te), self.param.n_grids))
-        
-        # #scipy.cluster.vq.kmeans(scipy.cluster.vq.whiten(Te), Te_b, ) 
-        
-        # for iter in range(50):
-        #     #print("clustering iteration ", iter, Te_b)
-        #     for i in range(self.param.n_grids):
-        #         dist_mat[:,i] = xp.abs(Te-Te_b[i])
-            
-        #     membership = xp.argmin(dist_mat, axis=1)
-        #     Te_b1      = np.array([np.mean(Te[xp.argwhere(membership==i)[:,0]]) for i in range(self.param.n_grids)])
-        #     rel_error  = np.max(np.abs(1 - Te_b1/Te_b))
-        #     Te_b       = Te_b1
-           
-        #     if rel_error < 1e-4:
-        #         break
-        # Te_b = np.sort(Te_b)
-        # print("K-means Te clusters ", Te_b)                
-        # for i in range(self.param.n_grids):
-        #     dist_mat[:,i] = xp.abs(Te-Te_b[i])
+        xp            = self.xp_module
+        n_grids       = self.param.n_grids
+        Te            = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
         
         Tew           = scipy.cluster.vq.whiten(Te)
-        Tecw          = scipy.cluster.vq.kmeans(Tew, np.linspace(np.min(Tew), np.max(Tew), n_grids), iter=1000, thresh=1e-8)[0]
+        Tecw          = scipy.cluster.vq.kmeans(Tew, np.linspace(np.min(Tew), np.max(Tew), n_grids), iter=1000, thresh=1e-8, check_finite=False)[0]
         Te_b          = Tecw * np.std(Te, axis=0)
         dist_mat      = xp.zeros((len(Te),n_grids))
         
@@ -285,8 +273,17 @@ def grid_setup(self, interface):
 
             print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
             self.bte_solver.set_boltzmann_parameter(grid_idx, "f_mw", self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian"))
-            
         
+        
+        active_grid_idx=list()
+        for grid_idx in range(n_grids):
+            spec_sp     = self.bte_solver._op_spec_sp[grid_idx]
+            ev_max_ext  = (spec_sp._basis_p._t[-1] * vth[grid_idx] / self.param.c_gamma)**2
+            if  ev_max_ext > 15.76:
+                active_grid_idx.append(grid_idx)
+        
+        self.active_grid_idx  = active_grid_idx #[i for i in range(self.param.n_grids)]
+        self.sub_clusters_run = False
         self.profile_tt[pp.SETUP].stop()
         return
 
@@ -411,14 +408,15 @@ def asnumpy(a):
 
         return
     
-    async def fetch(self, interface):
-        gidx_to_pidx_map        = self.grid_idx_to_spatial_idx_map
+    async def fetch(self, interface, use_interp:bool):
+        gidx_to_pidx            = self.grid_idx_to_spatial_idx_map
         heavy_temp              = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
         tps_npts                = len(heavy_temp)
         self.tps_npts           = tps_npts
+        nspecies                = interface.Nspecies()
         electron_temp           = np.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
         efield                  = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
-        species_densities       = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(3, tps_npts)
+        species_densities       = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(nspecies, tps_npts)
         
         # np.save("n0.npy", species_densities[TPSINDEX.NEU_IDX])
         # np.save("ne.npy", species_densities[TPSINDEX.ELE_IDX])
@@ -429,66 +427,175 @@ async def fetch(self, interface):
         # np.save("E.npy" , np.sqrt(efield[0]**2 + efield[1]**2))
         # sys.exit(-1)
         
-        n_grids                 = self.param.n_grids 
-        use_gpu                 = self.param.use_gpu
-        n_grids                 = self.param.n_grids
+        n_grids            = self.param.n_grids 
+        use_gpu            = self.param.use_gpu
         
-        gidx_to_device_map      = self.gidx_to_device_map
+        Tg                 = heavy_temp
+        n0                 = species_densities[TPSINDEX.NEU_IDX]
+        ne                 = species_densities[TPSINDEX.ELE_IDX]
+        ni                 = species_densities[TPSINDEX.ION_IDX]
+    
+        Ex                 = efield[0]
+        Ey                 = efield[1]
+    
+        ExbyN              = Ex/n0/self.param.Td_fac
+        EybyN              = Ey/n0/self.param.Td_fac
         
-        for grid_idx in range(n_grids):
-            bte_idx           = gidx_to_pidx_map[grid_idx]
-            dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
-            
-            ni                = species_densities[TPSINDEX.ION_IDX][bte_idx]
-            ne                = species_densities[TPSINDEX.ELE_IDX][bte_idx]
-            n0                = species_densities[TPSINDEX.NEU_IDX][bte_idx]
-            Tg                = heavy_temp[bte_idx]
-            Te                = electron_temp[bte_idx]
-            eRe               = efield[TPSINDEX.EF_RE_IDX][bte_idx]
-            eIm               = efield[TPSINDEX.EF_IM_IDX][bte_idx]
-            
-            eMag              = np.sqrt(eRe**2 + eIm **2)
-            eByn0             = eMag/n0/self.param.Td_fac
+        Ex                 = ExbyN * self.param.n0 * self.param.Td_fac
+        Ey                 = EybyN * self.param.n0 * self.param.Td_fac
+        
+        ion_deg            = np.zeros_like(ne) #ne/n0
+
+        m_bte              = np.concatenate((ExbyN.reshape((-1, 1)), EybyN.reshape((-1, 1)), Tg.reshape((-1, 1)), ion_deg.reshape((-1,1)) ), axis=1)
+        
+        self.sub_cluster_idx_to_pidx = None
+        self.sub_cluster_c           = None          
+        gidx_to_device_map           = self.gidx_to_device_map
+        
+        if (use_interp == True):
+            n_sub_clusters               = self.param.n_sub_clusters
+            self.sub_cluster_idx_to_pidx = [[None for i in range(n_sub_clusters)] for i in range(self.param.n_grids)]
+            self.sub_cluster_c           = [None for i in range(self.param.n_grids)]
             
-            if self.param.verbose == 1 :
-                print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
-                print("Efreq = %.4E [1/s]" %(self.param.Efreq))
-                print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
-                
-                print("E/n0  (min)               = %.12E [Td]         \t E/n0 (max) = %.12E [Td]    "%(np.min(eByn0), np.max(eByn0)))
-            #     print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
-            #     print("Te    (min)               = %.12E [K]          \t Te   (max) = %.12E [K]     "%(np.min(Te)   , np.max(Te)))
+            def normalize(obs):
+                std_obs   = np.std(obs, axis=0)
+                std_obs[std_obs == 0.0] = 1.0
+                return obs/std_obs, std_obs
+
+            ts = TaskSpace("T")
+            for grid_idx in self.active_grid_idx:
+                @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
+                def t1():
+                    dev_id                       = self.gidx_to_device_map(grid_idx, n_grids)
+                    m                            = m_bte[gidx_to_pidx[grid_idx]]
+                    mw , mw_std                  = normalize(m)
+                    mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
+                    mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=1000, thresh=1e-8, check_finite=False)[0]
+                    mcw0[0:mcw.shape[0], :]      = mcw[:,:]
+                    mcw                          = mcw0
+                    
+                    mc                           = mcw * mw_std
+                    dist_mat                     = np.array([np.linalg.norm(mw - mcw[i], axis=1) for i in range(n_sub_clusters)]).T
+                    membership_m                 = np.argmin(dist_mat, axis=1)
+                    self.sub_cluster_c[grid_idx] = mc
+                    
+                    for c_idx in range(n_sub_clusters):
+                        self.sub_cluster_idx_to_pidx[grid_idx][c_idx] = np.argwhere(membership_m==c_idx)[:,0]
+                        
+                        # idx     = self.sub_cluster_idx_to_pidx[grid_idx][c_idx]
+                        # abs_err = np.linalg.norm(dist_mat[idx, c_idx] - np.linalg.norm(mw[idx] - mcw[c_idx], axis=1))
+                        # print(grid_idx, c_idx, abs_err)
+                    
+                    # dw_mat = np.zeros(self.param.n_sub_clusters)
+                    # print(grid_idx,"\n" , mc)
+                    # for c_idx in range(n_sub_clusters):
+                    #     idx = self.sub_cluster_idx_to_pidx[grid_idx][c_idx]
+                    #     if len(idx>0):
+                    #         dw_mat[c_idx] =  np.max(np.linalg.norm(1 - m[idx] / mc[c_idx], axis = 1))
+                    
+                    # plt.figure(figsize=(8, 8), dpi=300)
+                    # plt.semilogy(np.array(range(self.param.n_sub_clusters)), dw_mat)
+                    # plt.xlabel(r"cluster id")
+                    # plt.ylabel(r"relative error")
+                    # plt.grid(visible=True)
+                    # plt.savefig("%s_grid_idx_%04d.png"%(self.param.out_fname, grid_idx))
+                    # plt.close()
+                    
+                    
+                    n0   = np.ones(mc.shape[0]) * self.param.n0
+                    Ex   = mc[: , 0] * self.param.n0 * self.param.Td_fac
+                    Ey   = mc[: , 1] * self.param.n0 * self.param.Td_fac
+                    
+                    Tg   = mc[: , 2]
+                    ne   = mc[: , 3] * self.param.n0
+                    ni   = mc[: , 3] * self.param.n0
+                    EMag = np.sqrt(Ex**2 + Ey**2)
+                    
+                    if self.param.verbose == 1 :
+                        print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
+                        print("Efreq = %.4E [1/s]" %(self.param.Efreq))
+                        print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
+                        
+                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(np.min(ExbyN), np.max(ExbyN)))
+                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN), np.max(EybyN)))
+                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
+                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
+                                            
+                    if (use_gpu==1):
+                        with cp.cuda.Device(dev_id):
+                            n0   = cp.array(n0) 
+                            Ex   = cp.array(Ex)
+                            Ey   = cp.array(Ey)
+                            Tg   = cp.array(Tg)
+                            ne   = cp.array(ne)
+                            ni   = cp.array(ni)
+                            EMag = cp.sqrt(Ex**2 + Ey**2)
+                    
+                        
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ne" , ne)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ni" , ni)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "Tg" , Tg)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", Ex)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", Ey)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx,  "E" , EMag)
+                    
+                    return
                 
-            #     print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
-                print("ni    (min)               = %.12E [1/m^3]      \t ni   (max) = %.12E [1/m^3] "%(np.min(ni)   , np.max(ni)))
-                print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)   , np.max(n0)))
-            
+            await ts
             
-            if (use_gpu == 1):
-                with cp.cuda.Device(dev_id):
-                    ne        = cp.array(ne)
-                    ni        = cp.array(ni)
-                    n0        = cp.array(n0)
-                    Tg        = cp.array(Tg)
-                    Te        = cp.array(Te)
-                    eRe       = cp.array(eRe)
-                    eIm       = cp.array(eIm)
+        else:
+            ts = TaskSpace("T")
+            for grid_idx in self.active_grid_idx:
+                @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
+                def t1():
+                    bte_idx           = gidx_to_pidx[grid_idx]
+                    dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
+                    
+                    mc                = m_bte[bte_idx]
                     
-                    eMag      = cp.sqrt(eRe**2 + eIm **2)
-                    eByn0     = eMag/n0/self.param.Td_fac
+                    n0                = np.ones(mc.shape[0]) * self.param.n0
+                    Ex                = mc[: , 0] * self.param.n0 * self.param.Td_fac
+                    Ey                = mc[: , 1] * self.param.n0 * self.param.Td_fac
+                    
+                    Tg                = mc[: , 2]
+                    ne                = mc[: , 3] * self.param.n0
+                    ni                = mc[: , 3] * self.param.n0
+                    EMag              = np.sqrt(Ex**2 + Ey**2)
+                
+                    if self.param.verbose == 1 :
+                        print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
+                        print("Efreq = %.4E [1/s]" %(self.param.Efreq))
+                        print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
+                        
+                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(np.min(ExbyN), np.max(ExbyN)))
+                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN), np.max(EybyN)))
+                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
+                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
             
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "ne" , ne)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "ni" , ni)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "Tg" , Tg)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", eRe)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", eIm)
-            self.bte_solver.set_boltzmann_parameter(grid_idx,  "E" , eMag)
-        
             
+                    if (use_gpu == 1):
+                        with cp.cuda.Device(dev_id):
+                            n0   = cp.array(n0) 
+                            ne   = cp.array(ne)
+                            ni   = cp.array(ni)
+                            Ex   = cp.array(Ex)
+                            Ey   = cp.array(Ey)
+                            Tg   = cp.array(Tg)
+                            EMag = cp.sqrt(Ex**2 + Ey**2)
+                    
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ne" , ne)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ni" , ni)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "Tg" , Tg)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", Ex)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", Ey)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx,  "E" , EMag)
+                    return
+            await ts
         return        
 
-    async def solve_init(self):
+    async def solve_init(self, use_interp:bool):
         rank                    = self.comm.Get_rank()
         npes                    = self.comm.Get_size()
         n_grids                 = self.param.n_grids
@@ -507,7 +614,12 @@ def t1():
         def ts_op_setup(grid_idx):
             xp                                      = self.xp_module 
             f_mw                                    = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
-            n_pts                                   = f_mw.shape[1]
+            
+            if (use_interp==True):
+                n_pts                               = self.param.n_sub_clusters
+            else:
+                n_pts                               = f_mw.shape[1]
+                
             Qmat                                    = self.bte_solver._op_qmat[grid_idx]
             INr                                     = xp.eye(Qmat.shape[1])
             self.bte_solver._op_imat_vx[grid_idx]   = xp.einsum("i,jk->ijk",xp.ones(n_pts), INr)
@@ -516,25 +628,33 @@ def ts_op_setup(grid_idx):
             self.xp_module = cp
             ts = TaskSpace("T")
             
-            for grid_idx in range(self.param.n_grids):
+            for grid_idx in self.active_grid_idx:
                 dev_id = gidx_to_device_map(grid_idx, n_grids)
                 @spawn(ts[grid_idx], placement=[gpu(dev_id)], vcus=0.0)
                 def t1():
                     ts_op_setup(grid_idx)
                     f_mw = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
-                    self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw)
+                    
+                    if (use_interp==True):
+                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw[: , 0:self.param.n_sub_clusters])
+                    else:
+                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw)
                     
             
             await ts
         else:
             self.xp_module = np
+            
             ts = TaskSpace("T")
-            for grid_idx in range(self.param.n_grids):
+            for grid_idx in self.active_grid_idx:
                 @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
                 def t1():
                     ts_op_setup(grid_idx)
                     f_mw = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
-                    self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw)
+                    if (use_interp==True):
+                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw[: , 0:self.param.n_sub_clusters])
+                    else:
+                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw)
             
             await ts
         
@@ -546,18 +666,11 @@ async def solve_step(self, time, delta_t):
         """
         rank                    = self.comm.Get_rank()
         npes                    = self.comm.Get_size()
-        # xp                      = self.xp_module
-        # csv_write               = self.param.export_csv
-        # plot_data               = self.param.plot_data
-        # gidx_to_pidx_map        = self.grid_idx_to_spatial_idx_map
-        # use_gpu                 = self.param.use_gpu
-        # dev_id                  = self.param.dev_id
-        # verbose                 = self.param.verbose
         n_grids                 = self.param.n_grids
         gidx_to_device_map      = self.gidx_to_device_map
         
         ts = TaskSpace("T")
-        for grid_idx in range(n_grids):
+        for grid_idx in self.active_grid_idx:
             @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
             def t1():
                 u0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
@@ -697,7 +810,7 @@ def asnumpy(a):
 
         return
     
-    async def push(self, interface):
+    async def push(self, interface, use_interp:bool):
         xp                      = self.xp_module
         n_grids                 = self.param.n_grids
         gidx_to_device_map      = self.gidx_to_device_map
@@ -709,24 +822,49 @@ async def push(self, interface):
         n_reactions = interface.nComponents(libtps.t2bIndex.ReactionRates)
         rates       = np.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False).reshape((n_reactions, tps_npts))
         
-        if(n_reactions>0):
-            ts = TaskSpace("T")
-            for grid_idx in range(n_grids):
-                @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
-                def t1():
-                    qA       = boltzmann.bte_solver._op_diag_dg[grid_idx]
-                    u0       = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                    
-                    h_curr   = xp.dot(qA, u0)
-                    h_curr   = boltzmann.bte_solver.normalized_distribution(grid_idx, h_curr)
-                    qoi      = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
-                    
-                    rates[0][gidx_to_pidx_map[grid_idx]] = xp.asnumpy(qoi["rates"][1])
-                    
-                    
-            await ts
-            rates = rates.reshape((-1))
+        if (use_interp==True):
+            if(n_reactions>0):
+                rates[:,:] = 0.0
+                ts = TaskSpace("T")
+                for grid_idx in self.active_grid_idx:
+                    @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+                    def t1():
+                        qA        = boltzmann.bte_solver._op_diag_dg[grid_idx]
+                        u0        = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                        
+                        h_curr    = xp.dot(qA, u0)
+                        h_curr    = boltzmann.bte_solver.normalized_distribution(grid_idx, h_curr)
+                        qoi       = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
+                        
+                        rr_interp = np.zeros(len(gidx_to_pidx_map[grid_idx]))
+                        rr_cpu    = xp.asnumpy(qoi["rates"][TPSINDEX.RR_IDX[0]])
+                        
+                        for c_idx in range(self.param.n_sub_clusters):
+                            rr_interp[self.sub_cluster_idx_to_pidx[grid_idx][c_idx]] = rr_cpu[c_idx] * self.param.N_Avo
+                        
+                        rates[0][gidx_to_pidx_map[grid_idx]] = rr_interp
+                        
+                await ts
+                rates = rates.reshape((-1))
+        else:
+            if(n_reactions>0):
+                rates[:,:] = 0.0
+                ts = TaskSpace("T")
+                for grid_idx in self.active_grid_idx:
+                    @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+                    def t1():
+                        qA       = boltzmann.bte_solver._op_diag_dg[grid_idx]
+                        u0       = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                        
+                        h_curr   = xp.dot(qA, u0)
+                        h_curr   = boltzmann.bte_solver.normalized_distribution(grid_idx, h_curr)
+                        qoi      = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
+                        rates[0][gidx_to_pidx_map[grid_idx]] = xp.asnumpy(qoi["rates"][TPSINDEX.RR_IDX[0]]) * self.param.N_Avo
+                        
+                await ts
+                rates = rates.reshape((-1))
         return 
+    
         
 if __name__=="__main__":
     comm = MPI.COMM_WORLD
@@ -751,7 +889,9 @@ def t1():
         
         @spawn(placement=cpu, vcus=0)
         async def __main__():
-            await boltzmann.solve_init()    
+            
+            bte_use_interp = True
+            await boltzmann.solve_init(bte_use_interp)
             xp = boltzmann.bte_solver.xp_module
 
             max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
@@ -759,31 +899,39 @@ async def __main__():
             tt        = 0#interface.currentTime()
             tau       = (1/boltzmann.param.Efreq)
             dt_tps    = interface.timeStep()
-            dt_bte    = boltzmann.param.dt * (dt_tps)
+            dt_bte    = 1e-2 * tau #boltzmann.param.dt * (dt_tps)
             bte_steps = int(dt_tps/dt_bte)
             n_grids   = boltzmann.param.n_grids
             
-            cycle_freq = int(xp.ceil(tau/dt_tps))
+            cycle_freq = 10 #int(xp.ceil(tau/dt_tps))
             gidx_to_device_map = boltzmann.gidx_to_device_map
+            
+            tps_sper_cycle = int(xp.ceil(tau/dt_tps))
+            bte_sper_cycle = int(xp.ceil(tau/dt_bte))
+            
+            print("tps steps per cycle : ", tps_sper_cycle, "bte_steps per cycle", bte_sper_cycle)
+            
             while (iter<max_iters):
-                t1 = time()
-                tps.solveStep()
-                t2 = time()
+                tt_tps = tt
                 
-                t1 = min_mean_max(t2-t1, comm)
-                print("[TPS] simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tt/tau, t1[0],t1[1],t1[2]))
+                for tps_idx in range(tps_sper_cycle):
+                    t1 = time()
+                    tps.solveStep()
+                    t2 = time()
+                    t1 = min_mean_max(t2-t1, comm)
+                    print("[TPS] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tps_idx,tt_tps/tau, t1[0],t1[1],t1[2]))
+                    tt_tps +=dt_tps
                 
                 tps.push(interface)
-                await boltzmann.fetch(interface)
+                await boltzmann.fetch(interface, use_interp=bte_use_interp)
                 
                 tt_bte       = tt
                 u_avg        = [0 for i in range(n_grids)] 
-                cycle_f1     = (0.5 * dt_bte/ dt_tps)
-                
-                for bte_step_idx in range(bte_steps):
+                cycle_f1     = (0.5 * dt_bte/ (bte_sper_cycle * dt_bte))
                 
+                for bte_idx in range(bte_sper_cycle):
                     ts = TaskSpace("T")
-                    for grid_idx in range(n_grids):
+                    for grid_idx in boltzmann.active_grid_idx:
                         @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
                         def t1():
                             u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
@@ -793,10 +941,10 @@ def t1():
                     await boltzmann.solve_step(tt_bte, dt_bte)
                     t2 = time()
                     t1 = min_mean_max(t2-t1, comm)
-                    print("[BTE] simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tt_bte/tau, t1[0],t1[1],t1[2]))
+                    print("[BTE] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (bte_idx, tt_bte/tau, t1[0],t1[1],t1[2]))
                     
                     ts = TaskSpace("T")
-                    for grid_idx in range(n_grids):
+                    for grid_idx in boltzmann.active_grid_idx:
                         @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
                         def t1():
                             u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1")
@@ -806,16 +954,17 @@ def t1():
                     tt_bte += dt_bte
                 
                 ts = TaskSpace("T")
-                for grid_idx in range(n_grids):
+                for grid_idx in boltzmann.active_grid_idx:
                     @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
                     def t1():
                         boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", u_avg[grid_idx])
                 await ts
-                await boltzmann.push(interface)
+                await boltzmann.push(interface, use_interp=bte_use_interp)
+                
                 if (iter%cycle_freq==0):
                     interface.saveDataCollection(cycle=(iter//cycle_freq), time=tt/tau)
                 tps.fetch(interface)
-                tt += dt_tps
+                tt += dt_tps * tps_sper_cycle
                 iter+=1
         
 

From 7e449e4527df66e84ae776cc64984bf917aa1c6f Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Thu, 15 Feb 2024 18:33:33 -0600
Subject: [PATCH 43/75] now we evolve the tps code until steady state once the
 BTE rate coefficients are computed.

---
 src/tps-bte_0d3v.py | 232 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 205 insertions(+), 27 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 1d8e29095..2ad2ead8d 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -283,6 +283,7 @@ def grid_setup(self, interface):
                 active_grid_idx.append(grid_idx)
         
         self.active_grid_idx  = active_grid_idx #[i for i in range(self.param.n_grids)]
+        #self.active_grid_idx  = [i for i in range(self.param.n_grids)]
         self.sub_clusters_run = False
         self.profile_tt[pp.SETUP].stop()
         return
@@ -626,8 +627,8 @@ def ts_op_setup(grid_idx):
             
         if(self.param.use_gpu==1):
             self.xp_module = cp
-            ts = TaskSpace("T")
             
+            ts = TaskSpace("T")
             for grid_idx in self.active_grid_idx:
                 dev_id = gidx_to_device_map(grid_idx, n_grids)
                 @spawn(ts[grid_idx], placement=[gpu(dev_id)], vcus=0.0)
@@ -636,15 +637,14 @@ def t1():
                     f_mw = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
                     
                     if (use_interp==True):
-                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw[: , 0:self.param.n_sub_clusters])
+                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", cp.copy(f_mw[: , 0:self.param.n_sub_clusters]))
                     else:
-                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw)
+                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", cp.copy(f_mw))
                     
             
             await ts
         else:
             self.xp_module = np
-            
             ts = TaskSpace("T")
             for grid_idx in self.active_grid_idx:
                 @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
@@ -652,9 +652,9 @@ def t1():
                     ts_op_setup(grid_idx)
                     f_mw = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
                     if (use_interp==True):
-                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw[: , 0:self.param.n_sub_clusters])
+                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", np.copy(f_mw[: , 0:self.param.n_sub_clusters]))
                     else:
-                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", f_mw)
+                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", np.copy(f_mw))
             
             await ts
         
@@ -673,9 +673,18 @@ async def solve_step(self, time, delta_t):
         for grid_idx in self.active_grid_idx:
             @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
             def t1():
+                
+                # seting the E field for time t + dt (implicit step)
+                xp    = self.bte_solver.xp_module
+                eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+                Et    = eRe * xp.cos(2 * xp.pi * self.param.Efreq * (time + delta_t)) + eIm * xp.sin(2 * xp.pi * self.param.Efreq * (time + delta_t))
+                self.bte_solver.set_boltzmann_parameter(grid_idx, "E", Et)
+                
                 u0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
                 v     = self.bte_solver.step(grid_idx, u0, self.param.atol, self.param.rtol, self.param.max_iter, time, delta_t)
                 self.bte_solver.set_boltzmann_parameter(grid_idx, "u1", v)
+                
         await ts
         
         return 
@@ -846,6 +855,7 @@ def t1():
                         
                 await ts
                 rates = rates.reshape((-1))
+                rates[rates<0] = 0.0
         else:
             if(n_reactions>0):
                 rates[:,:] = 0.0
@@ -863,9 +873,96 @@ def t1():
                         
                 await ts
                 rates = rates.reshape((-1))
+                rates[rates<0] = 0.0
         return 
     
+    def io_output_data(self, grid_idx, u0, plot_data:bool, export_csv:bool, fname:str):
+        xp                      = self.xp_module
+        gidx_to_device_map      = self.gidx_to_device_map
+        n_grids                 = self.param.n_grids
+        dev_id                  = gidx_to_device_map(grid_idx, n_grids)
+        qA                      = self.bte_solver._op_diag_dg[grid_idx]
+        h_curr                  = xp.dot(qA, u0)
+        h_curr                  = boltzmann.bte_solver.normalized_distribution(grid_idx, h_curr)
+        ff                      = h_curr
+        qoi                     = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
+
+
+        def asnumpy(a):
+            if cp.get_array_module(a)==cp:
+                with cp.cuda.Device(dev_id):
+                    return cp.asnumpy(a)
+            else:
+                return a
+            
+        ff_cpu   = asnumpy(ff)
+        ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
+        ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff_cpu)
+        
+        n0       = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "n0"))
+        ne       = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ne"))
+        ni       = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ni"))
+        Tg       = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg"))
+        eRe      = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe"))
+        eIm      = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm"))
+        eMag     = np.sqrt(eRe**2 + eIm**2)
         
+        data_csv = np.zeros((ne.shape[0], 8 + len((self.param.collisions))))    
+
+        if export_csv:
+            data_csv[: , 0]    = n0
+            data_csv[: , 1]    = ne
+            data_csv[: , 2]    = ni
+            data_csv[: , 3]    = Tg
+            data_csv[: , 4]    = eMag
+            data_csv[: , 5]    = asnumpy(qoi["energy"])
+            data_csv[: , 6]    = asnumpy(qoi["mobility"])
+            data_csv[: , 7]    = asnumpy(qoi["diffusion"])
+                
+            for col_idx, g in enumerate(self.param.collisions):
+                data_csv[: , 8 + col_idx]    = asnumpy(qoi["rates"][col_idx])
+                
+            
+            with open("%s_qoi.csv"%(fname), 'w', encoding='UTF8') as f:
+                writer = csv.writer(f,delimiter=',')
+                # write the header
+                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
+                for col_idx, g in enumerate(self.param.collisions):
+                    header.append(str(g))
+                
+                writer.writerow(header)
+                writer.writerows(data_csv)
+
+        if plot_data:
+            n_pts        = ff_cpu.shape[1]
+            num_sh       = len(self.bte_solver._par_lm[grid_idx])
+            num_subplots = num_sh 
+            num_plt_cols = min(num_sh, 4)
+            num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
+            fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=200, constrained_layout=True)
+            plt_idx      =  1
+            n_pts_step   =  n_pts // 20
+
+            for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
+                plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
+                for ii in range(0, n_pts, n_pts_step):
+                    fr = np.abs(ff_r[ii, lm_idx, :])
+                    #plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
+                
+                #plt.xlabel(r"energy (eV)")
+                #plt.ylabel(r"$f_%d$"%(lm[0]))
+                plt.grid(visible=True)
+                if lm_idx==0:
+                    plt.legend(prop={'size': 6})
+                    
+                plt_idx +=1
+            
+            plt.savefig("%s_plot.png"%(fname))
+            plt.close()
+            
+        return        
+        
+            
 if __name__=="__main__":
     comm = MPI.COMM_WORLD
     
@@ -903,33 +1000,73 @@ async def __main__():
             bte_steps = int(dt_tps/dt_bte)
             n_grids   = boltzmann.param.n_grids
             
-            cycle_freq = 10 #int(xp.ceil(tau/dt_tps))
+            cycle_freq           = 1 #int(xp.ceil(tau/dt_tps))
+            terminal_output_freq = -1
             gidx_to_device_map = boltzmann.gidx_to_device_map
             
             tps_sper_cycle = int(xp.ceil(tau/dt_tps))
             bte_sper_cycle = int(xp.ceil(tau/dt_bte))
+            bte_max_cycles = 10
+            tps_max_cycles = 1000
             
             print("tps steps per cycle : ", tps_sper_cycle, "bte_steps per cycle", bte_sper_cycle)
-            
+            tps.solveStep()
+            tps.push(interface)
+            p_t1 = 0 
+            p_t2 = 0
             while (iter<max_iters):
-                tt_tps = tt
                 
-                for tps_idx in range(tps_sper_cycle):
-                    t1 = time()
-                    tps.solveStep()
-                    t2 = time()
-                    t1 = min_mean_max(t2-t1, comm)
-                    print("[TPS] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tps_idx,tt_tps/tau, t1[0],t1[1],t1[2]))
-                    tt_tps +=dt_tps
+                if (iter%cycle_freq==0):
+                    interface.saveDataCollection(cycle=(iter//cycle_freq), time=iter)
                 
-                tps.push(interface)
+                ########################## BTE solve ##################################################
                 await boltzmann.fetch(interface, use_interp=bte_use_interp)
+                tt_bte       = 0
+                bte_u        = [0 for i in range(n_grids)]
+                bte_v        = [0 for i in range(n_grids)]
+                
+                u_avg        = [0 for i in range(n_grids)]
                 
-                tt_bte       = tt
-                u_avg        = [0 for i in range(n_grids)] 
+                abs_error    = [0 for i in range(n_grids)]
+                rel_error    = [0 for i in range(n_grids)]
                 cycle_f1     = (0.5 * dt_bte/ (bte_sper_cycle * dt_bte))
                 
-                for bte_idx in range(bte_sper_cycle):
+                for bte_idx in range(bte_sper_cycle * bte_max_cycles +1):
+                    
+                    if (bte_idx % bte_sper_cycle == 0):
+                        
+                        ts = TaskSpace("T")
+                        for grid_idx in boltzmann.active_grid_idx:
+                            @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+                            def t1():
+                                u0      = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
+                                fname   = "%s_iter%04d_grid_%04d_cycle_%0d"%(boltzmann.param.out_fname, iter, grid_idx, bte_idx//bte_sper_cycle)
+                                
+                                #boltzmann.io_output_data(grid_idx, u0, False, True, fname)
+                                
+                                abs_error[grid_idx] = xp.max(xp.abs(bte_v[grid_idx]-u0))
+                                rel_error[grid_idx] = abs_error[grid_idx] / xp.max(xp.abs(u0))
+                                #print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E"%(bte_idx, tt_bte, abs_error[grid_idx], rel_error[grid_idx]))
+                                
+                                # if(bte_idx >0):
+                                #     print(grid_idx, " u_ptr ", u_avg[grid_idx].data, " v_ptr " , v_avg[grid_idx].data)
+                                
+                                bte_v[grid_idx] = xp.copy(u0)
+                                
+                        await ts
+                        p_t3 = min_mean_max(p_t2-p_t1, comm)
+                        print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E --- runtime = %.4E (s) "%(bte_idx, tt_bte, max(abs_error), max(rel_error), p_t3[2]))
+                        
+                        if max(abs_error) < boltzmann.param.atol or max(rel_error)< boltzmann.param.rtol:
+                            break
+                        
+                        if bte_idx < bte_sper_cycle * bte_max_cycles:
+                            u_avg  = [0 for i in range(n_grids)]
+                            
+                            
+                    if bte_idx == bte_sper_cycle * bte_max_cycles :
+                        break    
+                        
                     ts = TaskSpace("T")
                     for grid_idx in boltzmann.active_grid_idx:
                         @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
@@ -937,11 +1074,13 @@ def t1():
                             u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
                     await ts
                     
-                    t1 = time()
+                    p_t1 = time()
                     await boltzmann.solve_step(tt_bte, dt_bte)
-                    t2 = time()
-                    t1 = min_mean_max(t2-t1, comm)
-                    print("[BTE] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (bte_idx, tt_bte/tau, t1[0],t1[1],t1[2]))
+                    p_t2 = time()
+                    
+                    if(terminal_output_freq > 0 and bte_idx % terminal_output_freq ==0):
+                        p_t3 = min_mean_max(p_t2-p_t1, comm)
+                        print("[BTE] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (bte_idx, tt_bte, p_t3[0], p_t3[1], p_t3[2]))
                     
                     ts = TaskSpace("T")
                     for grid_idx in boltzmann.active_grid_idx:
@@ -961,10 +1100,49 @@ def t1():
                 await ts
                 await boltzmann.push(interface, use_interp=bte_use_interp)
                 
-                if (iter%cycle_freq==0):
-                    interface.saveDataCollection(cycle=(iter//cycle_freq), time=tt/tau)
+                
+                ################### tps solve ######################################
                 tps.fetch(interface)
-                tt += dt_tps * tps_sper_cycle
+                tps_u  = 0
+                tps_v  = 0
+                tt_tps = 0
+                
+                p_t1   = 0
+                p_t2   = 0
+                for tps_idx in range(tps_sper_cycle * tps_max_cycles + 1):
+                    if (tps_idx % tps_sper_cycle == 0):
+                        
+                        tps.push(interface)
+                        nspecies            = interface.Nspecies()
+                        heavy_temp          = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+                        tps_npts            = len(heavy_temp)
+                        tps_u               = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(nspecies, tps_npts)
+                        # rates               = np.array(interface.HostRead(libtps.t2bIndex.ReactionRates), copy=False).reshape((1, tps_npts))
+                        # print("rates", np.min(rates[0]), np.max(rates[0]))
+                        
+                        abs_error           = np.linalg.norm(tps_u - tps_v, axis=1)
+                        rel_error           = abs_error / np.linalg.norm(tps_u, axis=1)
+                        tps_v               = np.copy(tps_u)
+                        
+                        p_t3 = min_mean_max(p_t2-p_t1, comm)
+                        print("[TPS] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E -- runtime = %.4E (s)"%(tps_idx, tt_tps, np.max(abs_error), np.max(rel_error), p_t3[2]))
+                        if (np.max(abs_error) < boltzmann.param.atol or np.max(rel_error) < max(1e-6,boltzmann.param.rtol)):
+                            break
+                    
+                    if (tps_idx == tps_sper_cycle * tps_max_cycles):
+                        break
+                    
+                    p_t1 = time()
+                    tps.solveStep()
+                    p_t2 = time()
+                    if(terminal_output_freq > 0 and tps_idx % terminal_output_freq ==0):
+                        p_t3 = min_mean_max(p_t2-p_t1, comm)
+                        print("[TPS] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tps_idx,tt_tps, p_t3[0],p_t3[1],p_t3[2]))
+                    tt_tps +=dt_tps
+                
+                tps.push(interface)
+                
+                tt += dt_tps * tps_idx
                 iter+=1
         
 

From d81feaafcefe6d372ad102e49119ee21356157c4 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Tue, 27 Feb 2024 14:12:41 -0600
Subject: [PATCH 44/75] 6-species collision model added for the tps batched
 solver,  * For the data output now added additional collisions and mole
 fractions.  * random seed parameter initialized for reproducibility between
 runs

---
 src/tps-bte_0d3v.py | 377 ++++++++++++++++++++++++++------------------
 1 file changed, 225 insertions(+), 152 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 2ad2ead8d..3cfbfdc53 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -50,19 +50,18 @@ def reset(self):
 def min_mean_max(a, comm: MPI.Comm):
     return (comm.allreduce(a, MPI.MIN) , comm.allreduce(a, MPI.SUM)/comm.Get_size(), comm.allreduce(a, MPI.MAX))
 
-
-try:
-    df    = pd.read_csv("ionization_rates.csv")
-    Te    = np.array(df["Te[K]"]) 
-    r_arr = np.array(df["Arr[m3/s]"])
-    r_csc = np.array(df["CSC_Maxwellian[m3/s]"])
-    r_arr = scipy.interpolate.interp1d(Te, r_arr,bounds_error=False, fill_value=0.0)
-    r_csc = scipy.interpolate.interp1d(Te, r_csc,bounds_error=False, fill_value=0.0)
-    print("ionization coefficient read from file ")
-except:
-    print("ionization rate coefficient file not found!!")
-    r_arr = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
-    r_csc = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
+# try:
+#     df    = pd.read_csv("ionization_rates.csv")
+#     Te    = np.array(df["Te[K]"]) 
+#     r_arr = np.array(df["Arr[m3/s]"])
+#     r_csc = np.array(df["CSC_Maxwellian[m3/s]"])
+#     r_arr = scipy.interpolate.interp1d(Te, r_arr,bounds_error=False, fill_value=0.0)
+#     r_csc = scipy.interpolate.interp1d(Te, r_csc,bounds_error=False, fill_value=0.0)
+#     print("ionization coefficient read from file ")
+# except:
+#     print("ionization rate coefficient file not found!!")
+#     r_arr = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
+#     r_csc = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
 
 # set path to C++ TPS library
 path = os.path.abspath(os.path.dirname(sys.argv[0]))
@@ -94,7 +93,7 @@ class BoltzmannSolverParams():
     l_max         = 1           # spherical modes uses, 0, to l_max
     ev_max        = 16          # v-space grid truncation (eV)
     n_grids       = 4           # number of v-space grids
-    n_sub_clusters= 300         # number of sub-clusters
+    n_sub_clusters= 200         # number of sub-clusters
 
     dt            = 1e-3        # [] non-dimentionalized time w.r.t. oscilation period
     cycles        = 10             # number of max cycles to evolve
@@ -107,7 +106,7 @@ class BoltzmannSolverParams():
     use_gpu       = 1           # enable GPU use (1)-GPU solver, (0)-CPU solver
     dev_id        = 0           # which GPU device to use only used when use_gpu=1
 
-    collisions    = ["g0","g2"] # collision string g0-elastic, g2-ionization
+    collisions    = ""          # collision string g0-elastic, g2-ionization
     export_csv    = 1           # export the qois to csv file
     plot_data     = 1
     
@@ -131,14 +130,16 @@ class BoltzmannSolverParams():
     
     n0            = 3.22e22 #[m^{-3}]
     
+    rand_seed     = 0
+    
     
 class TPSINDEX():
     """
     simple index map to differnt fields, from the TPS arrays
     """
-    ION_IDX = 0                         # ion      density index
-    ELE_IDX = 1                         # electron density index
-    NEU_IDX = 2                         # neutral  density index
+    # ION_IDX = 0                         # ion      density index
+    # ELE_IDX = 1                         # electron density index
+    # NEU_IDX = 2                         # neutral  density index
     
     EF_RE_IDX = 0                       # Re(E) index
     EF_IM_IDX = 1                       # Im(E) index
@@ -146,7 +147,20 @@ class TPSINDEX():
     # in future we need to setup this methodically
     # here key denotes the idx running from 0, nreactions-1
     # value denotes the reaction index in the qoi array
-    RR_IDX   = {0:1}                    
+    RR_IDX   = {0 : 4 , 1 : 5 , 2 : 6, 3 : 7, 4 : 1 , 5 : 2, 6 : 3 }
+    
+    
+    ION_IDX  = 3
+    ELE_IDX  = 4
+    NEU_IDX  = 5
+    EX1_IDX  = 0
+    EX2_IDX  = 1
+    EX3_IDX  = 2
+    
+    MOLE_FRAC_IDX = {0: NEU_IDX, 1: EX1_IDX , 2: EX2_IDX , 3: EX3_IDX} 
+    
+    
+                        
     
 class Boltzmann0D2VBactchedSolver:
     
@@ -195,6 +209,7 @@ def __parse_config_file__(self, fname):
         self.param.Nr               = int(config.get("boltzmannSolver", "Nr").split("#")[0].strip())
         self.param.l_max            = int(config.get("boltzmannSolver", "l_max").split("#")[0].strip())
         self.param.n_grids          = int(config.get("boltzmannSolver", "n_grids").split("#")[0].strip())
+        self.param.n_sub_clusters   = int(config.get("boltzmannSolver", "n_sub_clusters").split("#")[0].strip())
         self.param.dt               = float(config.get("boltzmannSolver", "dt").split("#")[0].strip())
         self.param.cycles           = float(config.get("boltzmannSolver", "cycles").split("#")[0].strip())
         self.param.solver_type      = str(config.get("boltzmannSolver", "solver_type").split("#")[0].strip()) 
@@ -203,7 +218,7 @@ def __parse_config_file__(self, fname):
         self.param.max_iter         = int(config.get("boltzmannSolver", "max_iter").split("#")[0].strip())
         self.param.ee_collisions    = int(config.get("boltzmannSolver", "ee_collisions").split("#")[0].strip())
         self.param.use_gpu          = int(config.get("boltzmannSolver", "use_gpu").split("#")[0].strip())
-        #self.param.collisions       = config.get("boltzmannSolver", "collisions").split("#")[0]
+        self.param.collisions       = str(config.get("boltzmannSolver", "collisions").split("#")[0].strip())
         
         self.param.export_csv       = int(config.get("boltzmannSolver", "export_csv").split("#")[0].strip())
         self.param.plot_data        = int(config.get("boltzmannSolver", "plot_data").split("#")[0].strip())
@@ -275,15 +290,15 @@ def grid_setup(self, interface):
             self.bte_solver.set_boltzmann_parameter(grid_idx, "f_mw", self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian"))
         
         
-        active_grid_idx=list()
-        for grid_idx in range(n_grids):
-            spec_sp     = self.bte_solver._op_spec_sp[grid_idx]
-            ev_max_ext  = (spec_sp._basis_p._t[-1] * vth[grid_idx] / self.param.c_gamma)**2
-            if  ev_max_ext > 15.76:
-                active_grid_idx.append(grid_idx)
+        # active_grid_idx=list()
+        # for grid_idx in range(n_grids):
+        #     spec_sp     = self.bte_solver._op_spec_sp[grid_idx]
+        #     ev_max_ext  = (spec_sp._basis_p._t[-1] * vth[grid_idx] / self.param.c_gamma)**2
+        #     if  ev_max_ext > 15.76:
+        #         active_grid_idx.append(grid_idx)
         
-        self.active_grid_idx  = active_grid_idx #[i for i in range(self.param.n_grids)]
-        #self.active_grid_idx  = [i for i in range(self.param.n_grids)]
+        # self.active_grid_idx  = active_grid_idx #[i for i in range(self.param.n_grids)]
+        self.active_grid_idx  = [i for i in range(self.param.n_grids)]
         self.sub_clusters_run = False
         self.profile_tt[pp.SETUP].stop()
         return
@@ -301,9 +316,11 @@ def solve_wo_parla(self):
         
         self.qoi         = [None for grid_idx in range(self.param.n_grids)]
         self.ff          = [None for grid_idx in range(self.param.n_grids)]
+        coll_list        = self.bte_solver.get_collision_list()
+        coll_names       = self.bte_solver.get_collision_names()
         
         if csv_write: 
-            data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
+            data_csv = np.empty((self.tps_npts, 8 + len(coll_list)))
         
         t1 = time()
         for grid_idx in range(n_grids):
@@ -366,7 +383,7 @@ def asnumpy(a):
                     data_csv[gidx_to_pidx_map[grid_idx], 6]    = asnumpy(qoi["mobility"])
                     data_csv[gidx_to_pidx_map[grid_idx], 7]    = asnumpy(qoi["diffusion"])
                     
-                    for col_idx, g in enumerate(self.param.collisions):
+                    for col_idx, g in enumerate(coll_list):
                         data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = asnumpy(qoi["rates"][col_idx])
 
                 if plot_data:
@@ -401,8 +418,8 @@ def asnumpy(a):
                     writer = csv.writer(f,delimiter=',')
                     # write the header
                     header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
-                    for col_idx, g in enumerate(self.param.collisions):
-                        header.append(str(g))
+                    for col_idx, g in enumerate(coll_list):
+                        header.append(str(coll_names[col_idx]))
                     
                     writer.writerow(header)
                     writer.writerows(data_csv)
@@ -419,6 +436,11 @@ async def fetch(self, interface, use_interp:bool):
         efield                  = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
         species_densities       = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(nspecies, tps_npts)
         
+        cs_avail_species        = self.bte_solver._avail_species
+        
+        n0                      = np.sum(species_densities, axis=0) - species_densities[TPSINDEX.ELE_IDX]
+        ns_by_n0                = np.concatenate([species_densities[TPSINDEX.MOLE_FRAC_IDX[i]]/n0 for i in range(len(cs_avail_species))]).reshape((len(cs_avail_species), tps_npts))
+        
         # np.save("n0.npy", species_densities[TPSINDEX.NEU_IDX])
         # np.save("ne.npy", species_densities[TPSINDEX.ELE_IDX])
         # np.save("ni.npy", species_densities[TPSINDEX.ION_IDX])
@@ -432,10 +454,7 @@ async def fetch(self, interface, use_interp:bool):
         use_gpu            = self.param.use_gpu
         
         Tg                 = heavy_temp
-        n0                 = species_densities[TPSINDEX.NEU_IDX]
-        ne                 = species_densities[TPSINDEX.ELE_IDX]
-        ni                 = species_densities[TPSINDEX.ION_IDX]
-    
+        
         Ex                 = efield[0]
         Ey                 = efield[1]
     
@@ -445,9 +464,8 @@ async def fetch(self, interface, use_interp:bool):
         Ex                 = ExbyN * self.param.n0 * self.param.Td_fac
         Ey                 = EybyN * self.param.n0 * self.param.Td_fac
         
-        ion_deg            = np.zeros_like(ne) #ne/n0
-
-        m_bte              = np.concatenate((ExbyN.reshape((-1, 1)), EybyN.reshape((-1, 1)), Tg.reshape((-1, 1)), ion_deg.reshape((-1,1)) ), axis=1)
+        ion_deg            = species_densities[TPSINDEX.ELE_IDX]/n0
+        m_bte              = np.concatenate([ExbyN.reshape((-1, 1)), EybyN.reshape((-1, 1)), Tg.reshape((-1, 1)), ion_deg.reshape((-1, 1))] + [ ns_by_n0[i].reshape((-1, 1)) for i in range(ns_by_n0.shape[0])], axis=1)
         
         self.sub_cluster_idx_to_pidx = None
         self.sub_cluster_c           = None          
@@ -470,6 +488,10 @@ def t1():
                     dev_id                       = self.gidx_to_device_map(grid_idx, n_grids)
                     m                            = m_bte[gidx_to_pidx[grid_idx]]
                     mw , mw_std                  = normalize(m)
+                    
+                    # to repoduce clusters
+                    np.random.seed(self.param.rand_seed)
+                    
                     mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
                     mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=1000, thresh=1e-8, check_finite=False)[0]
                     mcw0[0:mcw.shape[0], :]      = mcw[:,:]
@@ -503,24 +525,28 @@ def t1():
                     # plt.close()
                     
                     
-                    n0   = np.ones(mc.shape[0]) * self.param.n0
-                    Ex   = mc[: , 0] * self.param.n0 * self.param.Td_fac
-                    Ey   = mc[: , 1] * self.param.n0 * self.param.Td_fac
-                    
-                    Tg   = mc[: , 2]
-                    ne   = mc[: , 3] * self.param.n0
-                    ni   = mc[: , 3] * self.param.n0
-                    EMag = np.sqrt(Ex**2 + Ey**2)
+                    n0       = np.ones(mc.shape[0]) * self.param.n0
+                    Ex       = mc[: , 0] * self.param.n0 * self.param.Td_fac
+                    Ey       = mc[: , 1] * self.param.n0 * self.param.Td_fac
+                    Tg       = mc[: , 2]
+                    ne       = mc[: , 3] * self.param.n0
+                    ni       = mc[: , 3] * self.param.n0
+                    ns_by_n0 = np.transpose(mc[: , 4:])
+                    EMag     = np.sqrt(Ex**2 + Ey**2)
                     
                     if self.param.verbose == 1 :
                         print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
                         print("Efreq = %.4E [1/s]" %(self.param.Efreq))
                         print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
                         
-                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(np.min(ExbyN), np.max(ExbyN)))
-                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN), np.max(EybyN)))
-                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
-                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
+                        print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)       , np.max(n0)))
+                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(np.min(ExbyN)    , np.max(ExbyN)))
+                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN)    , np.max(EybyN)))
+                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)       , np.max(Tg)))
+                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)       , np.max(ne)))
+                        
+                        for i in range(ns_by_n0.shape[0]):
+                            print("[%d] ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(i, np.min(ns_by_n0[i]) , np.max(ns_by_n0[i])))
                                             
                     if (use_gpu==1):
                         with cp.cuda.Device(dev_id):
@@ -531,8 +557,9 @@ def t1():
                             ne   = cp.array(ne)
                             ni   = cp.array(ni)
                             EMag = cp.sqrt(Ex**2 + Ey**2)
+                            ns_by_n0 = cp.array(ns_by_n0)
                     
-                        
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ns_by_n0", ns_by_n0)    
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "ne" , ne)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "ni" , ni)
@@ -562,6 +589,8 @@ def t1():
                     Tg                = mc[: , 2]
                     ne                = mc[: , 3] * self.param.n0
                     ni                = mc[: , 3] * self.param.n0
+                    ns_by_n0          = np.transpose(mc[: , 4:])
+                    
                     EMag              = np.sqrt(Ex**2 + Ey**2)
                 
                     if self.param.verbose == 1 :
@@ -573,6 +602,9 @@ def t1():
                         print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN), np.max(EybyN)))
                         print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
                         print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
+                        
+                        for i in range(ns_by_n0.shape[0]):
+                            print("ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(np.min(ns_by_n0[i]) , np.max(ns_by_n0[i])))
             
             
                     if (use_gpu == 1):
@@ -584,7 +616,9 @@ def t1():
                             Ey   = cp.array(Ey)
                             Tg   = cp.array(Tg)
                             EMag = cp.sqrt(Ex**2 + Ey**2)
+                            ns_by_n0 = cp.array(ns_by_n0)
                     
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ns_by_n0", ns_by_n0)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "ne" , ne)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "ni" , ni)
@@ -708,6 +742,8 @@ async def solve(self):
         self.qoi                = [None for grid_idx in range(self.param.n_grids)]
         self.ff                 = [None for grid_idx in range(self.param.n_grids)]
         num_gpus                = len(gpu)
+        coll_list               = self.bte_solver.get_collision_list()
+        coll_names              = self.bte_solver.get_collision_names()
         
         if (use_gpu==1):
             parla_placement = [gpu(gidx_to_device_map(grid_idx,n_grids)) for grid_idx in range(n_grids)]
@@ -715,16 +751,16 @@ async def solve(self):
             parla_placement = [cpu for grid_idx in range(n_grids)]
 
         if csv_write: 
-            data_csv = np.empty((self.tps_npts, 8 + len(self.param.collisions)))
+            data_csv = np.empty((self.tps_npts, 8 + len(coll_list)))
             
         self.profile_tt[pp.SOLVE].start()
         ts = TaskSpace("T")
-        for grid_idx in range(self.param.n_grids):
+        for grid_idx in self.active_grid_idx:
             @spawn(ts[grid_idx], placement=[parla_placement[grid_idx]], vcus=0.0)
             def t1():
                 try:
                     print("[Boltzmann] %d / %d launching grid %d on %s"%(rank, npes, grid_idx, parla_placement[grid_idx]))
-                    f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                    f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
                     ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
                     self.ff[grid_idx]  = ff
                     self.qoi[grid_idx] = qoi
@@ -741,6 +777,7 @@ def t1():
         print("[Boltzmann] setup (min) = %.4E (s) setup (mean) = %.4E (s) setup (max) = %.4E (s)" % (t1[0],t1[1],t1[2]))
         print("[Boltzmann] solve (min) = %.4E (s) solve (mean) = %.4E (s) solve (max) = %.4E (s)" % (t2[0],t2[1],t2[2]))
         
+        """
         if (self.param.export_csv ==1 or self.param.plot_data==1):
             for grid_idx in range(n_grids):
                 dev_id   = gidx_to_device_map(grid_idx, n_grids)
@@ -776,7 +813,7 @@ def asnumpy(a):
                     data_csv[gidx_to_pidx_map[grid_idx], 6]    = asnumpy(qoi["mobility"])
                     data_csv[gidx_to_pidx_map[grid_idx], 7]    = asnumpy(qoi["diffusion"])
                     
-                    for col_idx, g in enumerate(self.param.collisions):
+                    for col_idx, g in enumerate(coll_list):
                         data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = asnumpy(qoi["rates"][col_idx])
 
                 if plot_data:
@@ -811,11 +848,12 @@ def asnumpy(a):
                     writer = csv.writer(f,delimiter=',')
                     # write the header
                     header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
-                    for col_idx, g in enumerate(self.param.collisions):
-                        header.append(str(g))
+                    for col_idx, g in enumerate(coll_list):
+                        header.append(str(coll_names[col_idx]))
                     
                     writer.writerow(header)
                     writer.writerows(data_csv)
+        """
 
         return
     
@@ -840,18 +878,16 @@ async def push(self, interface, use_interp:bool):
                     def t1():
                         qA        = boltzmann.bte_solver._op_diag_dg[grid_idx]
                         u0        = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                        
-                        h_curr    = xp.dot(qA, u0)
-                        h_curr    = boltzmann.bte_solver.normalized_distribution(grid_idx, h_curr)
+                        h_curr    = boltzmann.bte_solver.normalized_distribution(grid_idx, u0)
                         qoi       = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
+                        rr_cpu    = xp.asnumpy(qoi["rates"])
                         
                         rr_interp = np.zeros(len(gidx_to_pidx_map[grid_idx]))
-                        rr_cpu    = xp.asnumpy(qoi["rates"][TPSINDEX.RR_IDX[0]])
-                        
-                        for c_idx in range(self.param.n_sub_clusters):
-                            rr_interp[self.sub_cluster_idx_to_pidx[grid_idx][c_idx]] = rr_cpu[c_idx] * self.param.N_Avo
+                        for r_idx in range(n_reactions):
+                            for c_idx in range(self.param.n_sub_clusters):
+                                rr_interp[self.sub_cluster_idx_to_pidx[grid_idx][c_idx]] = rr_cpu[TPSINDEX.RR_IDX[r_idx]][c_idx] * self.param.N_Avo
                         
-                        rates[0][gidx_to_pidx_map[grid_idx]] = rr_interp
+                            rates[r_idx][gidx_to_pidx_map[grid_idx]] = rr_interp
                         
                 await ts
                 rates = rates.reshape((-1))
@@ -865,11 +901,12 @@ def t1():
                     def t1():
                         qA       = boltzmann.bte_solver._op_diag_dg[grid_idx]
                         u0       = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                        
-                        h_curr   = xp.dot(qA, u0)
-                        h_curr   = boltzmann.bte_solver.normalized_distribution(grid_idx, h_curr)
+                        h_curr   = boltzmann.bte_solver.normalized_distribution(grid_idx, u0)
                         qoi      = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
-                        rates[0][gidx_to_pidx_map[grid_idx]] = xp.asnumpy(qoi["rates"][TPSINDEX.RR_IDX[0]]) * self.param.N_Avo
+                        rr_cpu   = xp.asnumpy(qoi["rates"])
+                        
+                        for r_idx in range(n_reactions):
+                            rates[r_idx][gidx_to_pidx_map[grid_idx]] = rr_cpu[TPSINDEX.RR_IDX[r_idx]] * self.param.N_Avo
                         
                 await ts
                 rates = rates.reshape((-1))
@@ -886,8 +923,16 @@ def io_output_data(self, grid_idx, u0, plot_data:bool, export_csv:bool, fname:st
         h_curr                  = boltzmann.bte_solver.normalized_distribution(grid_idx, h_curr)
         ff                      = h_curr
         qoi                     = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
-
-
+        coll_list               = self.bte_solver.get_collision_list()
+        coll_names              = self.bte_solver.get_collision_names()
+        cs_data                 = self.bte_solver.get_cross_section_data()
+        
+        cs_species              = list()
+        for col_idx, (k,v) in enumerate(cs_data.items()):
+            cs_species.append(v["species"])
+            
+        cs_species = list(sorted(set(cs_species), key=cs_species.index))
+        
         def asnumpy(a):
             if cp.get_array_module(a)==cp:
                 with cp.cuda.Device(dev_id):
@@ -902,33 +947,36 @@ def asnumpy(a):
         n0       = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "n0"))
         ne       = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ne"))
         ni       = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ni"))
+        ns_by_n0 = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ns_by_n0")).T
         Tg       = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg"))
         eRe      = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe"))
         eIm      = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm"))
         eMag     = np.sqrt(eRe**2 + eIm**2)
         
-        data_csv = np.zeros((ne.shape[0], 8 + len((self.param.collisions))))    
+        data_csv = np.zeros((ne.shape[0], 7 + ns_by_n0.shape[1] + len((coll_list))))    
 
         if export_csv:
             data_csv[: , 0]    = n0
-            data_csv[: , 1]    = ne
-            data_csv[: , 2]    = ni
-            data_csv[: , 3]    = Tg
-            data_csv[: , 4]    = eMag
-            data_csv[: , 5]    = asnumpy(qoi["energy"])
-            data_csv[: , 6]    = asnumpy(qoi["mobility"])
-            data_csv[: , 7]    = asnumpy(qoi["diffusion"])
+            data_csv[: , 1]    = ne/n0
+            idx                =2 + ns_by_n0.shape[1]
+            data_csv[: ,2:idx] = ns_by_n0[:,:]
+            
+            data_csv[: , idx]      = Tg
+            data_csv[: , idx+1]    = eMag
+            data_csv[: , idx+2]    = asnumpy(qoi["energy"])
+            data_csv[: , idx+3]    = asnumpy(qoi["mobility"])
+            data_csv[: , idx+4]    = asnumpy(qoi["diffusion"])
                 
-            for col_idx, g in enumerate(self.param.collisions):
-                data_csv[: , 8 + col_idx]    = asnumpy(qoi["rates"][col_idx])
+            for col_idx, g in enumerate(coll_list):
+                data_csv[: , idx+5 + col_idx]    = asnumpy(qoi["rates"][col_idx])
                 
             
             with open("%s_qoi.csv"%(fname), 'w', encoding='UTF8') as f:
                 writer = csv.writer(f,delimiter=',')
                 # write the header
-                header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
-                for col_idx, g in enumerate(self.param.collisions):
-                    header.append(str(g))
+                header = ["n0", "ne/n0"] + ["(%s)/n0"%(s) for s in cs_species] + ["Tg", "E",  "energy", "mobility", "diffusion"]
+                for col_idx, g in enumerate(coll_list):
+                    header.append(str(coll_names[col_idx]))
                 
                 writer.writerow(header)
                 writer.writerows(data_csv)
@@ -946,11 +994,12 @@ def asnumpy(a):
             for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
                 plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
                 for ii in range(0, n_pts, n_pts_step):
-                    fr = np.abs(ff_r[ii, lm_idx, :])
-                    #plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
+                    fr     = np.abs(ff_r[ii, lm_idx, :])
+                    mf_str = " ".join([r"$%s/n0$=%.2E"%(s, ns_by_n0[ii, s_idx]) for s_idx, s in enumerate(cs_species)])
+                    plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td]"%(Tg[ii], eMag[ii]/n0[ii]/1e-21) + " " +mf_str)
                 
-                #plt.xlabel(r"energy (eV)")
-                #plt.ylabel(r"$f_%d$"%(lm[0]))
+                plt.xlabel(r"energy (eV)")
+                plt.ylabel(r"$f_%d$"%(lm[0]))
                 plt.grid(visible=True)
                 if lm_idx==0:
                     plt.legend(prop={'size': 6})
@@ -1021,86 +1070,110 @@ async def __main__():
                 
                 ########################## BTE solve ##################################################
                 await boltzmann.fetch(interface, use_interp=bte_use_interp)
-                tt_bte       = 0
-                bte_u        = [0 for i in range(n_grids)]
-                bte_v        = [0 for i in range(n_grids)]
-                
-                u_avg        = [0 for i in range(n_grids)]
                 
-                abs_error    = [0 for i in range(n_grids)]
-                rel_error    = [0 for i in range(n_grids)]
-                cycle_f1     = (0.5 * dt_bte/ (bte_sper_cycle * dt_bte))
-                
-                for bte_idx in range(bte_sper_cycle * bte_max_cycles +1):
+                if (boltzmann.param.solver_type=="steady-state"):
+                    await boltzmann.solve()
+                    ts = TaskSpace("T")
+                    for grid_idx in boltzmann.active_grid_idx:
+                        @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+                        def t1():
+                            boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", boltzmann.ff[grid_idx])
+                    await ts
                     
-                    if (bte_idx % bte_sper_cycle == 0):
-                        
+                    await boltzmann.push(interface, use_interp=bte_use_interp)
+                    for grid_idx in boltzmann.active_grid_idx:
+                        u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                        boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
+                    
+                else:
+                    assert boltzmann.param.solver_type == "transient", "unknown BTE solver type"
+                    """
+                    transient BTE solver (evolve until time-periodic solutions)
+                    """
+                    
+                    tt_bte       = 0
+                    bte_u        = [0 for i in range(n_grids)]
+                    bte_v        = [0 for i in range(n_grids)]
+                    
+                    u_avg        = [0 for i in range(n_grids)]
+                    
+                    abs_error    = [0 for i in range(n_grids)]
+                    rel_error    = [0 for i in range(n_grids)]
+                    cycle_f1     = (0.5 * dt_bte/ (bte_sper_cycle * dt_bte))
+                    
+                    for bte_idx in range(bte_sper_cycle * bte_max_cycles +1):
+                        if (bte_idx % bte_sper_cycle == 0):
+                            ts = TaskSpace("T")
+                            for grid_idx in boltzmann.active_grid_idx:
+                                @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+                                def t1():
+                                    u0      = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
+                                    fname   = "%s_iter%04d_grid_%04d_cycle_%0d"%(boltzmann.param.out_fname, iter, grid_idx, bte_idx//bte_sper_cycle)
+                                    
+                                    #boltzmann.io_output_data(grid_idx, u0, False, True, fname)
+                                    
+                                    abs_error[grid_idx] = xp.max(xp.abs(bte_v[grid_idx]-u0))
+                                    rel_error[grid_idx] = abs_error[grid_idx] / xp.max(xp.abs(u0))
+                                    #print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E"%(bte_idx, tt_bte, abs_error[grid_idx], rel_error[grid_idx]))
+                                    
+                                    # if(bte_idx >0):
+                                    #     print(grid_idx, " u_ptr ", u_avg[grid_idx].data, " v_ptr " , v_avg[grid_idx].data)
+                                    
+                                    bte_v[grid_idx] = xp.copy(u0)
+                                    
+                            await ts
+                            p_t3 = min_mean_max(p_t2-p_t1, comm)
+                            print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E --- runtime = %.4E (s) "%(bte_idx, tt_bte, max(abs_error), max(rel_error), p_t3[2]))
+                            
+                            if max(abs_error) < boltzmann.param.atol or max(rel_error)< boltzmann.param.rtol:
+                                break
+                            
+                            if bte_idx < bte_sper_cycle * bte_max_cycles:
+                                u_avg  = [0 for i in range(n_grids)]
+                                
+                        if bte_idx == bte_sper_cycle * bte_max_cycles :
+                            break    
+                            
                         ts = TaskSpace("T")
                         for grid_idx in boltzmann.active_grid_idx:
                             @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
                             def t1():
-                                u0      = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
-                                fname   = "%s_iter%04d_grid_%04d_cycle_%0d"%(boltzmann.param.out_fname, iter, grid_idx, bte_idx//bte_sper_cycle)
-                                
-                                #boltzmann.io_output_data(grid_idx, u0, False, True, fname)
-                                
-                                abs_error[grid_idx] = xp.max(xp.abs(bte_v[grid_idx]-u0))
-                                rel_error[grid_idx] = abs_error[grid_idx] / xp.max(xp.abs(u0))
-                                #print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E"%(bte_idx, tt_bte, abs_error[grid_idx], rel_error[grid_idx]))
-                                
-                                # if(bte_idx >0):
-                                #     print(grid_idx, " u_ptr ", u_avg[grid_idx].data, " v_ptr " , v_avg[grid_idx].data)
-                                
-                                bte_v[grid_idx] = xp.copy(u0)
-                                
+                                u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
                         await ts
-                        p_t3 = min_mean_max(p_t2-p_t1, comm)
-                        print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E --- runtime = %.4E (s) "%(bte_idx, tt_bte, max(abs_error), max(rel_error), p_t3[2]))
                         
-                        if max(abs_error) < boltzmann.param.atol or max(rel_error)< boltzmann.param.rtol:
-                            break
+                        p_t1 = time()
+                        await boltzmann.solve_step(tt_bte, dt_bte)
+                        p_t2 = time()
                         
-                        if bte_idx < bte_sper_cycle * bte_max_cycles:
-                            u_avg  = [0 for i in range(n_grids)]
-                            
-                            
-                    if bte_idx == bte_sper_cycle * bte_max_cycles :
-                        break    
+                        if(terminal_output_freq > 0 and bte_idx % terminal_output_freq ==0):
+                            p_t3 = min_mean_max(p_t2-p_t1, comm)
+                            print("[BTE] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (bte_idx, tt_bte, p_t3[0], p_t3[1], p_t3[2]))
                         
-                    ts = TaskSpace("T")
-                    for grid_idx in boltzmann.active_grid_idx:
-                        @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
-                        def t1():
-                            u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
-                    await ts
-                    
-                    p_t1 = time()
-                    await boltzmann.solve_step(tt_bte, dt_bte)
-                    p_t2 = time()
+                        ts = TaskSpace("T")
+                        for grid_idx in boltzmann.active_grid_idx:
+                            @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
+                            def t1():
+                                u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1")
+                                boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u0", boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1"))
+                        await ts
                     
-                    if(terminal_output_freq > 0 and bte_idx % terminal_output_freq ==0):
-                        p_t3 = min_mean_max(p_t2-p_t1, comm)
-                        print("[BTE] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (bte_idx, tt_bte, p_t3[0], p_t3[1], p_t3[2]))
+                        tt_bte += dt_bte
                     
                     ts = TaskSpace("T")
                     for grid_idx in boltzmann.active_grid_idx:
                         @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
                         def t1():
-                            u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1")
-                            boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u0", boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1"))
+                            xp               = boltzmann.xp_module
+                            qA               = boltzmann.bte_solver._op_diag_dg[grid_idx]
+                            u_avg[grid_idx]  = xp.dot(qA, u_avg[grid_idx])
+                            boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", u_avg[grid_idx])
                     await ts
-                
-                    tt_bte += dt_bte
-                
-                ts = TaskSpace("T")
-                for grid_idx in boltzmann.active_grid_idx:
-                    @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
-                    def t1():
-                        boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", u_avg[grid_idx])
-                await ts
-                await boltzmann.push(interface, use_interp=bte_use_interp)
-                
-                
+                    await boltzmann.push(interface, use_interp=bte_use_interp)
+                    
+                    for grid_idx in boltzmann.active_grid_idx:
+                        u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                        boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
+                    
                 ################### tps solve ######################################
                 tps.fetch(interface)
                 tps_u  = 0

From 170af58093a97e1850c2f636736fbe86fc4062f7 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Fri, 1 Mar 2024 12:57:17 -0600
Subject: [PATCH 45/75] adding electron-electron (e-e) collisions for the torch
 interface with lumped cross-section (cs) data

---
 src/tps-bte_0d3v.py | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 3cfbfdc53..c9aefb338 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -69,6 +69,7 @@ def min_mean_max(a, comm: MPI.Comm):
 sys.path.append(path + "/../../boltzmann/BESolver/python")
 import libtps
 from   bte_0d3v_batched import bte_0d3v_batched as BoltzmannSolver
+import utils as bte_utils
 
 WITH_PARLA = 1
 if WITH_PARLA:
@@ -464,7 +465,9 @@ async def fetch(self, interface, use_interp:bool):
         Ex                 = ExbyN * self.param.n0 * self.param.Td_fac
         Ey                 = EybyN * self.param.n0 * self.param.Td_fac
         
-        ion_deg            = species_densities[TPSINDEX.ELE_IDX]/n0
+        ion_deg              = species_densities[TPSINDEX.ELE_IDX]/n0
+        ion_deg[ion_deg<0]   = 1e-16
+        ns_by_n0[ns_by_n0<0] = 0
         m_bte              = np.concatenate([ExbyN.reshape((-1, 1)), EybyN.reshape((-1, 1)), Tg.reshape((-1, 1)), ion_deg.reshape((-1, 1))] + [ ns_by_n0[i].reshape((-1, 1)) for i in range(ns_by_n0.shape[0])], axis=1)
         
         self.sub_cluster_idx_to_pidx = None
@@ -668,7 +671,14 @@ def ts_op_setup(grid_idx):
                 @spawn(ts[grid_idx], placement=[gpu(dev_id)], vcus=0.0)
                 def t1():
                     ts_op_setup(grid_idx)
-                    f_mw = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                    
+                    vth          = self.bte_solver._par_vth[grid_idx]
+                    qA           = self.bte_solver._op_diag_dg[grid_idx]
+                    mw           = bte_utils.get_maxwellian_3d(vth, 1)
+                    mm_op        = self.bte_solver._op_mass[grid_idx] * mw(0) * vth**3
+                    f_mw         = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                    f_mw         = f_mw/cp.dot(mm_op, f_mw)
+                    f_mw         = cp.dot(qA.T, f_mw)
                     
                     if (use_interp==True):
                         self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", cp.copy(f_mw[: , 0:self.param.n_sub_clusters]))
@@ -684,7 +694,15 @@ def t1():
                 @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
                 def t1():
                     ts_op_setup(grid_idx)
-                    f_mw = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                    
+                    vth          = self.bte_solver._par_vth[grid_idx]
+                    qA           = self.bte_solver._op_diag_dg[grid_idx]
+                    mw           = bte_utils.get_maxwellian_3d(vth, 1)
+                    mm_op        = self.bte_solver._op_mass[grid_idx] * mw(0) * vth**3
+                    f_mw         = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                    f_mw         = f_mw/np.dot(mm_op, f_mw)
+                    f_mw         = np.dot(qA.T, f_mw)
+                    
                     if (use_interp==True):
                         self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", np.copy(f_mw[: , 0:self.param.n_sub_clusters]))
                     else:
@@ -1045,7 +1063,7 @@ async def __main__():
             tt        = 0#interface.currentTime()
             tau       = (1/boltzmann.param.Efreq)
             dt_tps    = interface.timeStep()
-            dt_bte    = 1e-2 * tau #boltzmann.param.dt * (dt_tps)
+            dt_bte    = boltzmann.param.dt * tau 
             bte_steps = int(dt_tps/dt_bte)
             n_grids   = boltzmann.param.n_grids
             
@@ -1055,8 +1073,8 @@ async def __main__():
             
             tps_sper_cycle = int(xp.ceil(tau/dt_tps))
             bte_sper_cycle = int(xp.ceil(tau/dt_bte))
-            bte_max_cycles = 10
-            tps_max_cycles = 1000
+            bte_max_cycles = int(boltzmann.param.cycles)
+            tps_max_cycles = 500
             
             print("tps steps per cycle : ", tps_sper_cycle, "bte_steps per cycle", bte_sper_cycle)
             tps.solveStep()
@@ -1199,8 +1217,8 @@ def t1():
                         
                         p_t3 = min_mean_max(p_t2-p_t1, comm)
                         print("[TPS] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E -- runtime = %.4E (s)"%(tps_idx, tt_tps, np.max(abs_error), np.max(rel_error), p_t3[2]))
-                        if (np.max(abs_error) < boltzmann.param.atol or np.max(rel_error) < max(1e-6,boltzmann.param.rtol)):
-                            break
+                        # if (np.max(abs_error) < boltzmann.param.atol or np.max(rel_error) < max(1e-6,boltzmann.param.rtol)):
+                        #     break
                     
                     if (tps_idx == tps_sper_cycle * tps_max_cycles):
                         break

From d5182583756456032344b01e088529c0e3beadb1 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Fri, 1 Mar 2024 20:29:03 -0600
Subject: [PATCH 46/75] minor changes: stop clamping negative rates to zero,
 lower tps_max_cycles from 500 to 100

---
 src/tps-bte_0d3v.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index c9aefb338..17996ea1d 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -909,7 +909,7 @@ def t1():
                         
                 await ts
                 rates = rates.reshape((-1))
-                rates[rates<0] = 0.0
+                #rates[rates<0] = 0.0
         else:
             if(n_reactions>0):
                 rates[:,:] = 0.0
@@ -928,7 +928,7 @@ def t1():
                         
                 await ts
                 rates = rates.reshape((-1))
-                rates[rates<0] = 0.0
+                #rates[rates<0] = 0.0
         return 
     
     def io_output_data(self, grid_idx, u0, plot_data:bool, export_csv:bool, fname:str):
@@ -1074,7 +1074,7 @@ async def __main__():
             tps_sper_cycle = int(xp.ceil(tau/dt_tps))
             bte_sper_cycle = int(xp.ceil(tau/dt_bte))
             bte_max_cycles = int(boltzmann.param.cycles)
-            tps_max_cycles = 500
+            tps_max_cycles = 100
             
             print("tps steps per cycle : ", tps_sper_cycle, "bte_steps per cycle", bte_sper_cycle)
             tps.solveStep()

From 4d6b6efa94ac147d1be2f35aaaa277a32f8a6d36 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Mon, 4 Mar 2024 09:46:01 -0600
Subject: [PATCH 47/75] negative rate coefficients are set to zero on the BTE
 side

---
 src/tps-bte_0d3v.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 17996ea1d..6a07571fe 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -909,7 +909,7 @@ def t1():
                         
                 await ts
                 rates = rates.reshape((-1))
-                #rates[rates<0] = 0.0
+                rates[rates<0] = 0.0
         else:
             if(n_reactions>0):
                 rates[:,:] = 0.0
@@ -928,7 +928,7 @@ def t1():
                         
                 await ts
                 rates = rates.reshape((-1))
-                #rates[rates<0] = 0.0
+                rates[rates<0] = 0.0
         return 
     
     def io_output_data(self, grid_idx, u0, plot_data:bool, export_csv:bool, fname:str):

From 09b3e2fb6bee526f05751cdbf0314e0f1ecee9f7 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Tue, 12 Mar 2024 08:33:37 -0500
Subject: [PATCH 48/75] io crash from multi gpu case is fixed

---
 src/tps-bte_0d3v.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 6a07571fe..1300d4d69 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -1099,9 +1099,13 @@ def t1():
                     await ts
                     
                     await boltzmann.push(interface, use_interp=bte_use_interp)
+                    
                     for grid_idx in boltzmann.active_grid_idx:
-                        u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                        boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
+                        dev_id  = gidx_to_device_map(grid_idx,n_grids)
+                        with cp.cuda.Device(dev_id):
+                            u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                            boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
+                    
                     
                 else:
                     assert boltzmann.param.solver_type == "transient", "unknown BTE solver type"
@@ -1128,8 +1132,6 @@ def t1():
                                     u0      = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
                                     fname   = "%s_iter%04d_grid_%04d_cycle_%0d"%(boltzmann.param.out_fname, iter, grid_idx, bte_idx//bte_sper_cycle)
                                     
-                                    #boltzmann.io_output_data(grid_idx, u0, False, True, fname)
-                                    
                                     abs_error[grid_idx] = xp.max(xp.abs(bte_v[grid_idx]-u0))
                                     rel_error[grid_idx] = abs_error[grid_idx] / xp.max(xp.abs(u0))
                                     #print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E"%(bte_idx, tt_bte, abs_error[grid_idx], rel_error[grid_idx]))
@@ -1189,8 +1191,11 @@ def t1():
                     await boltzmann.push(interface, use_interp=bte_use_interp)
                     
                     for grid_idx in boltzmann.active_grid_idx:
-                        u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                        boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
+                        dev_id  = gidx_to_device_map(grid_idx,n_grids)
+                        with cp.cuda.Device(dev_id):
+                            u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                            boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
+                    
                     
                 ################### tps solve ######################################
                 tps.fetch(interface)

From 3ef9e4639eef76e5f2a20e6b81253f5749d0fa9a Mon Sep 17 00:00:00 2001
From: milindasf <milinda@oden.utexas.edu>
Date: Wed, 13 Mar 2024 11:51:41 -0500
Subject: [PATCH 49/75] additional parameters + profile counters for tps-bte
 profiling

---
 src/tps-bte_0d3v.py | 263 ++++++++++++++++++++++----------------------
 1 file changed, 133 insertions(+), 130 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 1300d4d69..e165d5f17 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -102,6 +102,9 @@ class BoltzmannSolverParams():
     atol          = 1e-10       # absolute tolerance
     rtol          = 1e-10       # relative tolerance
     max_iter      = 1000        # max iterations for the newton solver
+    
+    tps_bte_max_iter = 5000     # max iterations for tps-bte split scheme
+    bte_solve_freq   = 100      # run bte every x tps cycles. 
 
     ee_collisions = 0           # enable electron-electron Coulombic effects
     use_gpu       = 1           # enable GPU use (1)-GPU solver, (0)-CPU solver
@@ -160,9 +163,6 @@ class TPSINDEX():
     
     MOLE_FRAC_IDX = {0: NEU_IDX, 1: EX1_IDX , 2: EX2_IDX , 3: EX3_IDX} 
     
-    
-                        
-    
 class Boltzmann0D2VBactchedSolver:
     
     def __init__(self, tps, comm):
@@ -179,14 +179,6 @@ def __init__(self, tps, comm):
            os.makedirs(boltzmann_dir)
            #print("directory %s is created!"%(dir_name))
            
-        profile_tt  = [None] * int(pp.LAST)
-        profile_nn  = ["setup", "solve", "last"]
-        for i in range(pp.LAST):
-            profile_tt[i] = profile_t(profile_nn[i])
-        
-        self.profile_tt = profile_tt
-        self.profile_nn = profile_nn
-        
         num_gpus_per_node = 1 
         if self.param.use_gpu==1:
             num_gpus_per_node = cp.cuda.runtime.getDeviceCount()
@@ -230,6 +222,10 @@ def __parse_config_file__(self, fname):
         self.param.threads          = int(config.get("boltzmannSolver", "threads").split("#")[0].strip())
         self.param.output_dir       = str(config.get("boltzmannSolver", "output_dir").split("#")[0].strip())
         self.param.out_fname        = self.param.output_dir + "/" + str(config.get("boltzmannSolver", "output_fname").split("#")[0].strip())
+        
+        self.param.bte_solve_freq   = int(config.get("boltzmannSolver", "bte_solve_freq").split("#")[0].strip())
+        self.param.tps_bte_max_iter = int(config.get("boltzmannSolver", "tps_bte_max_iter").split("#")[0].strip())
+        
         return 
     
     def grid_setup(self, interface):
@@ -240,8 +236,6 @@ def grid_setup(self, interface):
         computed from the TPS code. 
         """
         assert self.xp_module==np, "grid setup only supported in CPU"
-        self.profile_tt[pp.SETUP].start()
-        
         xp            = self.xp_module
         n_grids       = self.param.n_grids
         Te            = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
@@ -301,7 +295,6 @@ def grid_setup(self, interface):
         # self.active_grid_idx  = active_grid_idx #[i for i in range(self.param.n_grids)]
         self.active_grid_idx  = [i for i in range(self.param.n_grids)]
         self.sub_clusters_run = False
-        self.profile_tt[pp.SETUP].stop()
         return
 
     def solve_wo_parla(self):
@@ -466,9 +459,9 @@ async def fetch(self, interface, use_interp:bool):
         Ey                 = EybyN * self.param.n0 * self.param.Td_fac
         
         ion_deg              = species_densities[TPSINDEX.ELE_IDX]/n0
-        ion_deg[ion_deg<0]   = 1e-16
-        ns_by_n0[ns_by_n0<0] = 0
-        m_bte              = np.concatenate([ExbyN.reshape((-1, 1)), EybyN.reshape((-1, 1)), Tg.reshape((-1, 1)), ion_deg.reshape((-1, 1))] + [ ns_by_n0[i].reshape((-1, 1)) for i in range(ns_by_n0.shape[0])], axis=1)
+        ion_deg[ion_deg<=0]  = 1e-16
+        ns_by_n0[ns_by_n0<=0]= 0
+        m_bte                = np.concatenate([ExbyN.reshape((-1, 1)), EybyN.reshape((-1, 1)), Tg.reshape((-1, 1)), ion_deg.reshape((-1, 1))] + [ ns_by_n0[i].reshape((-1, 1)) for i in range(ns_by_n0.shape[0])], axis=1)
         
         self.sub_cluster_idx_to_pidx = None
         self.sub_cluster_c           = None          
@@ -571,6 +564,16 @@ def t1():
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", Ey)
                     self.bte_solver.set_boltzmann_parameter(grid_idx,  "E" , EMag)
                     
+                    # cp.save(self.param.out_fname + "_ns_by_n0_%02d.npy"%(grid_idx) , ns_by_n0 , grid_idx)
+                    # cp.save(self.param.out_fname + "_n0_%02d.npy"%(grid_idx)       , n0       , grid_idx)
+                    # cp.save(self.param.out_fname + "_ne_%02d.npy"%(grid_idx)       , ne       , grid_idx)
+                    # cp.save(self.param.out_fname + "_ni_%02d.npy"%(grid_idx)       , ni       , grid_idx)
+                    
+                    # cp.save(self.param.out_fname + "_Tg_%02d.npy"%(grid_idx)  , Tg    , grid_idx)
+                    # cp.save(self.param.out_fname + "_eRe_%02d.npy"%(grid_idx) , Ex    , grid_idx)
+                    # cp.save(self.param.out_fname + "_eIm_%02d.npy"%(grid_idx) , Ey    , grid_idx)
+                    # cp.save(self.param.out_fname + "_E_%02d.npy"%(grid_idx)   , EMag  , grid_idx)
+                    
                     return
                 
             await ts
@@ -629,6 +632,17 @@ def t1():
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", Ex)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", Ey)
                     self.bte_solver.set_boltzmann_parameter(grid_idx,  "E" , EMag)
+                    
+                    # cp.save(self.param.out_fname + "_ns_by_n0_%02d.npy"%(grid_idx) , ns_by_n0 , grid_idx)
+                    # cp.save(self.param.out_fname + "_n0_%02d.npy"%(grid_idx)       , n0       , grid_idx)
+                    # cp.save(self.param.out_fname + "_ne_%02d.npy"%(grid_idx)       , ne       , grid_idx)
+                    # cp.save(self.param.out_fname + "_ni_%02d.npy"%(grid_idx)       , ni       , grid_idx)
+                    
+                    # cp.save(self.param.out_fname + "_Tg_%02d.npy"%(grid_idx)  , Tg    , grid_idx)
+                    # cp.save(self.param.out_fname + "_eRe_%02d.npy"%(grid_idx) , Ex    , grid_idx)
+                    # cp.save(self.param.out_fname + "_eIm_%02d.npy"%(grid_idx) , Ey    , grid_idx)
+                    # cp.save(self.param.out_fname + "_E_%02d.npy"%(grid_idx)   , EMag  , grid_idx)
+                    
                     return
             await ts
         return        
@@ -771,7 +785,6 @@ async def solve(self):
         if csv_write: 
             data_csv = np.empty((self.tps_npts, 8 + len(coll_list)))
             
-        self.profile_tt[pp.SOLVE].start()
         ts = TaskSpace("T")
         for grid_idx in self.active_grid_idx:
             @spawn(ts[grid_idx], placement=[parla_placement[grid_idx]], vcus=0.0)
@@ -787,92 +800,6 @@ def t1():
                     sys.exit(-1)
                     
         await ts
-        self.profile_tt[pp.SOLVE].stop()
-        
-        
-        t1 = min_mean_max(self.profile_tt[pp.SETUP].seconds, self.comm)
-        t2 = min_mean_max(self.profile_tt[pp.SOLVE].seconds, self.comm)
-        print("[Boltzmann] setup (min) = %.4E (s) setup (mean) = %.4E (s) setup (max) = %.4E (s)" % (t1[0],t1[1],t1[2]))
-        print("[Boltzmann] solve (min) = %.4E (s) solve (mean) = %.4E (s) solve (max) = %.4E (s)" % (t2[0],t2[1],t2[2]))
-        
-        """
-        if (self.param.export_csv ==1 or self.param.plot_data==1):
-            for grid_idx in range(n_grids):
-                dev_id   = gidx_to_device_map(grid_idx, n_grids)
-                ff       = self.ff[grid_idx]
-                qoi      = self.qoi[grid_idx]
-                
-                def asnumpy(a):
-                    if cp.get_array_module(a)==cp:
-                        with cp.cuda.Device(dev_id):
-                            return cp.asnumpy(a)
-                    else:
-                        return a
-                
-                ff_cpu   = asnumpy(ff)
-                ev       = np.linspace(1e-3, self.bte_solver._par_ev_range[grid_idx][1], 500)
-                ff_r     = self.bte_solver.compute_radial_components(grid_idx, ev, ff_cpu)
-                
-                n0    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "n0"))
-                ne    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ne"))
-                ni    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "ni"))
-                Tg    = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "Tg"))
-                eRe   = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe"))
-                eIm   = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm"))
-                eMag  = np.sqrt(eRe**2 + eIm**2)
-                
-                if csv_write:
-                    data_csv[gidx_to_pidx_map[grid_idx], 0]    = n0
-                    data_csv[gidx_to_pidx_map[grid_idx], 1]    = ne
-                    data_csv[gidx_to_pidx_map[grid_idx], 2]    = ni
-                    data_csv[gidx_to_pidx_map[grid_idx], 3]    = Tg
-                    data_csv[gidx_to_pidx_map[grid_idx], 4]    = eMag
-                    data_csv[gidx_to_pidx_map[grid_idx], 5]    = asnumpy(qoi["energy"])
-                    data_csv[gidx_to_pidx_map[grid_idx], 6]    = asnumpy(qoi["mobility"])
-                    data_csv[gidx_to_pidx_map[grid_idx], 7]    = asnumpy(qoi["diffusion"])
-                    
-                    for col_idx, g in enumerate(coll_list):
-                        data_csv[gidx_to_pidx_map[grid_idx], 8 + col_idx]    = asnumpy(qoi["rates"][col_idx])
-
-                if plot_data:
-                    num_sh       = len(self.bte_solver._par_lm[grid_idx])
-                    num_subplots = num_sh 
-                    num_plt_cols = min(num_sh, 4)
-                    num_plt_rows = np.int64(np.ceil(num_subplots/num_plt_cols))
-                    fig          = plt.figure(figsize=(num_plt_cols * 8 + 0.5*(num_plt_cols-1), num_plt_rows * 8 + 0.5*(num_plt_rows-1)), dpi=300, constrained_layout=True)
-                    plt_idx      =  1
-                    n_pts_step   =  self.grid_idx_to_npts[grid_idx] // 20
-
-                    for lm_idx, lm in enumerate(self.bte_solver._par_lm[grid_idx]):
-                        plt.subplot(num_plt_rows, num_plt_cols, plt_idx)
-                        for ii in range(0, self.grid_idx_to_npts[grid_idx], n_pts_step):
-                            fr = np.abs(ff_r[ii, lm_idx, :])
-                            plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td], $n_e/n_0$ = %.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]))
-                        
-                        plt.xlabel(r"energy (eV)")
-                        plt.ylabel(r"$f_%d$"%(lm[0]))
-                        plt.grid(visible=True)
-                        if lm_idx==0:
-                            plt.legend(prop={'size': 6})
-                            
-                        plt_idx +=1
-                    
-                    plt.savefig("%s_plot_%02d.png"%(self.param.out_fname, grid_idx))
-                    plt.close()
-            
-            if csv_write:
-                fname    = self.param.out_fname
-                with open("%s_qoi.csv"%fname, 'w', encoding='UTF8') as f:
-                    writer = csv.writer(f,delimiter=',')
-                    # write the header
-                    header = ["n0", "ne", "ni", "Tg", "E",  "energy", "mobility", "diffusion"]
-                    for col_idx, g in enumerate(coll_list):
-                        header.append(str(coll_names[col_idx]))
-                    
-                    writer.writerow(header)
-                    writer.writerows(data_csv)
-        """
-
         return
     
     async def push(self, interface, use_interp:bool):
@@ -1028,19 +955,78 @@ def asnumpy(a):
             plt.close()
             
         return        
+
+class pp(enum.IntEnum):   # indices of the profile timers; used to index profile_tt
+    BTE_SETUP     = 0   # timed around boltzmann.grid_setup(interface)
+    BTE_FETCH     = 1   # timed around boltzmann.fetch(...)
+    BTE_SOLVE     = 2   # timed around boltzmann.solve() / boltzmann.solve_step(...)
+    BTE_PUSH      = 3   # timed around the u_avg update tasks plus boltzmann.push(...)
+    TPS_SETUP     = 4   # timed around libtps.Tps construction through tps.initialize()
+    TPS_FETCH     = 5   # timed around tps.fetch(interface)
+    TPS_SOLVE     = 6   # timed around each tps.solveStep() inside the TPS loop
+    TPS_PUSH      = 7   # timed around the tps.push(interface) that follows the TPS loop
+    LAST          = 8   # number of timers, not a timer itself
+    
+profile_nn     = ["bte_setup", "bte_fetch", "bte_solve", "bte_push", "tps_setup", "tps_fetch", "tps_solve", "tps_push", "last"]   # timer names in pp order; the trailing "last" has no timer
+profile_tt     = [profile_t(profile_nn[i]) for i in range(int(pp.LAST))]   # one profile_t per pp member below LAST, indexed as profile_tt[pp.X]
+
+def profile_stats(boltzmann:Boltzmann0D2VBactchedSolver, p_tt: profile_t, p_nn, fname, comm):
+    """Report seconds/iter of every timer in p_tt as (min, mean, max) over comm: one CSV header row and one data row.
+    Collective over comm; rank 0 appends the rows to fname, or prints them when fname is "". p_nn is accepted but unused."""
+    Nx = boltzmann.param.n_grids * boltzmann.param.n_sub_clusters
+    Nv = (boltzmann.param.Nr + 1) * (boltzmann.param.l_max + 1)
+    # Time per iteration of each timer, reduced to (min, mean, max) over all ranks. max(iter, 1) keeps a
+    # timer that never ran (iter == 0; presumably iter counts start/stop pairs) from raising ZeroDivisionError.
+    tt = [min_mean_max(t.seconds / max(t.iter, 1), comm) for t in p_tt]
         
-            
+    header = [  "Nv",
+                "Nx",
+                "bte_setup_min", "bte_setup_mean", "bte_setup_max",
+                "bte_fetch_min", "bte_fetch_mean", "bte_fetch_max",
+                "bte_solve_min", "bte_solve_mean", "bte_solve_max",
+                "bte_push_min" , "bte_push_mean" , "bte_push_max",
+                
+                "tps_setup_min", "tps_setup_mean", "tps_setup_max",
+                "tps_fetch_min", "tps_fetch_mean", "tps_fetch_max",
+                "tps_solve_min", "tps_solve_mean", "tps_solve_max",
+                "tps_push_min" , "tps_push_mean" , "tps_push_max"]
+    
+    data   = [  Nv,
+                Nx,
+                tt[pp.BTE_SETUP][0], tt[pp.BTE_SETUP][1], tt[pp.BTE_SETUP][2], 
+                tt[pp.BTE_FETCH][0], tt[pp.BTE_FETCH][1], tt[pp.BTE_FETCH][2], 
+                tt[pp.BTE_SOLVE][0], tt[pp.BTE_SOLVE][1], tt[pp.BTE_SOLVE][2], 
+                tt[pp.BTE_PUSH][0] , tt[pp.BTE_PUSH][1] , tt[pp.BTE_PUSH][2] ,
+                tt[pp.TPS_SETUP][0], tt[pp.TPS_SETUP][1], tt[pp.TPS_SETUP][2], 
+                tt[pp.TPS_FETCH][0], tt[pp.TPS_FETCH][1], tt[pp.TPS_FETCH][2], 
+                tt[pp.TPS_SOLVE][0], tt[pp.TPS_SOLVE][1], tt[pp.TPS_SOLVE][2], 
+                tt[pp.TPS_PUSH][0] , tt[pp.TPS_PUSH][1] , tt[pp.TPS_PUSH][2] ]
+    data_str= ["%.4E"%d for d in data]
+    if comm.Get_rank() != 0:  # reductions are done on every rank; report once instead of once per rank
+        return
+    if fname!="":
+        with open(fname, "a") as f:  # the with-block closes f, so no explicit close() is needed
+            f.write(",".join(header)+"\n")
+            f.write(",".join(data_str)+"\n")
+    else:
+        print(",".join(header))
+        print(",".join(data_str))
+                
 if __name__=="__main__":
     comm = MPI.COMM_WORLD
     
     with Parla():
         # TPS solver
+        profile_tt[pp.TPS_SETUP].start()
+        
         tps = libtps.Tps(comm)
         tps.parseCommandLineArgs(sys.argv)
         tps.parseInput()
         tps.chooseDevices()
         tps.chooseSolver()
         tps.initialize()
+        
+        profile_tt[pp.TPS_SETUP].stop()
 
         boltzmann = Boltzmann0D2VBactchedSolver(tps, comm)
         interface = libtps.Tps2Boltzmann(tps)
@@ -1049,7 +1035,10 @@ def asnumpy(a):
         #coords = np.array(interface.HostReadSpatialCoordinates(), copy=False)
         tps.solveBegin()
         tps.push(interface)
+        
+        profile_tt[pp.BTE_SETUP].start()
         boltzmann.grid_setup(interface)
+        profile_tt[pp.BTE_SETUP].stop()
         
         @spawn(placement=cpu, vcus=0)
         async def __main__():
@@ -1058,7 +1047,7 @@ async def __main__():
             await boltzmann.solve_init(bte_use_interp)
             xp = boltzmann.bte_solver.xp_module
 
-            max_iters = tps.getRequiredInput("cycle-avg-joule-coupled/max-iters")
+            max_iters = boltzmann.param.tps_bte_max_iter
             iter      = 0
             tt        = 0#interface.currentTime()
             tau       = (1/boltzmann.param.Efreq)
@@ -1074,7 +1063,7 @@ async def __main__():
             tps_sper_cycle = int(xp.ceil(tau/dt_tps))
             bte_sper_cycle = int(xp.ceil(tau/dt_bte))
             bte_max_cycles = int(boltzmann.param.cycles)
-            tps_max_cycles = 100
+            tps_max_cycles = boltzmann.param.bte_solve_freq
             
             print("tps steps per cycle : ", tps_sper_cycle, "bte_steps per cycle", bte_sper_cycle)
             tps.solveStep()
@@ -1087,26 +1076,32 @@ async def __main__():
                     interface.saveDataCollection(cycle=(iter//cycle_freq), time=iter)
                 
                 ########################## BTE solve ##################################################
+                profile_tt[pp.BTE_FETCH].start()
                 await boltzmann.fetch(interface, use_interp=bte_use_interp)
+                profile_tt[pp.BTE_FETCH].stop()
                 
                 if (boltzmann.param.solver_type=="steady-state"):
+                    profile_tt[pp.BTE_SOLVE].start()
                     await boltzmann.solve()
+                    profile_tt[pp.BTE_SOLVE].stop()
+                    
+                    
+                    profile_tt[pp.BTE_PUSH].start()
                     ts = TaskSpace("T")
                     for grid_idx in boltzmann.active_grid_idx:
                         @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
                         def t1():
                             boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", boltzmann.ff[grid_idx])
                     await ts
-                    
                     await boltzmann.push(interface, use_interp=bte_use_interp)
+                    profile_tt[pp.BTE_PUSH].stop()
                     
-                    for grid_idx in boltzmann.active_grid_idx:
-                        dev_id  = gidx_to_device_map(grid_idx,n_grids)
-                        with cp.cuda.Device(dev_id):
-                            u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                            boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
-                    
-                    
+                    if boltzmann.param.export_csv ==1:
+                        for grid_idx in boltzmann.active_grid_idx:
+                            dev_id  = gidx_to_device_map(grid_idx,n_grids)
+                            with cp.cuda.Device(dev_id):
+                                u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                                boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
                 else:
                     assert boltzmann.param.solver_type == "transient", "unknown BTE solver type"
                     """
@@ -1142,7 +1137,7 @@ def t1():
                                     bte_v[grid_idx] = xp.copy(u0)
                                     
                             await ts
-                            p_t3 = min_mean_max(p_t2-p_t1, comm)
+                            p_t3 = min_mean_max(profile_tt[pp.BTE_SOLVE].snap, comm)
                             print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E --- runtime = %.4E (s) "%(bte_idx, tt_bte, max(abs_error), max(rel_error), p_t3[2]))
                             
                             if max(abs_error) < boltzmann.param.atol or max(rel_error)< boltzmann.param.rtol:
@@ -1161,12 +1156,12 @@ def t1():
                                 u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
                         await ts
                         
-                        p_t1 = time()
+                        profile_tt[pp.BTE_SOLVE].start()
                         await boltzmann.solve_step(tt_bte, dt_bte)
-                        p_t2 = time()
+                        profile_tt[pp.BTE_SOLVE].stop()
                         
                         if(terminal_output_freq > 0 and bte_idx % terminal_output_freq ==0):
-                            p_t3 = min_mean_max(p_t2-p_t1, comm)
+                            p_t3 = min_mean_max(profile_tt[pp.BTE_SOLVE].snap, comm)
                             print("[BTE] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (bte_idx, tt_bte, p_t3[0], p_t3[1], p_t3[2]))
                         
                         ts = TaskSpace("T")
@@ -1179,6 +1174,7 @@ def t1():
                     
                         tt_bte += dt_bte
                     
+                    profile_tt[pp.BTE_PUSH].start()
                     ts = TaskSpace("T")
                     for grid_idx in boltzmann.active_grid_idx:
                         @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
@@ -1189,16 +1185,22 @@ def t1():
                             boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", u_avg[grid_idx])
                     await ts
                     await boltzmann.push(interface, use_interp=bte_use_interp)
+                    profile_tt[pp.BTE_PUSH].stop()
                     
-                    for grid_idx in boltzmann.active_grid_idx:
-                        dev_id  = gidx_to_device_map(grid_idx,n_grids)
-                        with cp.cuda.Device(dev_id):
-                            u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                            boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
+                    if boltzmann.param.export_csv ==1:
+                        for grid_idx in boltzmann.active_grid_idx:
+                            dev_id  = gidx_to_device_map(grid_idx,n_grids)
+                            with cp.cuda.Device(dev_id):
+                                u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                                boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
+                
                     
                     
                 ################### tps solve ######################################
+                profile_tt[pp.TPS_FETCH].start()
                 tps.fetch(interface)
+                profile_tt[pp.TPS_FETCH].stop()
+                
                 tps_u  = 0
                 tps_v  = 0
                 tt_tps = 0
@@ -1207,7 +1209,6 @@ def t1():
                 p_t2   = 0
                 for tps_idx in range(tps_sper_cycle * tps_max_cycles + 1):
                     if (tps_idx % tps_sper_cycle == 0):
-                        
                         tps.push(interface)
                         nspecies            = interface.Nspecies()
                         heavy_temp          = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
@@ -1220,7 +1221,7 @@ def t1():
                         rel_error           = abs_error / np.linalg.norm(tps_u, axis=1)
                         tps_v               = np.copy(tps_u)
                         
-                        p_t3 = min_mean_max(p_t2-p_t1, comm)
+                        p_t3 = min_mean_max(profile_tt[pp.TPS_SOLVE].snap, comm)
                         print("[TPS] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E -- runtime = %.4E (s)"%(tps_idx, tt_tps, np.max(abs_error), np.max(rel_error), p_t3[2]))
                         # if (np.max(abs_error) < boltzmann.param.atol or np.max(rel_error) < max(1e-6,boltzmann.param.rtol)):
                         #     break
@@ -1228,19 +1229,21 @@ def t1():
                     if (tps_idx == tps_sper_cycle * tps_max_cycles):
                         break
                     
-                    p_t1 = time()
+                    profile_tt[pp.TPS_SOLVE].start()
                     tps.solveStep()
-                    p_t2 = time()
+                    profile_tt[pp.TPS_SOLVE].stop()
                     if(terminal_output_freq > 0 and tps_idx % terminal_output_freq ==0):
-                        p_t3 = min_mean_max(p_t2-p_t1, comm)
+                        p_t3 = min_mean_max(profile_tt[pp.TPS_SOLVE].snap, comm)
                         print("[TPS] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tps_idx,tt_tps, p_t3[0],p_t3[1],p_t3[2]))
                     tt_tps +=dt_tps
                 
+                profile_tt[pp.TPS_PUSH].start()
                 tps.push(interface)
+                profile_tt[pp.TPS_PUSH].stop()
                 
                 tt += dt_tps * tps_idx
                 iter+=1
         
-
+            profile_stats(boltzmann, profile_tt, profile_nn, boltzmann.param.out_fname+"_profile.csv" , comm)
     tps.solveEnd()
     sys.exit (tps.getStatus())
\ No newline at end of file

From 8f012b8d92aaafd9c69e0076571762446da74794 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Fri, 15 Mar 2024 22:45:00 -0500
Subject: [PATCH 50/75] ibrun -np 2 debugging code, it seems to me the crash
 happens when launching CPU parla tasks in the async fetch

---
 src/tps-bte_0d3v.py | 531 +++++++++++++++++++++++++++++---------------
 1 file changed, 353 insertions(+), 178 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index e165d5f17..0972275a7 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
+from mpi4py import MPI
 import sys
 import os
-from mpi4py import MPI
 import numpy as np
 import scipy.constants
 import csv
@@ -13,6 +13,7 @@
 import pandas as pd
 import scipy.interpolate
 import scipy.cluster
+
 class profile_t:
     def __init__(self,name):
         self.name = name
@@ -50,19 +51,6 @@ def reset(self):
 def min_mean_max(a, comm: MPI.Comm):
     return (comm.allreduce(a, MPI.MIN) , comm.allreduce(a, MPI.SUM)/comm.Get_size(), comm.allreduce(a, MPI.MAX))
 
-# try:
-#     df    = pd.read_csv("ionization_rates.csv")
-#     Te    = np.array(df["Te[K]"]) 
-#     r_arr = np.array(df["Arr[m3/s]"])
-#     r_csc = np.array(df["CSC_Maxwellian[m3/s]"])
-#     r_arr = scipy.interpolate.interp1d(Te, r_arr,bounds_error=False, fill_value=0.0)
-#     r_csc = scipy.interpolate.interp1d(Te, r_csc,bounds_error=False, fill_value=0.0)
-#     print("ionization coefficient read from file ")
-# except:
-#     print("ionization rate coefficient file not found!!")
-#     r_arr = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
-#     r_csc = lambda Te : 1.235e-13 * np.exp(-18.687 / np.abs(Te * scipy.constants.Boltzmann/scipy.constants.electron_volt))
-
 # set path to C++ TPS library
 path = os.path.abspath(os.path.dirname(sys.argv[0]))
 sys.path.append(path + "/.libs")
@@ -82,11 +70,6 @@ def min_mean_max(a, comm: MPI.Comm):
         sys.exit(0)
 
 
-class pp(enum.IntEnum):
-    SETUP         = 0
-    SOLVE         = 1
-    LAST          = 2
-
 class BoltzmannSolverParams():
     sp_order      = 3           # B-spline order in v-space
     spline_qpts   = 5           # number of Gauss-Legendre quadrature points per knot interval    
@@ -135,7 +118,7 @@ class BoltzmannSolverParams():
     n0            = 3.22e22 #[m^{-3}]
     
     rand_seed     = 0
-    
+    use_clstr_inp = True
     
 class TPSINDEX():
     """
@@ -168,6 +151,10 @@ class Boltzmann0D2VBactchedSolver:
     def __init__(self, tps, comm):
         self.tps   = tps
         self.comm : MPI.Comm  = comm
+        
+        self.rankG = self.comm.Get_rank()
+        self.npesG = self.comm.Get_size()
+        
         self.param = BoltzmannSolverParams()
         # overide the default params, based on the config.ini file.
         self.__parse_config_file__(sys.argv[2])
@@ -175,16 +162,19 @@ def __init__(self, tps, comm):
         boltzmann_dir           = self.param.output_dir
         isExist = os.path.exists(boltzmann_dir)
         if not isExist:
-           # Create a new directory because it does not exist
            os.makedirs(boltzmann_dir)
-           #print("directory %s is created!"%(dir_name))
            
         num_gpus_per_node = 1 
         if self.param.use_gpu==1:
             num_gpus_per_node = cp.cuda.runtime.getDeviceCount()
         
+        self.num_gpus_per_node = num_gpus_per_node 
+        
+        if self.rankG==0:
+            print("number of GPUs detected = %d "%(num_gpus_per_node), flush=True)
+        
         # how to map each grid to the GPU devices on the node
-        self.gidx_to_device_map = lambda gidx, num_grids : gidx % num_gpus_per_node
+        self.gidx_to_device_map = lambda gidx, num_grids : self.rankG % self.num_gpus_per_node #gidx % num_gpus_per_node
         return
     
     def __parse_config_file__(self, fname):
@@ -193,7 +183,9 @@ def __parse_config_file__(self, fname):
         which overides the default BoltzmannSolverParams
         """
         config = configparser.ConfigParser()
-        print("[Boltzmann] reading configure file given by : ", fname)
+        if self.rankG==0:
+            print("[Boltzmann] reading configure file given by : ", fname, flush=True)
+            
         config.read(fname)
         
         self.param.sp_order         = int(config.get("boltzmannSolver", "sp_order").split("#")[0].strip())
@@ -241,11 +233,18 @@ def grid_setup(self, interface):
         Te            = xp.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False) / self.param.ev_to_K # [eV]
         
         Tew           = scipy.cluster.vq.whiten(Te)
+        Tecw0         = Tew[np.random.choice(Tew.shape[0], self.param.n_grids, replace=False)]
         Tecw          = scipy.cluster.vq.kmeans(Tew, np.linspace(np.min(Tew), np.max(Tew), n_grids), iter=1000, thresh=1e-8, check_finite=False)[0]
-        Te_b          = Tecw * np.std(Te, axis=0)
+        
+        Tecw0[0:len(Tecw)] = Tecw[:]
+        Tecw               = Tecw0
+        assert len(Tecw0) == self.param.n_grids
+        
+        Te_b          = xp.sort(Tecw * np.std(Te, axis=0))
         dist_mat      = xp.zeros((len(Te),n_grids))
         
-        print("K-means Te clusters ", Te_b)                
+        
+        print("rank [%d/%d] : K-means Te clusters "%(self.rankG, self.npesG), Te_b, flush=True)
         for i in range(self.param.n_grids):
             dist_mat[:,i] = xp.abs(Tew-Tecw[i])
         
@@ -254,12 +253,12 @@ def grid_setup(self, interface):
         for b_idx in range(self.param.n_grids):
             grid_idx_to_spatial_pts_map.append(xp.argwhere(membership==b_idx)[:,0]) 
         
-        np.save("%s_gidx_to_pidx.npy"%(self.param.out_fname), np.array(grid_idx_to_spatial_pts_map, dtype=object), allow_pickle=True)
+        #np.save("%s_gidx_to_pidx_rank_%08d.npy"%(self.param.out_fname, self.rankG), np.array(grid_idx_to_spatial_pts_map, dtype=object), allow_pickle=True)
         
         self.grid_idx_to_npts            = xp.array([len(a) for a in grid_idx_to_spatial_pts_map], dtype=xp.int32)
         self.grid_idx_to_spatial_idx_map = grid_idx_to_spatial_pts_map
         
-        xp.sum(self.grid_idx_to_npts) == len(Te), "[Error] : TPS spatial points for v-space grid assignment is inconsitant"
+        assert int(xp.sum(self.grid_idx_to_npts)) == len(Te), "[Error] : TPS spatial points for v-space grid assignment is inconsitant"  # every TPS point must belong to exactly one v-space grid
         lm_modes                         = [[[l,0] for l in range(self.param.l_max+1)] for grid_idx in range(self.param.n_grids)]
         nr                               = xp.ones(self.param.n_grids, dtype=np.int32) * self.param.Nr
         Te                               = xp.array([Te_b[b_idx]  for b_idx in range(self.param.n_grids)]) # xp.ones(self.param.n_grids) * self.param.Te 
@@ -267,12 +266,8 @@ def grid_setup(self, interface):
         ev_max                           = (6 * vth / self.param.c_gamma)**2 
         self.bte_solver                  = BoltzmannSolver(self.param, ev_max , Te , nr, lm_modes, self.param.n_grids, self.param.collisions)
 
-        if self.param.verbose==1:
-            print("grid energy max (eV) \n", ev_max, flush = True)
-        
         # compute BTE operators
         for grid_idx in range(self.param.n_grids):
-            print("setting up grid %d"%(grid_idx), flush = True)
             self.bte_solver.assemble_operators(grid_idx)
             
         n_grids              = self.param.n_grids
@@ -280,11 +275,12 @@ def grid_setup(self, interface):
 
         for grid_idx in range(n_grids):
             assert self.grid_idx_to_npts[grid_idx] > 0
-
-            print("setting initial Maxwellian at %.4E eV" %(self.bte_solver._par_ap_Te[grid_idx]), flush=True)
-            self.bte_solver.set_boltzmann_parameter(grid_idx, "f_mw", self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian"))
-        
+            if(self.param.use_clstr_inp==True):
+                self.bte_solver.set_boltzmann_parameter(grid_idx, "f_mw", self.bte_solver.initialize(grid_idx, self.param.n_sub_clusters      , "maxwellian"))
+            else:
+                self.bte_solver.set_boltzmann_parameter(grid_idx, "f_mw", self.bte_solver.initialize(grid_idx, self.grid_idx_to_npts[grid_idx], "maxwellian"))
         
+        self.comm.Barrier()
         # active_grid_idx=list()
         # for grid_idx in range(n_grids):
         #     spec_sp     = self.bte_solver._op_spec_sp[grid_idx]
@@ -326,7 +322,7 @@ def solve_wo_parla(self):
                     self.qoi[grid_idx] = qoi
                     self.ff [grid_idx] = ff
                 except:
-                    print("solver failed for v-space gird no %d"%(grid_idx))
+                    print("solver failed for v-space gird no %d"%(grid_idx), flush=True)
                     sys.exit(-1)
             else:
                 with xp.cuda.Device(dev_id):
@@ -336,11 +332,11 @@ def solve_wo_parla(self):
                         self.qoi[grid_idx] = qoi
                         self.ff [grid_idx] = ff
                     except:
-                        print("solver failed for v-space gird no %d"%(grid_idx))
+                        print("solver failed for v-space gird no %d"%(grid_idx), flush=True)
                         sys.exit(-1)
                     
         t2 = time()
-        print("time for boltzmann v-space solve = %.4E"%(t2- t1))
+        print("time for boltzmann v-space solve = %.4E"%(t2- t1), flush=True)
         
         if (self.param.export_csv ==1 or self.param.plot_data==1):
             for grid_idx in range(n_grids):
@@ -420,7 +416,8 @@ def asnumpy(a):
 
         return
     
-    async def fetch(self, interface, use_interp:bool):
+    def fetch(self, interface):
+        use_interp              = self.param.use_clstr_inp
         gidx_to_pidx            = self.grid_idx_to_spatial_idx_map
         heavy_temp              = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
         tps_npts                = len(heavy_temp)
@@ -429,21 +426,10 @@ async def fetch(self, interface, use_interp:bool):
         electron_temp           = np.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
         efield                  = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
         species_densities       = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(nspecies, tps_npts)
-        
         cs_avail_species        = self.bte_solver._avail_species
-        
         n0                      = np.sum(species_densities, axis=0) - species_densities[TPSINDEX.ELE_IDX]
         ns_by_n0                = np.concatenate([species_densities[TPSINDEX.MOLE_FRAC_IDX[i]]/n0 for i in range(len(cs_avail_species))]).reshape((len(cs_avail_species), tps_npts))
         
-        # np.save("n0.npy", species_densities[TPSINDEX.NEU_IDX])
-        # np.save("ne.npy", species_densities[TPSINDEX.ELE_IDX])
-        # np.save("ni.npy", species_densities[TPSINDEX.ION_IDX])
-        
-        # np.save("Te.npy", heavy_temp)
-        # np.save("Tg.npy", heavy_temp)
-        # np.save("E.npy" , np.sqrt(efield[0]**2 + efield[1]**2))
-        # sys.exit(-1)
-        
         n_grids            = self.param.n_grids 
         use_gpu            = self.param.use_gpu
         
@@ -477,9 +463,7 @@ def normalize(obs):
                 std_obs[std_obs == 0.0] = 1.0
                 return obs/std_obs, std_obs
 
-            ts = TaskSpace("T")
             for grid_idx in self.active_grid_idx:
-                @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
                 def t1():
                     dev_id                       = self.gidx_to_device_map(grid_idx, n_grids)
                     m                            = m_bte[gidx_to_pidx[grid_idx]]
@@ -501,26 +485,6 @@ def t1():
                     for c_idx in range(n_sub_clusters):
                         self.sub_cluster_idx_to_pidx[grid_idx][c_idx] = np.argwhere(membership_m==c_idx)[:,0]
                         
-                        # idx     = self.sub_cluster_idx_to_pidx[grid_idx][c_idx]
-                        # abs_err = np.linalg.norm(dist_mat[idx, c_idx] - np.linalg.norm(mw[idx] - mcw[c_idx], axis=1))
-                        # print(grid_idx, c_idx, abs_err)
-                    
-                    # dw_mat = np.zeros(self.param.n_sub_clusters)
-                    # print(grid_idx,"\n" , mc)
-                    # for c_idx in range(n_sub_clusters):
-                    #     idx = self.sub_cluster_idx_to_pidx[grid_idx][c_idx]
-                    #     if len(idx>0):
-                    #         dw_mat[c_idx] =  np.max(np.linalg.norm(1 - m[idx] / mc[c_idx], axis = 1))
-                    
-                    # plt.figure(figsize=(8, 8), dpi=300)
-                    # plt.semilogy(np.array(range(self.param.n_sub_clusters)), dw_mat)
-                    # plt.xlabel(r"cluster id")
-                    # plt.ylabel(r"relative error")
-                    # plt.grid(visible=True)
-                    # plt.savefig("%s_grid_idx_%04d.png"%(self.param.out_fname, grid_idx))
-                    # plt.close()
-                    
-                    
                     n0       = np.ones(mc.shape[0]) * self.param.n0
                     Ex       = mc[: , 0] * self.param.n0 * self.param.Td_fac
                     Ey       = mc[: , 1] * self.param.n0 * self.param.Td_fac
@@ -531,7 +495,7 @@ def t1():
                     EMag     = np.sqrt(Ex**2 + Ey**2)
                     
                     if self.param.verbose == 1 :
-                        print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
+                        print("rank = %d Boltzmann solver inputs for v-space grid id %d"%(self.rankG, grid_idx))
                         print("Efreq = %.4E [1/s]" %(self.param.Efreq))
                         print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
                         
@@ -575,13 +539,10 @@ def t1():
                     # cp.save(self.param.out_fname + "_E_%02d.npy"%(grid_idx)   , EMag  , grid_idx)
                     
                     return
-                
-            await ts
+                t1()
             
         else:
-            ts = TaskSpace("T")
             for grid_idx in self.active_grid_idx:
-                @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
                 def t1():
                     bte_idx           = gidx_to_pidx[grid_idx]
                     dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
@@ -600,7 +561,7 @@ def t1():
                     EMag              = np.sqrt(Ex**2 + Ey**2)
                 
                     if self.param.verbose == 1 :
-                        print("Boltzmann solver inputs for v-space grid id %d"%(grid_idx))
+                        print("rank = %d Boltzmann solver inputs for v-space grid id %d"%(self.rankG, grid_idx))
                         print("Efreq = %.4E [1/s]" %(self.param.Efreq))
                         print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
                         
@@ -613,6 +574,209 @@ def t1():
                             print("ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(np.min(ns_by_n0[i]) , np.max(ns_by_n0[i])))
             
             
+                    if (use_gpu == 1):
+                        with cp.cuda.Device(dev_id):
+                            n0   = cp.array(n0) 
+                            ne   = cp.array(ne)
+                            ni   = cp.array(ni)
+                            Ex   = cp.array(Ex)
+                            Ey   = cp.array(Ey)
+                            Tg   = cp.array(Tg)
+                            EMag = cp.sqrt(Ex**2 + Ey**2)
+                            ns_by_n0 = cp.array(ns_by_n0)
+                    
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ns_by_n0", ns_by_n0)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ne" , ne)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ni" , ni)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "Tg" , Tg)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", Ex)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", Ey)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx,  "E" , EMag)
+                    
+                    # cp.save(self.param.out_fname + "_ns_by_n0_%02d.npy"%(grid_idx) , ns_by_n0 , grid_idx)
+                    # cp.save(self.param.out_fname + "_n0_%02d.npy"%(grid_idx)       , n0       , grid_idx)
+                    # cp.save(self.param.out_fname + "_ne_%02d.npy"%(grid_idx)       , ne       , grid_idx)
+                    # cp.save(self.param.out_fname + "_ni_%02d.npy"%(grid_idx)       , ni       , grid_idx)
+                    
+                    # cp.save(self.param.out_fname + "_Tg_%02d.npy"%(grid_idx)  , Tg    , grid_idx)
+                    # cp.save(self.param.out_fname + "_eRe_%02d.npy"%(grid_idx) , Ex    , grid_idx)
+                    # cp.save(self.param.out_fname + "_eIm_%02d.npy"%(grid_idx) , Ey    , grid_idx)
+                    # cp.save(self.param.out_fname + "_E_%02d.npy"%(grid_idx)   , EMag  , grid_idx)
+                    
+                    return
+                t1()
+        return        
+
+    async def fetch_asnyc(self, interface):
+        use_interp              = self.param.use_clstr_inp
+        gidx_to_pidx            = self.grid_idx_to_spatial_idx_map
+        heavy_temp              = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+        tps_npts                = len(heavy_temp)
+        self.tps_npts           = tps_npts
+        nspecies                = interface.Nspecies()
+        electron_temp           = np.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
+        efield                  = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
+        species_densities       = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(nspecies, tps_npts)
+        
+        cs_avail_species        = self.bte_solver._avail_species
+        
+        n0                      = np.sum(species_densities, axis=0) - species_densities[TPSINDEX.ELE_IDX]
+        ns_by_n0                = np.concatenate([species_densities[TPSINDEX.MOLE_FRAC_IDX[i]]/n0 for i in range(len(cs_avail_species))]).reshape((len(cs_avail_species), tps_npts))
+        
+        n_grids                 = self.param.n_grids 
+        use_gpu                 = self.param.use_gpu
+        
+        Tg                      = heavy_temp
+        
+        Ex                      = efield[0]
+        Ey                      = efield[1]
+    
+        ExbyN                   = Ex/n0/self.param.Td_fac
+        EybyN                   = Ey/n0/self.param.Td_fac
+        
+        Ex                      = ExbyN * self.param.n0 * self.param.Td_fac
+        Ey                      = EybyN * self.param.n0 * self.param.Td_fac
+        
+        ion_deg                 = species_densities[TPSINDEX.ELE_IDX]/n0
+        ion_deg[ion_deg<=0]     = 1e-16
+        ns_by_n0[ns_by_n0<=0]   = 0
+        m_bte                   = np.concatenate([ExbyN.reshape((-1, 1)), EybyN.reshape((-1, 1)), Tg.reshape((-1, 1)), ion_deg.reshape((-1, 1))] + [ ns_by_n0[i].reshape((-1, 1)) for i in range(ns_by_n0.shape[0])], axis=1)
+        
+        self.sub_cluster_idx_to_pidx = None
+        self.sub_cluster_c           = None          
+        gidx_to_device_map           = self.gidx_to_device_map
+        
+        if (use_interp == True):
+            n_sub_clusters               = self.param.n_sub_clusters
+            self.sub_cluster_idx_to_pidx = [[None for i in range(n_sub_clusters)] for i in range(self.param.n_grids)]
+            self.sub_cluster_c           = [None for i in range(self.param.n_grids)]
+            
+            def normalize(obs):
+                std_obs   = np.std(obs, axis=0)
+                std_obs[std_obs == 0.0] = 1.0
+                return obs/std_obs, std_obs
+
+            ts = TaskSpace("T")
+            for grid_idx in self.active_grid_idx:
+                @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
+                def t1():
+                    dev_id                       = self.gidx_to_device_map(grid_idx, n_grids)
+                    m                            = m_bte[gidx_to_pidx[grid_idx]]
+                    mw , mw_std                  = normalize(m)
+                    
+                    # to reproduce clusters
+                    np.random.seed(self.param.rand_seed)
+                    
+                    mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
+                    mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=1000, thresh=1e-8, check_finite=False)[0]
+                    mcw0[0:mcw.shape[0], :]      = mcw[:,:]
+                    mcw                          = mcw0
+                    
+                    assert mcw.shape[0] == self.param.n_sub_clusters
+                    
+                    mc                           = mcw * mw_std
+                    dist_mat                     = np.array([np.linalg.norm(mw - mcw[i], axis=1) for i in range(n_sub_clusters)]).T
+                    membership_m                 = np.argmin(dist_mat, axis=1)
+                    self.sub_cluster_c[grid_idx] = mc
+                    
+                    for c_idx in range(n_sub_clusters):
+                        self.sub_cluster_idx_to_pidx[grid_idx][c_idx] = np.argwhere(membership_m==c_idx)[:,0]
+                    
+                    
+                        
+                    n0       = np.ones(mc.shape[0]) * self.param.n0
+                    Ex       = mc[: , 0] * self.param.n0 * self.param.Td_fac
+                    Ey       = mc[: , 1] * self.param.n0 * self.param.Td_fac
+                    Tg       = mc[: , 2]
+                    ne       = mc[: , 3] * self.param.n0
+                    ni       = mc[: , 3] * self.param.n0
+                    ns_by_n0 = np.transpose(mc[: , 4:])
+                    EMag     = np.sqrt(Ex**2 + Ey**2)
+                    
+                    if self.param.verbose == 1 :
+                        print("rank [%d/%d] Boltzmann solver inputs for v-space grid id %d"%(self.rankG, self.npesG, grid_idx), flush=True)
+                        print("Efreq = %.4E [1/s]" %(self.param.Efreq)      , flush=True)
+                        print("n_pts = %d" % self.grid_idx_to_npts[grid_idx], flush=True)
+                        
+                        print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)       , np.max(n0))   , flush=True)
+                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(np.min(ExbyN)    , np.max(ExbyN)), flush=True)
+                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN)    , np.max(EybyN)), flush=True)
+                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)       , np.max(Tg))   , flush=True)
+                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)       , np.max(ne))   , flush=True)
+                        
+                        for i in range(ns_by_n0.shape[0]):
+                            print("[%d] ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(i, np.min(ns_by_n0[i]) , np.max(ns_by_n0[i])), flush=True)
+                                            
+                    if (use_gpu==1):
+                        with cp.cuda.Device(dev_id):
+                            n0   = cp.array(n0) 
+                            Ex   = cp.array(Ex)
+                            Ey   = cp.array(Ey)
+                            Tg   = cp.array(Tg)
+                            ne   = cp.array(ne)
+                            ni   = cp.array(ni)
+                            EMag = cp.sqrt(Ex**2 + Ey**2)
+                            ns_by_n0 = cp.array(ns_by_n0)
+                    
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ns_by_n0", ns_by_n0)    
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ne" , ne)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "ni" , ni)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "Tg" , Tg)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "eRe", Ex)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "eIm", Ey)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx,  "E" , EMag)
+                    
+                    # cp.save(self.param.out_fname + "_ns_by_n0_%02d.npy"%(grid_idx) , ns_by_n0 , grid_idx)
+                    # cp.save(self.param.out_fname + "_n0_%02d.npy"%(grid_idx)       , n0       , grid_idx)
+                    # cp.save(self.param.out_fname + "_ne_%02d.npy"%(grid_idx)       , ne       , grid_idx)
+                    # cp.save(self.param.out_fname + "_ni_%02d.npy"%(grid_idx)       , ni       , grid_idx)
+                    
+                    # cp.save(self.param.out_fname + "_Tg_%02d.npy"%(grid_idx)  , Tg    , grid_idx)
+                    # cp.save(self.param.out_fname + "_eRe_%02d.npy"%(grid_idx) , Ex    , grid_idx)
+                    # cp.save(self.param.out_fname + "_eIm_%02d.npy"%(grid_idx) , Ey    , grid_idx)
+                    # cp.save(self.param.out_fname + "_E_%02d.npy"%(grid_idx)   , EMag  , grid_idx)
+                    
+                    return
+                
+            await ts
+            
+        else:
+            ts = TaskSpace("T")
+            for grid_idx in self.active_grid_idx:
+                @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
+                def t1():
+                    bte_idx           = gidx_to_pidx[grid_idx]
+                    dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
+                    
+                    mc                = m_bte[bte_idx]
+                    
+                    n0                = np.ones(mc.shape[0]) * self.param.n0
+                    Ex                = mc[: , 0] * self.param.n0 * self.param.Td_fac
+                    Ey                = mc[: , 1] * self.param.n0 * self.param.Td_fac
+                    
+                    Tg                = mc[: , 2]
+                    ne                = mc[: , 3] * self.param.n0
+                    ni                = mc[: , 3] * self.param.n0
+                    ns_by_n0          = np.transpose(mc[: , 4:])
+                    
+                    EMag              = np.sqrt(Ex**2 + Ey**2)
+                
+                    if self.param.verbose == 1 :
+                        print("rank [%d/%d] Boltzmann solver inputs for v-space grid id %d"%(self.rankG, self.npesG, grid_idx), flush=True)
+                        print("Efreq = %.4E [1/s]" %(self.param.Efreq)      , flush=True)
+                        print("n_pts = %d" % self.grid_idx_to_npts[grid_idx], flush=True)
+                        
+                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(np.min(ExbyN), np.max(ExbyN)), flush=True)
+                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN), np.max(EybyN)), flush=True)
+                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg))   , flush=True)
+                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne))   , flush=True)
+                        
+                        for i in range(ns_by_n0.shape[0]):
+                            print("ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(np.min(ns_by_n0[i]) , np.max(ns_by_n0[i])), flush=True)
+            
+            
                     if (use_gpu == 1):
                         with cp.cuda.Device(dev_id):
                             n0   = cp.array(n0) 
@@ -647,7 +811,7 @@ def t1():
             await ts
         return        
 
-    async def solve_init(self, use_interp:bool):
+    async def solve_init_async(self):
         rank                    = self.comm.Get_rank()
         npes                    = self.comm.Get_size()
         n_grids                 = self.param.n_grids
@@ -658,7 +822,7 @@ async def solve_init(self, use_interp:bool):
             @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
             def t1():
                 dev_id = gidx_to_device_map(grid_idx, n_grids)
-                print("[%d/%d] setting grid %d to device %d"%(rank, npes, grid_idx, dev_id))
+                print("rank [%d/%d] setting grid %d to device %d"%(rank, npes, grid_idx, dev_id), flush=True)
                 self.bte_solver.host_to_device_setup(dev_id, grid_idx)
             
         await ts
@@ -666,16 +830,14 @@ def t1():
         def ts_op_setup(grid_idx):
             xp                                      = self.xp_module 
             f_mw                                    = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
-            
-            if (use_interp==True):
-                n_pts                               = self.param.n_sub_clusters
-            else:
-                n_pts                               = f_mw.shape[1]
-                
+            n_pts                                   = f_mw.shape[1]
             Qmat                                    = self.bte_solver._op_qmat[grid_idx]
             INr                                     = xp.eye(Qmat.shape[1])
             self.bte_solver._op_imat_vx[grid_idx]   = xp.einsum("i,jk->ijk",xp.ones(n_pts), INr)
             
+            if self.param.use_clstr_inp==True:
+                assert n_pts == self.param.n_sub_clusters
+            
         if(self.param.use_gpu==1):
             self.xp_module = cp
             
@@ -694,12 +856,7 @@ def t1():
                     f_mw         = f_mw/cp.dot(mm_op, f_mw)
                     f_mw         = cp.dot(qA.T, f_mw)
                     
-                    if (use_interp==True):
-                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", cp.copy(f_mw[: , 0:self.param.n_sub_clusters]))
-                    else:
-                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", cp.copy(f_mw))
-                    
-            
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", cp.copy(f_mw))
             await ts
         else:
             self.xp_module = np
@@ -716,22 +873,18 @@ def t1():
                     f_mw         = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
                     f_mw         = f_mw/np.dot(mm_op, f_mw)
                     f_mw         = np.dot(qA.T, f_mw)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", np.copy(f_mw))
                     
-                    if (use_interp==True):
-                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", np.copy(f_mw[: , 0:self.param.n_sub_clusters]))
-                    else:
-                        self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", np.copy(f_mw))
-            
             await ts
         
         return
             
-    async def solve_step(self, time, delta_t):
+    async def solve_step_async(self, time, delta_t):
         """
         perform a single timestep in 0d-BTE
         """
-        rank                    = self.comm.Get_rank()
-        npes                    = self.comm.Get_size()
+        rank                    = self.rankG
+        npes                    = self.npesG
         n_grids                 = self.param.n_grids
         gidx_to_device_map      = self.gidx_to_device_map
         
@@ -755,12 +908,12 @@ def t1():
         
         return 
     
-    async def solve(self):
+    async def solve_async(self):
         """
         Can be used to compute steady-state or cycle averaged BTE solutions
         """
-        rank                    = self.comm.Get_rank()
-        npes                    = self.comm.Get_size()
+        rank                    = self.rankG
+        npes                    = self.npesG
         xp                      = self.xp_module
         csv_write               = self.param.export_csv
         plot_data               = self.param.plot_data
@@ -774,6 +927,7 @@ async def solve(self):
         self.qoi                = [None for grid_idx in range(self.param.n_grids)]
         self.ff                 = [None for grid_idx in range(self.param.n_grids)]
         num_gpus                = len(gpu)
+        assert num_gpus         == self.num_gpus_per_node, "CuPy and Parla number of GPUs per node does not match %d vs. %d"%(num_gpus, self.num_gpus_per_node)
         coll_list               = self.bte_solver.get_collision_list()
         coll_names              = self.bte_solver.get_collision_names()
         
@@ -790,23 +944,24 @@ async def solve(self):
             @spawn(ts[grid_idx], placement=[parla_placement[grid_idx]], vcus=0.0)
             def t1():
                 try:
-                    print("[Boltzmann] %d / %d launching grid %d on %s"%(rank, npes, grid_idx, parla_placement[grid_idx]))
+                    print("rank [%d/%d] BTE launching grid %d on %s"%(rank, npes, grid_idx, parla_placement[grid_idx]), flush=True)
                     f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
                     ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
                     self.ff[grid_idx]  = ff
                     self.qoi[grid_idx] = qoi
                 except:
-                    print("solver failed for v-space gird no %d"%(grid_idx))
+                    print("rank [%d/%d] solver failed for v-space gird no %d"%(self.rankG, self.npesG, grid_idx), flush=True)
                     sys.exit(-1)
                     
         await ts
         return
     
-    async def push(self, interface, use_interp:bool):
+    async def push_async(self, interface):
         xp                      = self.xp_module
         n_grids                 = self.param.n_grids
         gidx_to_device_map      = self.gidx_to_device_map
         gidx_to_pidx_map        = self.grid_idx_to_spatial_idx_map
+        use_interp              = self.param.use_clstr_inp
         
         heavy_temp  = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
         tps_npts    = len(heavy_temp)
@@ -821,10 +976,10 @@ async def push(self, interface, use_interp:bool):
                 for grid_idx in self.active_grid_idx:
                     @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
                     def t1():
-                        qA        = boltzmann.bte_solver._op_diag_dg[grid_idx]
-                        u0        = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                        h_curr    = boltzmann.bte_solver.normalized_distribution(grid_idx, u0)
-                        qoi       = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
+                        qA        = self.bte_solver._op_diag_dg[grid_idx]
+                        u0        = self.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                        h_curr    = self.bte_solver.normalized_distribution(grid_idx, u0)
+                        qoi       = self.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
                         rr_cpu    = xp.asnumpy(qoi["rates"])
                         
                         rr_interp = np.zeros(len(gidx_to_pidx_map[grid_idx]))
@@ -844,10 +999,10 @@ def t1():
                 for grid_idx in self.active_grid_idx:
                     @spawn(ts[grid_idx], placement=[gpu(gidx_to_device_map(grid_idx,n_grids))], vcus=0.0)
                     def t1():
-                        qA       = boltzmann.bte_solver._op_diag_dg[grid_idx]
-                        u0       = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                        h_curr   = boltzmann.bte_solver.normalized_distribution(grid_idx, u0)
-                        qoi      = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
+                        qA       = self.bte_solver._op_diag_dg[grid_idx]
+                        u0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                        h_curr   = self.bte_solver.normalized_distribution(grid_idx, u0)
+                        qoi      = self.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
                         rr_cpu   = xp.asnumpy(qoi["rates"])
                         
                         for r_idx in range(n_reactions):
@@ -865,9 +1020,9 @@ def io_output_data(self, grid_idx, u0, plot_data:bool, export_csv:bool, fname:st
         dev_id                  = gidx_to_device_map(grid_idx, n_grids)
         qA                      = self.bte_solver._op_diag_dg[grid_idx]
         h_curr                  = xp.dot(qA, u0)
-        h_curr                  = boltzmann.bte_solver.normalized_distribution(grid_idx, h_curr)
+        h_curr                  = self.bte_solver.normalized_distribution(grid_idx, h_curr)
         ff                      = h_curr
-        qoi                     = boltzmann.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
+        qoi                     = self.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
         coll_list               = self.bte_solver.get_collision_list()
         coll_names              = self.bte_solver.get_collision_names()
         cs_data                 = self.bte_solver.get_cross_section_data()
@@ -1011,45 +1166,54 @@ def profile_stats(boltzmann:Boltzmann0D2VBactchedSolver, p_tt: profile_t, p_nn,
     else:
         print(",".join(header))
         print(",".join(data_str))
-                
-if __name__=="__main__":
-    comm = MPI.COMM_WORLD
-    
-    with Parla():
-        # TPS solver
-        profile_tt[pp.TPS_SETUP].start()
-        
-        tps = libtps.Tps(comm)
-        tps.parseCommandLineArgs(sys.argv)
-        tps.parseInput()
-        tps.chooseDevices()
-        tps.chooseSolver()
-        tps.initialize()
-        
-        profile_tt[pp.TPS_SETUP].stop()
 
-        boltzmann = Boltzmann0D2VBactchedSolver(tps, comm)
-        interface = libtps.Tps2Boltzmann(tps)
-        tps.initInterface(interface)
 
-        #coords = np.array(interface.HostReadSpatialCoordinates(), copy=False)
-        tps.solveBegin()
-        tps.push(interface)
-        
-        profile_tt[pp.BTE_SETUP].start()
-        boltzmann.grid_setup(interface)
-        profile_tt[pp.BTE_SETUP].stop()
-        
+def driver_w_parla(comm):
+    
+    rank = comm.Get_rank()
+    npes = comm.Get_size()
+    
+    dev  = rank % 3
+    
+    with Parla():
         @spawn(placement=cpu, vcus=0)
         async def __main__():
             
-            bte_use_interp = True
-            await boltzmann.solve_init(bte_use_interp)
-            xp = boltzmann.bte_solver.xp_module
-
-            max_iters = boltzmann.param.tps_bte_max_iter
+            # TPS solver
+            profile_tt[pp.TPS_SETUP].start()
+            tps = libtps.Tps(comm)
+            tps.parseCommandLineArgs(sys.argv)
+            tps.parseInput()
+            tps.chooseDevices()
+            tps.chooseSolver()
+            tps.initialize()
+            profile_tt[pp.TPS_SETUP].stop()
+            
+            interface = libtps.Tps2Boltzmann(tps)
+            tps.initInterface(interface)
+            tps.solveBegin()
+            # --- first TPS step is needed to initialize the EM fields
+            tps.solveStep()
+            tps.push(interface)
+            
+            # with cp.cuda.Device(dev):
+            #     cp.cuda.runtime.deviceSynchronize()
+            # comm.Barrier()
+            
+            boltzmann = Boltzmann0D2VBactchedSolver(tps, comm)
+            rank      = boltzmann.comm.Get_rank()
+            npes      = boltzmann.comm.Get_size()
+            
+            profile_tt[pp.BTE_SETUP].start()
+            boltzmann.grid_setup(interface)
+            profile_tt[pp.BTE_SETUP].stop()
+            
+            # await boltzmann.solve_init_async()
+            # xp        = boltzmann.bte_solver.xp_module
+            xp        = cp
+            max_iters = 1#boltzmann.param.tps_bte_max_iter
             iter      = 0
-            tt        = 0#interface.currentTime()
+            tt        = 0
             tau       = (1/boltzmann.param.Efreq)
             dt_tps    = interface.timeStep()
             dt_bte    = boltzmann.param.dt * tau 
@@ -1065,26 +1229,30 @@ async def __main__():
             bte_max_cycles = int(boltzmann.param.cycles)
             tps_max_cycles = boltzmann.param.bte_solve_freq
             
-            print("tps steps per cycle : ", tps_sper_cycle, "bte_steps per cycle", bte_sper_cycle)
-            tps.solveStep()
-            tps.push(interface)
-            p_t1 = 0 
-            p_t2 = 0
+            if (boltzmann.rankG==0):
+                print("tps steps per cycle : ", tps_sper_cycle, "bte_steps per cycle", bte_sper_cycle, flush=True)
+                
             while (iter<max_iters):
                 
                 if (iter%cycle_freq==0):
                     interface.saveDataCollection(cycle=(iter//cycle_freq), time=iter)
                 
-                ########################## BTE solve ##################################################
+                # ########################## BTE solve ##################################################
                 profile_tt[pp.BTE_FETCH].start()
-                await boltzmann.fetch(interface, use_interp=bte_use_interp)
+                #await boltzmann.fetch_asnyc(interface)
+                boltzmann.fetch(interface)
                 profile_tt[pp.BTE_FETCH].stop()
                 
+                """
                 if (boltzmann.param.solver_type=="steady-state"):
+                    
                     profile_tt[pp.BTE_SOLVE].start()
-                    await boltzmann.solve()
+                    await boltzmann.solve_async()
                     profile_tt[pp.BTE_SOLVE].stop()
                     
+                    with cp.cuda.Device(dev):
+                        cp.cuda.runtime.deviceSynchronize()
+                    comm.Barrier()
                     
                     profile_tt[pp.BTE_PUSH].start()
                     ts = TaskSpace("T")
@@ -1093,7 +1261,7 @@ async def __main__():
                         def t1():
                             boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", boltzmann.ff[grid_idx])
                     await ts
-                    await boltzmann.push(interface, use_interp=bte_use_interp)
+                    await boltzmann.push_async(interface)
                     profile_tt[pp.BTE_PUSH].stop()
                     
                     if boltzmann.param.export_csv ==1:
@@ -1101,12 +1269,9 @@ def t1():
                             dev_id  = gidx_to_device_map(grid_idx,n_grids)
                             with cp.cuda.Device(dev_id):
                                 u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                                boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
+                                boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d_rank_%d_npes_%d"%(grid_idx, rank, npes))
                 else:
                     assert boltzmann.param.solver_type == "transient", "unknown BTE solver type"
-                    """
-                    transient BTE solver (evolve until time-periodic solutions)
-                    """
                     
                     tt_bte       = 0
                     bte_u        = [0 for i in range(n_grids)]
@@ -1138,7 +1303,7 @@ def t1():
                                     
                             await ts
                             p_t3 = min_mean_max(profile_tt[pp.BTE_SOLVE].snap, comm)
-                            print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E --- runtime = %.4E (s) "%(bte_idx, tt_bte, max(abs_error), max(rel_error), p_t3[2]))
+                            print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E --- runtime = %.4E (s) "%(bte_idx, tt_bte, max(abs_error), max(rel_error), p_t3[2]), flush=True)
                             
                             if max(abs_error) < boltzmann.param.atol or max(rel_error)< boltzmann.param.rtol:
                                 break
@@ -1157,7 +1322,7 @@ def t1():
                         await ts
                         
                         profile_tt[pp.BTE_SOLVE].start()
-                        await boltzmann.solve_step(tt_bte, dt_bte)
+                        await boltzmann.solve_step_async(tt_bte, dt_bte)
                         profile_tt[pp.BTE_SOLVE].stop()
                         
                         if(terminal_output_freq > 0 and bte_idx % terminal_output_freq ==0):
@@ -1184,7 +1349,7 @@ def t1():
                             u_avg[grid_idx]  = xp.dot(qA, u_avg[grid_idx])
                             boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", u_avg[grid_idx])
                     await ts
-                    await boltzmann.push(interface, use_interp=bte_use_interp)
+                    await boltzmann.push_async(interface)
                     profile_tt[pp.BTE_PUSH].stop()
                     
                     if boltzmann.param.export_csv ==1:
@@ -1192,10 +1357,14 @@ def t1():
                             dev_id  = gidx_to_device_map(grid_idx,n_grids)
                             with cp.cuda.Device(dev_id):
                                 u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
-                                boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d"%(grid_idx))
+                                boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d_rank_%d_npes_%d"%(grid_idx, rank, npes))
+                
+                with cp.cuda.Device(dev):
+                    cp.cuda.runtime.deviceSynchronize()
+                comm.Barrier()
+                """
+                
                 
-                    
-                    
                 ################### tps solve ######################################
                 profile_tt[pp.TPS_FETCH].start()
                 tps.fetch(interface)
@@ -1204,9 +1373,6 @@ def t1():
                 tps_u  = 0
                 tps_v  = 0
                 tt_tps = 0
-                
-                p_t1   = 0
-                p_t2   = 0
                 for tps_idx in range(tps_sper_cycle * tps_max_cycles + 1):
                     if (tps_idx % tps_sper_cycle == 0):
                         tps.push(interface)
@@ -1222,7 +1388,7 @@ def t1():
                         tps_v               = np.copy(tps_u)
                         
                         p_t3 = min_mean_max(profile_tt[pp.TPS_SOLVE].snap, comm)
-                        print("[TPS] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E -- runtime = %.4E (s)"%(tps_idx, tt_tps, np.max(abs_error), np.max(rel_error), p_t3[2]))
+                        print("[TPS] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E -- runtime = %.4E (s)"%(tps_idx, tt_tps, np.max(abs_error), np.max(rel_error), p_t3[2]), flush=True)
                         # if (np.max(abs_error) < boltzmann.param.atol or np.max(rel_error) < max(1e-6,boltzmann.param.rtol)):
                         #     break
                     
@@ -1234,7 +1400,7 @@ def t1():
                     profile_tt[pp.TPS_SOLVE].stop()
                     if(terminal_output_freq > 0 and tps_idx % terminal_output_freq ==0):
                         p_t3 = min_mean_max(profile_tt[pp.TPS_SOLVE].snap, comm)
-                        print("[TPS] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tps_idx,tt_tps, p_t3[0],p_t3[1],p_t3[2]))
+                        print("[TPS] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tps_idx,tt_tps, p_t3[0],p_t3[1],p_t3[2]), flush=True)
                     tt_tps +=dt_tps
                 
                 profile_tt[pp.TPS_PUSH].start()
@@ -1243,7 +1409,16 @@ def t1():
                 
                 tt += dt_tps * tps_idx
                 iter+=1
-        
-            profile_stats(boltzmann, profile_tt, profile_nn, boltzmann.param.out_fname+"_profile.csv" , comm)
-    tps.solveEnd()
-    sys.exit (tps.getStatus())
\ No newline at end of file
+                comm.Barrier()
+                
+            #profile_stats(boltzmann, profile_tt, profile_nn, boltzmann.param.out_fname+"_profile.csv" , comm)
+            tps.solveEnd()
+            comm.Barrier()
+            return tps.getStatus()
+
+if __name__=="__main__":
+    comm = MPI.COMM_WORLD
+    driver_w_parla(comm)
+    # NOTE(review): the result of driver_w_parla is discarded, so tps.getStatus() no longer sets the process exit code -- confirm this is intended
+            
+            
\ No newline at end of file

From 6d0a2cb3d8073f7699d9f2187b97dde7eee7c4c7 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Sat, 16 Mar 2024 17:03:16 -0500
Subject: [PATCH 51/75] tps bte minimum parla crash example

---
 src/tps-bte_0d3v.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 0972275a7..34fd0e6fd 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -13,6 +13,7 @@
 import pandas as pd
 import scipy.interpolate
 import scipy.cluster
+import threading
 
 class profile_t:
     def __init__(self,name):
@@ -463,6 +464,7 @@ def normalize(obs):
                 std_obs[std_obs == 0.0] = 1.0
                 return obs/std_obs, std_obs
 
+            thread_pool = list()
             for grid_idx in self.active_grid_idx:
                 def t1():
                     dev_id                       = self.gidx_to_device_map(grid_idx, n_grids)
@@ -539,9 +541,16 @@ def t1():
                     # cp.save(self.param.out_fname + "_E_%02d.npy"%(grid_idx)   , EMag  , grid_idx)
                     
                     return
-                t1()
+                
+                thread = threading.Thread(target=t1)
+                thread_pool.append(thread)
+                thread.start()
+            
+            for thread in thread_pool:
+                thread.join()
             
         else:
+            thread_pool = list()
             for grid_idx in self.active_grid_idx:
                 def t1():
                     bte_idx           = gidx_to_pidx[grid_idx]
@@ -605,10 +614,25 @@ def t1():
                     # cp.save(self.param.out_fname + "_E_%02d.npy"%(grid_idx)   , EMag  , grid_idx)
                     
                     return
-                t1()
+                
+                thread = threading.Thread(target=t1)
+                thread_pool.append(thread)
+                thread.start()
+            
+            for thread in thread_pool:
+                thread.join()
         return        
 
     async def fetch_asnyc(self, interface):
+        ts = TaskSpace("T")
+        for grid_idx in self.active_grid_idx:
+            @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
+            def t1():
+                print("rank [%d/%d] hello from parla task %d"%(self.rankG, self.npesG, grid_idx), flush=True)
+                return
+        await ts
+        return
+        
         use_interp              = self.param.use_clstr_inp
         gidx_to_pidx            = self.grid_idx_to_spatial_idx_map
         heavy_temp              = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
@@ -1239,8 +1263,8 @@ async def __main__():
                 
                 # ########################## BTE solve ##################################################
                 profile_tt[pp.BTE_FETCH].start()
-                #await boltzmann.fetch_asnyc(interface)
-                boltzmann.fetch(interface)
+                await boltzmann.fetch_asnyc(interface)
+                #boltzmann.fetch(interface)
                 profile_tt[pp.BTE_FETCH].stop()
                 
                 """

From c7aedb0372afeaac09b8eb56e876efb0179bb75b Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Mon, 18 Mar 2024 10:57:46 -0500
Subject: [PATCH 52/75] tps-bte with Parla multi-gpu version [working]

---
 src/tps-bte_0d3v.py | 332 ++++++++++++++++++++++++++++++--------------
 1 file changed, 224 insertions(+), 108 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 34fd0e6fd..8940abaf4 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -146,6 +146,51 @@ class TPSINDEX():
     EX3_IDX  = 2
     
     MOLE_FRAC_IDX = {0: NEU_IDX, 1: EX1_IDX , 2: EX2_IDX , 3: EX3_IDX} 
+
+def k_means(x, num_clusters, xi=None, max_iter=1000, thresh=1e-12, rand_seed=0, xp=np):
+    """Cluster the rows of ``x`` with Lloyd's k-means iteration.
+
+    x            : 2d array of observations, one observation per row.
+    num_clusters : number of centroids drawn from ``x`` when ``xi`` is None.
+    xi           : optional initial centroids, one per row (their count defines the clusters).
+    max_iter     : upper bound on the number of Lloyd iterations.
+    thresh       : stop once the relative change of the per-cluster mean distortion is <= thresh.
+    rand_seed    : passed to ``xp.random.seed`` (global RNG state) before drawing initial centroids.
+    xp           : array module providing the NumPy API (numpy by default).
+
+    Returns ``(xi, pred)``: the centroids and, for every row of ``x``, the index
+    of its nearest returned centroid.
+    """
+    assert x.ndim == 2, "observations must me 2d array"
+    if xi is None:
+        xp.random.seed(rand_seed)
+        xi = x[xp.random.choice(x.shape[0], num_clusters, replace=False)]
+
+    def assign(centroids):
+        # pairwise distances, shape (len(x), len(centroids)), and nearest centroid per row
+        dist = xp.linalg.norm(x[:, None, :] - centroids[None, :, :], axis=2)
+        return dist, xp.argmin(dist, axis=1)
+
+    idx          = xp.arange(xi.shape[0])
+    distortion_0 = xp.zeros(xi.shape[0])
+    for _ in range(max_iter):
+        distance, pred = assign(xi)
+        mask          = pred == idx[:, None]
+        counts        = xp.count_nonzero(mask, axis=1)
+        empty         = counts == 0
+        counts[empty] = 1  # avoid division by zero for clusters without members
+        # mean member-to-centroid distance per cluster, shape (number of clusters,)
+        distortion_1  = xp.where(mask.T, distance, 0).sum(axis=0) / counts
+        if xp.linalg.norm(distortion_0 - distortion_1) <= thresh * xp.linalg.norm(distortion_1):
+            break
+        sums          = xp.where(mask[:, :, None], x, 0).sum(axis=1)
+        # empty clusters keep their previous centroid instead of collapsing to the origin
+        xi            = xp.where(empty[:, None], xi, sums / counts[:, None])
+        distortion_0  = distortion_1
+    else:
+        _, pred = assign(xi)  # labels must refer to the centroids that are returned
+    return xi, pred
+    
     
 class Boltzmann0D2VBactchedSolver:
     
@@ -161,9 +206,11 @@ def __init__(self, tps, comm):
         self.__parse_config_file__(sys.argv[2])
         self.xp_module          = np
         boltzmann_dir           = self.param.output_dir
-        isExist = os.path.exists(boltzmann_dir)
-        if not isExist:
-           os.makedirs(boltzmann_dir)
+        
+        if (self.rankG==0):
+            isExist = os.path.exists(boltzmann_dir)
+            if not isExist:
+                os.makedirs(boltzmann_dir)
            
         num_gpus_per_node = 1 
         if self.param.use_gpu==1:
@@ -623,16 +670,100 @@ def t1():
                 thread.join()
         return        
 
-    async def fetch_asnyc(self, interface):
-        ts = TaskSpace("T")
-        for grid_idx in self.active_grid_idx:
-            @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
+    def solve_init(self):
+        """Run the per-grid host-to-device setup and store the initial state "u0".
+
+        One thread per grid first calls bte_solver.host_to_device_setup. Then, with
+        self.xp_module set to cupy (use_gpu==1) or numpy, each active grid gets its
+        _op_imat_vx identity stack and "u0" = dot(qA.T, f_mw / dot(mm_op, f_mw)).
+        """
+        rank                    = self.comm.Get_rank()
+        npes                    = self.comm.Get_size()
+        n_grids                 = self.param.n_grids
+        gidx_to_device_map      = self.gidx_to_device_map
+
+        # NOTE(review): this loop enters cp.cuda.Device even when use_gpu==0 -- confirm intended
+        thread_pool = list()
+        for grid_idx in range(self.param.n_grids):
-            def t1():
-                print("rank [%d/%d] hello from parla task %d"%(self.rankG, self.npesG, grid_idx), flush=True)
-                return
-        await ts
-        return
+            # bind the loop variable now; the thread may still run after the loop has advanced
+            def t1(grid_idx=grid_idx):
+                dev_id = gidx_to_device_map(grid_idx, n_grids)
+                print("rank [%d/%d] setting grid %d to device %d"%(rank, npes, grid_idx, dev_id), flush=True)
+                with cp.cuda.Device(dev_id):
+                    s1 = cp.cuda.Stream(non_blocking=True)
+                    with s1:
+                        self.bte_solver.host_to_device_setup(dev_id, grid_idx)
+                    s1.synchronize()
+
+            thread = threading.Thread(target=t1)
+            thread_pool.append(thread)
+            thread.start()
         
+        for thread in thread_pool:
+            thread.join()
+
+        def ts_op_setup(grid_idx):
+            xp                                      = self.xp_module
+            f_mw                                    = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+            n_pts                                   = f_mw.shape[1]
+            Qmat                                    = self.bte_solver._op_qmat[grid_idx]
+            INr                                     = xp.eye(Qmat.shape[1])
+            self.bte_solver._op_imat_vx[grid_idx]   = xp.einsum("i,jk->ijk",xp.ones(n_pts), INr)
+            if self.param.use_clstr_inp==True:
+                assert n_pts == self.param.n_sub_clusters
+
+        if(self.param.use_gpu==1):
+            self.xp_module = cp
+            thread_pool = list()
+            for grid_idx in self.active_grid_idx:
+                dev_id = gidx_to_device_map(grid_idx, n_grids)
+                def t1(grid_idx=grid_idx, dev_id=dev_id):
+                    with cp.cuda.Device(dev_id):
+                        s1 = cp.cuda.Stream(non_blocking=True)
+                        with s1:
+                            ts_op_setup(grid_idx)
+                            vth          = self.bte_solver._par_vth[grid_idx]
+                            qA           = self.bte_solver._op_diag_dg[grid_idx]
+                            mw           = bte_utils.get_maxwellian_3d(vth, 1)
+                            mm_op        = self.bte_solver._op_mass[grid_idx] * mw(0) * vth**3
+                            f_mw         = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                            f_mw         = f_mw/cp.dot(mm_op, f_mw)
+                            f_mw         = cp.dot(qA.T, f_mw)
+                            self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", cp.copy(f_mw))
+                        s1.synchronize()
+
+                thread = threading.Thread(target=t1)
+                thread_pool.append(thread)
+                thread.start()
+
+            for thread in thread_pool:
+                thread.join()
+
+        else:
+            self.xp_module = np
+            thread_pool = list()
+            for grid_idx in self.active_grid_idx:
+                def t1(grid_idx=grid_idx):
+                    ts_op_setup(grid_idx)
+                    vth          = self.bte_solver._par_vth[grid_idx]
+                    qA           = self.bte_solver._op_diag_dg[grid_idx]
+                    mw           = bte_utils.get_maxwellian_3d(vth, 1)
+                    mm_op        = self.bte_solver._op_mass[grid_idx] * mw(0) * vth**3
+                    f_mw         = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                    f_mw         = f_mw/np.dot(mm_op, f_mw)
+                    f_mw         = np.dot(qA.T, f_mw)
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", np.copy(f_mw))
+
+                thread = threading.Thread(target=t1)
+                thread_pool.append(thread)
+                thread.start()
+
+            for thread in thread_pool:
+                thread.join()
+        return
+    
+    async def fetch_asnyc(self, interface):
+        xp                      = self.xp_module
         use_interp              = self.param.use_clstr_inp
         gidx_to_pidx            = self.grid_idx_to_spatial_idx_map
         heavy_temp              = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
@@ -667,70 +798,70 @@ def t1():
         ns_by_n0[ns_by_n0<=0]   = 0
         m_bte                   = np.concatenate([ExbyN.reshape((-1, 1)), EybyN.reshape((-1, 1)), Tg.reshape((-1, 1)), ion_deg.reshape((-1, 1))] + [ ns_by_n0[i].reshape((-1, 1)) for i in range(ns_by_n0.shape[0])], axis=1)
         
-        self.sub_cluster_idx_to_pidx = None
-        self.sub_cluster_c           = None          
-        gidx_to_device_map           = self.gidx_to_device_map
+        self.m_bte              = m_bte
+        self.sub_cluster_c      = None
+        self.sub_cluster_c_lbl  = None
+        gidx_to_device_map      = self.gidx_to_device_map
         
         if (use_interp == True):
             n_sub_clusters               = self.param.n_sub_clusters
-            self.sub_cluster_idx_to_pidx = [[None for i in range(n_sub_clusters)] for i in range(self.param.n_grids)]
             self.sub_cluster_c           = [None for i in range(self.param.n_grids)]
+            self.sub_cluster_c_lbl       = [None for i in range(self.param.n_grids)]
             
-            def normalize(obs):
-                std_obs   = np.std(obs, axis=0)
+            def normalize(obs, xp):
+                std_obs   = xp.std(obs, axis=0)
                 std_obs[std_obs == 0.0] = 1.0
                 return obs/std_obs, std_obs
 
             ts = TaskSpace("T")
             for grid_idx in self.active_grid_idx:
-                @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
+                dev_id    = self.gidx_to_device_map(grid_idx, n_grids)
+                @spawn(ts[grid_idx], placement=[gpu(dev_id)], vcus=0.0)
                 def t1():
-                    dev_id                       = self.gidx_to_device_map(grid_idx, n_grids)
-                    m                            = m_bte[gidx_to_pidx[grid_idx]]
-                    mw , mw_std                  = normalize(m)
+                    # xp                           = cp
+                    # m                            = xp.array(m_bte[gidx_to_pidx[grid_idx]])
+                    # mw , mw_std                  = normalize(m, xp)
+                    # mcw, membership_m            = k_means(mw, num_clusters=self.param.n_sub_clusters, max_iter=1000, thresh=1e-8, rand_seed=self.param.rand_seed, xp=xp)
                     
                     # to repoduce clusters
+                    xp                           = np
                     np.random.seed(self.param.rand_seed)
-                    
+                    m                            = m_bte[gidx_to_pidx[grid_idx]]
+                    mw , mw_std                  = normalize(m, xp)
                     mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
                     mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=1000, thresh=1e-8, check_finite=False)[0]
                     mcw0[0:mcw.shape[0], :]      = mcw[:,:]
                     mcw                          = mcw0
+                    dist_mat                     = xp.linalg.norm(mw[:, None, :] - mcw[None, : , :], axis=2)
+                    membership_m                 = xp.argmin(dist_mat, axis=1)
                     
-                    assert mcw.shape[0] == self.param.n_sub_clusters
-                    
-                    mc                           = mcw * mw_std
-                    dist_mat                     = np.array([np.linalg.norm(mw - mcw[i], axis=1) for i in range(n_sub_clusters)]).T
-                    membership_m                 = np.argmin(dist_mat, axis=1)
-                    self.sub_cluster_c[grid_idx] = mc
-                    
-                    for c_idx in range(n_sub_clusters):
-                        self.sub_cluster_idx_to_pidx[grid_idx][c_idx] = np.argwhere(membership_m==c_idx)[:,0]
-                    
+                    assert mcw.shape[0]               == self.param.n_sub_clusters
+                    mc                                = mcw * mw_std
+                    self.sub_cluster_c[grid_idx]      = mc
+                    self.sub_cluster_c_lbl[grid_idx]  = membership_m
                     
-                        
-                    n0       = np.ones(mc.shape[0]) * self.param.n0
+                    n0       = xp.ones(mc.shape[0]) * self.param.n0
                     Ex       = mc[: , 0] * self.param.n0 * self.param.Td_fac
                     Ey       = mc[: , 1] * self.param.n0 * self.param.Td_fac
                     Tg       = mc[: , 2]
                     ne       = mc[: , 3] * self.param.n0
                     ni       = mc[: , 3] * self.param.n0
-                    ns_by_n0 = np.transpose(mc[: , 4:])
-                    EMag     = np.sqrt(Ex**2 + Ey**2)
+                    ns_by_n0 = xp.transpose(mc[: , 4:])
+                    EMag     = xp.sqrt(Ex**2 + Ey**2)
                     
                     if self.param.verbose == 1 :
                         print("rank [%d/%d] Boltzmann solver inputs for v-space grid id %d"%(self.rankG, self.npesG, grid_idx), flush=True)
                         print("Efreq = %.4E [1/s]" %(self.param.Efreq)      , flush=True)
                         print("n_pts = %d" % self.grid_idx_to_npts[grid_idx], flush=True)
                         
-                        print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)       , np.max(n0))   , flush=True)
-                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(np.min(ExbyN)    , np.max(ExbyN)), flush=True)
-                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN)    , np.max(EybyN)), flush=True)
-                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)       , np.max(Tg))   , flush=True)
-                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)       , np.max(ne))   , flush=True)
+                        print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(xp.min(n0)       , xp.max(n0))   , flush=True)
+                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(xp.min(ExbyN)    , xp.max(ExbyN)), flush=True)
+                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(xp.min(EybyN)    , xp.max(EybyN)), flush=True)
+                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(xp.min(Tg)       , xp.max(Tg))   , flush=True)
+                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(xp.min(ne)       , xp.max(ne))   , flush=True)
                         
                         for i in range(ns_by_n0.shape[0]):
-                            print("[%d] ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(i, np.min(ns_by_n0[i]) , np.max(ns_by_n0[i])), flush=True)
+                            print("[%d] ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(i, xp.min(ns_by_n0[i]) , xp.max(ns_by_n0[i])), flush=True)
                                             
                     if (use_gpu==1):
                         with cp.cuda.Device(dev_id):
@@ -769,48 +900,46 @@ def t1():
         else:
             ts = TaskSpace("T")
             for grid_idx in self.active_grid_idx:
-                @spawn(ts[grid_idx], placement=[cpu], vcus=0.0)
+                dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
+                @spawn(ts[grid_idx], placement=[gpu(dev_id)], vcus=0.0)
                 def t1():
                     bte_idx           = gidx_to_pidx[grid_idx]
-                    dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
-                    
-                    mc                = m_bte[bte_idx]
+                    mc                = xp.array(m_bte[bte_idx])
                     
-                    n0                = np.ones(mc.shape[0]) * self.param.n0
+                    n0                = xp.ones(mc.shape[0]) * self.param.n0
                     Ex                = mc[: , 0] * self.param.n0 * self.param.Td_fac
                     Ey                = mc[: , 1] * self.param.n0 * self.param.Td_fac
                     
                     Tg                = mc[: , 2]
                     ne                = mc[: , 3] * self.param.n0
                     ni                = mc[: , 3] * self.param.n0
-                    ns_by_n0          = np.transpose(mc[: , 4:])
-                    
-                    EMag              = np.sqrt(Ex**2 + Ey**2)
+                    ns_by_n0          = xp.transpose(mc[: , 4:])
+                    EMag              = xp.sqrt(Ex**2 + Ey**2)
                 
                     if self.param.verbose == 1 :
                         print("rank [%d/%d] Boltzmann solver inputs for v-space grid id %d"%(self.rankG, self.npesG, grid_idx), flush=True)
                         print("Efreq = %.4E [1/s]" %(self.param.Efreq)      , flush=True)
                         print("n_pts = %d" % self.grid_idx_to_npts[grid_idx], flush=True)
                         
-                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(np.min(ExbyN), np.max(ExbyN)), flush=True)
-                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN), np.max(EybyN)), flush=True)
-                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg))   , flush=True)
-                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne))   , flush=True)
+                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(xp.min(ExbyN), xp.max(ExbyN)), flush=True)
+                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(xp.min(EybyN), xp.max(EybyN)), flush=True)
+                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(xp.min(Tg)   , xp.max(Tg))   , flush=True)
+                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(xp.min(ne)   , xp.max(ne))   , flush=True)
                         
                         for i in range(ns_by_n0.shape[0]):
-                            print("ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(np.min(ns_by_n0[i]) , np.max(ns_by_n0[i])), flush=True)
+                            print("ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(xp.min(ns_by_n0[i]) , xp.max(ns_by_n0[i])), flush=True)
             
             
-                    if (use_gpu == 1):
-                        with cp.cuda.Device(dev_id):
-                            n0   = cp.array(n0) 
-                            ne   = cp.array(ne)
-                            ni   = cp.array(ni)
-                            Ex   = cp.array(Ex)
-                            Ey   = cp.array(Ey)
-                            Tg   = cp.array(Tg)
-                            EMag = cp.sqrt(Ex**2 + Ey**2)
-                            ns_by_n0 = cp.array(ns_by_n0)
+                    # if (use_gpu == 1):
+                    #     with cp.cuda.Device(dev_id):
+                    #         n0   = cp.array(n0) 
+                    #         ne   = cp.array(ne)
+                    #         ni   = cp.array(ni)
+                    #         Ex   = cp.array(Ex)
+                    #         Ey   = cp.array(Ey)
+                    #         Tg   = cp.array(Tg)
+                    #         EMag = cp.sqrt(Ex**2 + Ey**2)
+                    #         ns_by_n0 = cp.array(ns_by_n0)
                     
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "ns_by_n0", ns_by_n0)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
@@ -1005,14 +1134,17 @@ def t1():
                         h_curr    = self.bte_solver.normalized_distribution(grid_idx, u0)
                         qoi       = self.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
                         rr_cpu    = xp.asnumpy(qoi["rates"])
+                        inp_mask  = xp.asnumpy(self.sub_cluster_c_lbl[grid_idx]) == np.arange(self.param.n_sub_clusters)[:, None]
                         
-                        rr_interp = np.zeros(len(gidx_to_pidx_map[grid_idx]))
-                        for r_idx in range(n_reactions):
-                            for c_idx in range(self.param.n_sub_clusters):
-                                rr_interp[self.sub_cluster_idx_to_pidx[grid_idx][c_idx]] = rr_cpu[TPSINDEX.RR_IDX[r_idx]][c_idx] * self.param.N_Avo
-                        
-                            rates[r_idx][gidx_to_pidx_map[grid_idx]] = rr_interp
+                        rr_interp = np.zeros((n_reactions, len(gidx_to_pidx_map[grid_idx])))
                         
+                        for c_idx in range(self.param.n_sub_clusters):
+                            inp_idx = inp_mask[c_idx]
+                            for r_idx in range(n_reactions):
+                                rr_interp[r_idx, inp_idx] = rr_cpu[TPSINDEX.RR_IDX[r_idx]][c_idx] * self.param.N_Avo
+
+                        for r_idx in range(n_reactions):                            
+                            rates[r_idx][gidx_to_pidx_map[grid_idx]] = rr_interp[r_idx, :]
                 await ts
                 rates = rates.reshape((-1))
                 rates[rates<0] = 0.0
@@ -1151,6 +1283,9 @@ class pp(enum.IntEnum):
 
 def profile_stats(boltzmann:Boltzmann0D2VBactchedSolver, p_tt: profile_t, p_nn, fname, comm):
     
+    rank = comm.Get_rank()
+    npes = comm.Get_size()
+    
     Nx = boltzmann.param.n_grids * boltzmann.param.n_sub_clusters
     Nv = (boltzmann.param.Nr + 1) * (boltzmann.param.l_max + 1)
     
@@ -1182,14 +1317,16 @@ def profile_stats(boltzmann:Boltzmann0D2VBactchedSolver, p_tt: profile_t, p_nn,
                 tt[pp.TPS_PUSH][0] , tt[pp.TPS_PUSH][1] , tt[pp.TPS_PUSH][2] ]
     
     data_str= ["%.4E"%d for d in data]
-    if fname!="":
-        with open(fname, "a") as f:
-            f.write(",".join(header)+"\n")
-            f.write(",".join(data_str)+"\n")
-            f.close()
-    else:
-        print(",".join(header))
-        print(",".join(data_str))
+    
+    if rank ==0 :
+        if fname!="":
+            with open(fname, "a") as f:
+                f.write(",".join(header)+"\n")
+                f.write(",".join(data_str)+"\n")
+                f.close()
+        else:
+            print(",".join(header))
+            print(",".join(data_str))
 
 
 def driver_w_parla(comm):
@@ -1197,12 +1334,10 @@ def driver_w_parla(comm):
     rank = comm.Get_rank()
     npes = comm.Get_size()
     
-    dev  = rank % 3
-    
     with Parla():
-        @spawn(placement=cpu, vcus=0)
+        dev_id = rank % len(gpu)
+        @spawn(placement=[gpu(dev_id)], vcus=0)
         async def __main__():
-            
             # TPS solver
             profile_tt[pp.TPS_SETUP].start()
             tps = libtps.Tps(comm)
@@ -1220,10 +1355,6 @@ async def __main__():
             tps.solveStep()
             tps.push(interface)
             
-            # with cp.cuda.Device(dev):
-            #     cp.cuda.runtime.deviceSynchronize()
-            # comm.Barrier()
-            
             boltzmann = Boltzmann0D2VBactchedSolver(tps, comm)
             rank      = boltzmann.comm.Get_rank()
             npes      = boltzmann.comm.Get_size()
@@ -1232,10 +1363,9 @@ async def __main__():
             boltzmann.grid_setup(interface)
             profile_tt[pp.BTE_SETUP].stop()
             
-            # await boltzmann.solve_init_async()
-            # xp        = boltzmann.bte_solver.xp_module
-            xp        = cp
-            max_iters = 1#boltzmann.param.tps_bte_max_iter
+            await boltzmann.solve_init_async()
+            xp        = boltzmann.bte_solver.xp_module
+            max_iters = boltzmann.param.tps_bte_max_iter
             iter      = 0
             tt        = 0
             tau       = (1/boltzmann.param.Efreq)
@@ -1257,27 +1387,20 @@ async def __main__():
                 print("tps steps per cycle : ", tps_sper_cycle, "bte_steps per cycle", bte_sper_cycle, flush=True)
                 
             while (iter<max_iters):
-                
                 if (iter%cycle_freq==0):
                     interface.saveDataCollection(cycle=(iter//cycle_freq), time=iter)
                 
                 # ########################## BTE solve ##################################################
                 profile_tt[pp.BTE_FETCH].start()
                 await boltzmann.fetch_asnyc(interface)
-                #boltzmann.fetch(interface)
                 profile_tt[pp.BTE_FETCH].stop()
                 
-                """
                 if (boltzmann.param.solver_type=="steady-state"):
                     
                     profile_tt[pp.BTE_SOLVE].start()
                     await boltzmann.solve_async()
                     profile_tt[pp.BTE_SOLVE].stop()
                     
-                    with cp.cuda.Device(dev):
-                        cp.cuda.runtime.deviceSynchronize()
-                    comm.Barrier()
-                    
                     profile_tt[pp.BTE_PUSH].start()
                     ts = TaskSpace("T")
                     for grid_idx in boltzmann.active_grid_idx:
@@ -1383,12 +1506,6 @@ def t1():
                                 u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
                                 boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d_rank_%d_npes_%d"%(grid_idx, rank, npes))
                 
-                with cp.cuda.Device(dev):
-                    cp.cuda.runtime.deviceSynchronize()
-                comm.Barrier()
-                """
-                
-                
                 ################### tps solve ######################################
                 profile_tt[pp.TPS_FETCH].start()
                 tps.fetch(interface)
@@ -1433,9 +1550,8 @@ def t1():
                 
                 tt += dt_tps * tps_idx
                 iter+=1
-                comm.Barrier()
-                
-            #profile_stats(boltzmann, profile_tt, profile_nn, boltzmann.param.out_fname+"_profile.csv" , comm)
+            
+            profile_stats(boltzmann, profile_tt, profile_nn, boltzmann.param.out_fname+"_profile.csv" , comm)
             tps.solveEnd()
             comm.Barrier()
             return tps.getStatus()
@@ -1445,4 +1561,4 @@ def t1():
     driver_w_parla(comm)
     
             
-            
\ No newline at end of file
+            

From f4624ebb4a89c9a0ff32dbde354e15245e1ab033 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Mon, 18 Mar 2024 12:12:59 -0500
Subject: [PATCH 53/75] single stream no parla + mpi bte solver added

---
 src/tps-bte_0d3v.py | 597 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 471 insertions(+), 126 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 8940abaf4..123763d6b 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -465,6 +465,7 @@ def asnumpy(a):
         return
     
     def fetch(self, interface):
+        xp                      = self.xp_module
         use_interp              = self.param.use_clstr_inp
         gidx_to_pidx            = self.grid_idx_to_spatial_idx_map
         heavy_temp              = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
@@ -474,88 +475,94 @@ def fetch(self, interface):
         electron_temp           = np.array(interface.HostRead(libtps.t2bIndex.ElectronTemperature), copy=False)
         efield                  = np.array(interface.HostRead(libtps.t2bIndex.ElectricField), copy=False).reshape((2, tps_npts))
         species_densities       = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(nspecies, tps_npts)
+        
         cs_avail_species        = self.bte_solver._avail_species
+        
         n0                      = np.sum(species_densities, axis=0) - species_densities[TPSINDEX.ELE_IDX]
         ns_by_n0                = np.concatenate([species_densities[TPSINDEX.MOLE_FRAC_IDX[i]]/n0 for i in range(len(cs_avail_species))]).reshape((len(cs_avail_species), tps_npts))
         
-        n_grids            = self.param.n_grids 
-        use_gpu            = self.param.use_gpu
+        n_grids                 = self.param.n_grids 
+        use_gpu                 = self.param.use_gpu
         
-        Tg                 = heavy_temp
+        Tg                      = heavy_temp
         
-        Ex                 = efield[0]
-        Ey                 = efield[1]
+        Ex                      = efield[0]
+        Ey                      = efield[1]
     
-        ExbyN              = Ex/n0/self.param.Td_fac
-        EybyN              = Ey/n0/self.param.Td_fac
+        ExbyN                   = Ex/n0/self.param.Td_fac
+        EybyN                   = Ey/n0/self.param.Td_fac
         
-        Ex                 = ExbyN * self.param.n0 * self.param.Td_fac
-        Ey                 = EybyN * self.param.n0 * self.param.Td_fac
+        Ex                      = ExbyN * self.param.n0 * self.param.Td_fac
+        Ey                      = EybyN * self.param.n0 * self.param.Td_fac
         
-        ion_deg              = species_densities[TPSINDEX.ELE_IDX]/n0
-        ion_deg[ion_deg<=0]  = 1e-16
-        ns_by_n0[ns_by_n0<=0]= 0
-        m_bte                = np.concatenate([ExbyN.reshape((-1, 1)), EybyN.reshape((-1, 1)), Tg.reshape((-1, 1)), ion_deg.reshape((-1, 1))] + [ ns_by_n0[i].reshape((-1, 1)) for i in range(ns_by_n0.shape[0])], axis=1)
+        ion_deg                 = species_densities[TPSINDEX.ELE_IDX]/n0
+        ion_deg[ion_deg<=0]     = 1e-16
+        ns_by_n0[ns_by_n0<=0]   = 0
+        m_bte                   = np.concatenate([ExbyN.reshape((-1, 1)), EybyN.reshape((-1, 1)), Tg.reshape((-1, 1)), ion_deg.reshape((-1, 1))] + [ ns_by_n0[i].reshape((-1, 1)) for i in range(ns_by_n0.shape[0])], axis=1)
         
-        self.sub_cluster_idx_to_pidx = None
-        self.sub_cluster_c           = None          
-        gidx_to_device_map           = self.gidx_to_device_map
+        self.m_bte              = m_bte
+        self.sub_cluster_c      = None
+        self.sub_cluster_c_lbl  = None
+        gidx_to_device_map      = self.gidx_to_device_map
         
         if (use_interp == True):
             n_sub_clusters               = self.param.n_sub_clusters
-            self.sub_cluster_idx_to_pidx = [[None for i in range(n_sub_clusters)] for i in range(self.param.n_grids)]
             self.sub_cluster_c           = [None for i in range(self.param.n_grids)]
+            self.sub_cluster_c_lbl       = [None for i in range(self.param.n_grids)]
             
-            def normalize(obs):
-                std_obs   = np.std(obs, axis=0)
+            def normalize(obs, xp):
+                std_obs   = xp.std(obs, axis=0)
                 std_obs[std_obs == 0.0] = 1.0
                 return obs/std_obs, std_obs
 
-            thread_pool = list()
             for grid_idx in self.active_grid_idx:
+                dev_id    = self.gidx_to_device_map(grid_idx, n_grids)
+                
                 def t1():
-                    dev_id                       = self.gidx_to_device_map(grid_idx, n_grids)
-                    m                            = m_bte[gidx_to_pidx[grid_idx]]
-                    mw , mw_std                  = normalize(m)
+                    # xp                           = cp
+                    # m                            = xp.array(m_bte[gidx_to_pidx[grid_idx]])
+                    # mw , mw_std                  = normalize(m, xp)
+                    # mcw, membership_m            = k_means(mw, num_clusters=self.param.n_sub_clusters, max_iter=1000, thresh=1e-8, rand_seed=self.param.rand_seed, xp=xp)
                     
                     # to repoduce clusters
+                    xp                           = np
                     np.random.seed(self.param.rand_seed)
-                    
+                    m                            = m_bte[gidx_to_pidx[grid_idx]]
+                    mw , mw_std                  = normalize(m, xp)
                     mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
                     mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=1000, thresh=1e-8, check_finite=False)[0]
                     mcw0[0:mcw.shape[0], :]      = mcw[:,:]
                     mcw                          = mcw0
+                    dist_mat                     = xp.linalg.norm(mw[:, None, :] - mcw[None, : , :], axis=2)
+                    membership_m                 = xp.argmin(dist_mat, axis=1)
                     
-                    mc                           = mcw * mw_std
-                    dist_mat                     = np.array([np.linalg.norm(mw - mcw[i], axis=1) for i in range(n_sub_clusters)]).T
-                    membership_m                 = np.argmin(dist_mat, axis=1)
-                    self.sub_cluster_c[grid_idx] = mc
+                    assert mcw.shape[0]               == self.param.n_sub_clusters
+                    mc                                = mcw * mw_std
+                    self.sub_cluster_c[grid_idx]      = mc
+                    self.sub_cluster_c_lbl[grid_idx]  = membership_m
                     
-                    for c_idx in range(n_sub_clusters):
-                        self.sub_cluster_idx_to_pidx[grid_idx][c_idx] = np.argwhere(membership_m==c_idx)[:,0]
-                        
-                    n0       = np.ones(mc.shape[0]) * self.param.n0
+                    n0       = xp.ones(mc.shape[0]) * self.param.n0
                     Ex       = mc[: , 0] * self.param.n0 * self.param.Td_fac
                     Ey       = mc[: , 1] * self.param.n0 * self.param.Td_fac
                     Tg       = mc[: , 2]
                     ne       = mc[: , 3] * self.param.n0
                     ni       = mc[: , 3] * self.param.n0
-                    ns_by_n0 = np.transpose(mc[: , 4:])
-                    EMag     = np.sqrt(Ex**2 + Ey**2)
+                    ns_by_n0 = xp.transpose(mc[: , 4:])
+                    EMag     = xp.sqrt(Ex**2 + Ey**2)
                     
                     if self.param.verbose == 1 :
-                        print("rank = %d Boltzmann solver inputs for v-space grid id %d"%(self.rankG, grid_idx))
-                        print("Efreq = %.4E [1/s]" %(self.param.Efreq))
-                        print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
+                        print("rank [%d/%d] Boltzmann solver inputs for v-space grid id %d"%(self.rankG, self.npesG, grid_idx), flush=True)
+                        print("Efreq = %.4E [1/s]" %(self.param.Efreq)      , flush=True)
+                        print("n_pts = %d" % self.grid_idx_to_npts[grid_idx], flush=True)
                         
-                        print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(np.min(n0)       , np.max(n0)))
-                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(np.min(ExbyN)    , np.max(ExbyN)))
-                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN)    , np.max(EybyN)))
-                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)       , np.max(Tg)))
-                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)       , np.max(ne)))
+                        print("n0    (min)               = %.12E [1/m^3]      \t n0   (max) = %.12E [1/m^3] "%(xp.min(n0)       , xp.max(n0))   , flush=True)
+                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(xp.min(ExbyN)    , xp.max(ExbyN)), flush=True)
+                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(xp.min(EybyN)    , xp.max(EybyN)), flush=True)
+                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(xp.min(Tg)       , xp.max(Tg))   , flush=True)
+                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(xp.min(ne)       , xp.max(ne))   , flush=True)
                         
                         for i in range(ns_by_n0.shape[0]):
-                            print("[%d] ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(i, np.min(ns_by_n0[i]) , np.max(ns_by_n0[i])))
+                            print("[%d] ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(i, xp.min(ns_by_n0[i]) , xp.max(ns_by_n0[i])), flush=True)
                                             
                     if (use_gpu==1):
                         with cp.cuda.Device(dev_id):
@@ -589,57 +596,52 @@ def t1():
                     
                     return
                 
-                thread = threading.Thread(target=t1)
-                thread_pool.append(thread)
-                thread.start()
-            
-            for thread in thread_pool:
-                thread.join()
+                with cp.cuda.Device(dev_id):
+                    t1()
             
         else:
-            thread_pool = list()
+            ts = TaskSpace("T")
             for grid_idx in self.active_grid_idx:
+                dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
+                
                 def t1():
                     bte_idx           = gidx_to_pidx[grid_idx]
-                    dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
-                    
-                    mc                = m_bte[bte_idx]
+                    mc                = xp.array(m_bte[bte_idx])
                     
-                    n0                = np.ones(mc.shape[0]) * self.param.n0
+                    n0                = xp.ones(mc.shape[0]) * self.param.n0
                     Ex                = mc[: , 0] * self.param.n0 * self.param.Td_fac
                     Ey                = mc[: , 1] * self.param.n0 * self.param.Td_fac
                     
                     Tg                = mc[: , 2]
                     ne                = mc[: , 3] * self.param.n0
                     ni                = mc[: , 3] * self.param.n0
-                    ns_by_n0          = np.transpose(mc[: , 4:])
-                    
-                    EMag              = np.sqrt(Ex**2 + Ey**2)
+                    ns_by_n0          = xp.transpose(mc[: , 4:])
+                    EMag              = xp.sqrt(Ex**2 + Ey**2)
                 
                     if self.param.verbose == 1 :
-                        print("rank = %d Boltzmann solver inputs for v-space grid id %d"%(self.rankG, grid_idx))
-                        print("Efreq = %.4E [1/s]" %(self.param.Efreq))
-                        print("n_pts = %d" % self.grid_idx_to_npts[grid_idx])
+                        print("rank [%d/%d] Boltzmann solver inputs for v-space grid id %d"%(self.rankG, self.npesG, grid_idx), flush=True)
+                        print("Efreq = %.4E [1/s]" %(self.param.Efreq)      , flush=True)
+                        print("n_pts = %d" % self.grid_idx_to_npts[grid_idx], flush=True)
                         
-                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(np.min(ExbyN), np.max(ExbyN)))
-                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(np.min(EybyN), np.max(EybyN)))
-                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(np.min(Tg)   , np.max(Tg)))
-                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(np.min(ne)   , np.max(ne)))
+                        print("Ex/n0 (min)               = %.12E [Td]         \t Ex/n0(max) = %.12E [Td]    "%(xp.min(ExbyN), xp.max(ExbyN)), flush=True)
+                        print("Ey/n0 (min)               = %.12E [Td]         \t Ey/n0(max) = %.12E [Td]    "%(xp.min(EybyN), xp.max(EybyN)), flush=True)
+                        print("Tg    (min)               = %.12E [K]          \t Tg   (max) = %.12E [K]     "%(xp.min(Tg)   , xp.max(Tg))   , flush=True)
+                        print("ne    (min)               = %.12E [1/m^3]      \t ne   (max) = %.12E [1/m^3] "%(xp.min(ne)   , xp.max(ne))   , flush=True)
                         
                         for i in range(ns_by_n0.shape[0]):
-                            print("ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(np.min(ns_by_n0[i]) , np.max(ns_by_n0[i])))
+                            print("ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(xp.min(ns_by_n0[i]) , xp.max(ns_by_n0[i])), flush=True)
             
             
-                    if (use_gpu == 1):
-                        with cp.cuda.Device(dev_id):
-                            n0   = cp.array(n0) 
-                            ne   = cp.array(ne)
-                            ni   = cp.array(ni)
-                            Ex   = cp.array(Ex)
-                            Ey   = cp.array(Ey)
-                            Tg   = cp.array(Tg)
-                            EMag = cp.sqrt(Ex**2 + Ey**2)
-                            ns_by_n0 = cp.array(ns_by_n0)
+                    # if (use_gpu == 1):
+                    #     with cp.cuda.Device(dev_id):
+                    #         n0   = cp.array(n0) 
+                    #         ne   = cp.array(ne)
+                    #         ni   = cp.array(ni)
+                    #         Ex   = cp.array(Ex)
+                    #         Ey   = cp.array(Ey)
+                    #         Tg   = cp.array(Tg)
+                    #         EMag = cp.sqrt(Ex**2 + Ey**2)
+                    #         ns_by_n0 = cp.array(ns_by_n0)
                     
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "ns_by_n0", ns_by_n0)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
@@ -661,13 +663,10 @@ def t1():
                     # cp.save(self.param.out_fname + "_E_%02d.npy"%(grid_idx)   , EMag  , grid_idx)
                     
                     return
-                
-                thread = threading.Thread(target=t1)
-                thread_pool.append(thread)
-                thread.start()
-            
-            for thread in thread_pool:
-                thread.join()
+
+                with cp.cuda.Device(dev_id):
+                    t1()
+                    
         return        
 
     def solve_init(self):
@@ -676,23 +675,13 @@ def solve_init(self):
         n_grids                 = self.param.n_grids
         gidx_to_device_map      = self.gidx_to_device_map
         
-        thread_pool = list()
         for grid_idx in range(self.param.n_grids):
             def t1():
                 dev_id = gidx_to_device_map(grid_idx, n_grids)
                 print("rank [%d/%d] setting grid %d to device %d"%(rank, npes, grid_idx, dev_id), flush=True)
-                with cp.cuda.Device(dev_id):
-                    s1 = cp.cuda.Stream(non_blocking=True)
-                    with s1:
-                        self.bte_solver.host_to_device_setup(dev_id, grid_idx)
-                    s1.synchronize()
-        
-                thread = threading.Thread(target=t1)
-                thread_pool.append(thread)
-                thread.start()
-        
-        for thread in thread_pool:
-            thread.join()
+                self.bte_solver.host_to_device_setup(dev_id, grid_idx)
+            
+            t1()
         
         def ts_op_setup(grid_idx):
             xp                                      = self.xp_module 
@@ -707,37 +696,26 @@ def ts_op_setup(grid_idx):
             
         if(self.param.use_gpu==1):
             self.xp_module = cp
-            
-            thread_pool = list()
             for grid_idx in self.active_grid_idx:
                 dev_id = gidx_to_device_map(grid_idx, n_grids)
                 def t1():
-                    with cp.cuda.Device(dev_id):
-                        s1 = cp.cuda.Stream(non_blocking=True)
-                        with s1:
-                            ts_op_setup(grid_idx)
-                            vth          = self.bte_solver._par_vth[grid_idx]
-                            qA           = self.bte_solver._op_diag_dg[grid_idx]
-                            mw           = bte_utils.get_maxwellian_3d(vth, 1)
-                            mm_op        = self.bte_solver._op_mass[grid_idx] * mw(0) * vth**3
-                            f_mw         = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
-                            f_mw         = f_mw/cp.dot(mm_op, f_mw)
-                            f_mw         = cp.dot(qA.T, f_mw)
-                            self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", cp.copy(f_mw))
-                        
-                        s1.synchronize()
-                
-                thread = threading.Thread(target=t1)
-                thread_pool.append(thread)
-                thread.start()
-                
-            for thread in thread_pool:
-                thread.join()
+                    ts_op_setup(grid_idx)
+                    
+                    vth          = self.bte_solver._par_vth[grid_idx]
+                    qA           = self.bte_solver._op_diag_dg[grid_idx]
+                    mw           = bte_utils.get_maxwellian_3d(vth, 1)
+                    mm_op        = self.bte_solver._op_mass[grid_idx] * mw(0) * vth**3
+                    f_mw         = self.bte_solver.get_boltzmann_parameter(grid_idx, "f_mw")
+                    f_mw         = f_mw/cp.dot(mm_op, f_mw)
+                    f_mw         = cp.dot(qA.T, f_mw)
+                    
+                    self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", cp.copy(f_mw))
+            
+                with cp.cuda.Device(dev_id):
+                    t1()
             
         else:
             self.xp_module = np
-            thread_pool = list()
-            
             for grid_idx in self.active_grid_idx:
                 def t1():
                     ts_op_setup(grid_idx)
@@ -751,17 +729,147 @@ def t1():
                     f_mw         = np.dot(qA.T, f_mw)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "u0", np.copy(f_mw))
                 
-                thread = threading.Thread(target=t1)
-                thread_pool.append(thread)
-                thread.start()
+                t1()
             
-            for thread in thread_pool:
-                thread.join()
-                    
+        return
             
+    def solve_step(self, time, delta_t):
+        """
+        perform a single timestep in 0d-BTE
+        """
+        rank                    = self.rankG
+        npes                    = self.npesG
+        n_grids                 = self.param.n_grids
+        gidx_to_device_map      = self.gidx_to_device_map
         
+        for grid_idx in self.active_grid_idx:
+            dev_id  = gidx_to_device_map(grid_idx,n_grids)
+
+            def t1():
+                
+                # set the E field at time t + dt (implicit step)
+                xp    = self.bte_solver.xp_module
+                eRe   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eRe")
+                eIm   = self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm")
+                Et    = eRe * xp.cos(2 * xp.pi * self.param.Efreq * (time + delta_t)) + eIm * xp.sin(2 * xp.pi * self.param.Efreq * (time + delta_t))
+                self.bte_solver.set_boltzmann_parameter(grid_idx, "E", Et)
+                
+                u0    = self.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
+                v     = self.bte_solver.step(grid_idx, u0, self.param.atol, self.param.rtol, self.param.max_iter, time, delta_t)
+                self.bte_solver.set_boltzmann_parameter(grid_idx, "u1", v)
+
+            with cp.cuda.Device(dev_id):
+                t1()    
+        return 
+    
+    def solve(self):
+        """
+        Can be used to compute steady-state or cycle averaged BTE solutions
+        """
+        rank                    = self.rankG
+        npes                    = self.npesG
+        xp                      = self.xp_module
+        csv_write               = self.param.export_csv
+        plot_data               = self.param.plot_data
+        gidx_to_pidx_map        = self.grid_idx_to_spatial_idx_map
+        use_gpu                 = self.param.use_gpu
+        dev_id                  = self.param.dev_id
+        verbose                 = self.param.verbose
+        n_grids                 = self.param.n_grids
+        gidx_to_device_map      = self.gidx_to_device_map
+        
+        self.qoi                = [None for grid_idx in range(self.param.n_grids)]
+        self.ff                 = [None for grid_idx in range(self.param.n_grids)]
+        coll_list               = self.bte_solver.get_collision_list()
+        coll_names              = self.bte_solver.get_collision_names()
+        
+        if csv_write: 
+            data_csv = np.empty((self.tps_npts, 8 + len(coll_list)))
+            
+        for grid_idx in self.active_grid_idx:
+            dev_id = gidx_to_device_map(grid_idx,n_grids)
+            
+            def t1():
+                try:
+                    print("rank [%d/%d] BTE launching grid %d on %s"%(rank, npes, grid_idx, dev_id), flush=True)
+                    f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
+                    ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
+                    self.ff[grid_idx]  = ff
+                    self.qoi[grid_idx] = qoi
+                except:
+                    print("rank [%d/%d] solver failed for v-space gird no %d"%(self.rankG, self.npesG, grid_idx), flush=True)
+                    sys.exit(-1)
+            
+            with cp.cuda.Device(dev_id):
+                t1()
         return
     
+    def push(self, interface):
+        xp                      = self.xp_module
+        n_grids                 = self.param.n_grids
+        gidx_to_device_map      = self.gidx_to_device_map
+        gidx_to_pidx_map        = self.grid_idx_to_spatial_idx_map
+        use_interp              = self.param.use_clstr_inp
+        
+        heavy_temp  = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+        tps_npts    = len(heavy_temp)
+        
+        n_reactions = interface.nComponents(libtps.t2bIndex.ReactionRates)
+        rates       = np.array(interface.HostWrite(libtps.t2bIndex.ReactionRates), copy=False).reshape((n_reactions, tps_npts))
+        
+        if (use_interp==True):
+            if(n_reactions>0):
+                rates[:,:] = 0.0
+                for grid_idx in self.active_grid_idx:
+                    dev_id = gidx_to_device_map(grid_idx,n_grids)
+                    
+                    def t1():
+                        qA        = self.bte_solver._op_diag_dg[grid_idx]
+                        u0        = self.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                        h_curr    = self.bte_solver.normalized_distribution(grid_idx, u0)
+                        qoi       = self.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
+                        rr_cpu    = xp.asnumpy(qoi["rates"])
+                        inp_mask  = xp.asnumpy(self.sub_cluster_c_lbl[grid_idx]) == np.arange(self.param.n_sub_clusters)[:, None]
+                        
+                        rr_interp = np.zeros((n_reactions, len(gidx_to_pidx_map[grid_idx])))
+                        
+                        for c_idx in range(self.param.n_sub_clusters):
+                            inp_idx = inp_mask[c_idx]
+                            for r_idx in range(n_reactions):
+                                rr_interp[r_idx, inp_idx] = rr_cpu[TPSINDEX.RR_IDX[r_idx]][c_idx] * self.param.N_Avo
+
+                        for r_idx in range(n_reactions):                            
+                            rates[r_idx][gidx_to_pidx_map[grid_idx]] = rr_interp[r_idx, :]
+                    
+                    with cp.cuda.Device(dev_id):
+                        t1()
+                        
+                rates = rates.reshape((-1))
+                rates[rates<0] = 0.0
+        else:
+            if(n_reactions>0):
+                rates[:,:] = 0.0
+                
+                for grid_idx in self.active_grid_idx:
+                    dev_id = gidx_to_device_map(grid_idx,n_grids)
+                    
+                    def t1():
+                        qA       = self.bte_solver._op_diag_dg[grid_idx]
+                        u0       = self.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                        h_curr   = self.bte_solver.normalized_distribution(grid_idx, u0)
+                        qoi      = self.bte_solver.compute_QoIs(grid_idx, h_curr, effective_mobility=False)
+                        rr_cpu   = xp.asnumpy(qoi["rates"])
+                        
+                        for r_idx in range(n_reactions):
+                            rates[r_idx][gidx_to_pidx_map[grid_idx]] = rr_cpu[TPSINDEX.RR_IDX[r_idx]] * self.param.N_Avo
+                        
+                    with cp.cuda.Device(dev_id):
+                        t1()
+                        
+                rates = rates.reshape((-1))
+                rates[rates<0] = 0.0
+        return 
+    
     async def fetch_asnyc(self, interface):
         xp                      = self.xp_module
         use_interp              = self.param.use_clstr_inp
@@ -1556,6 +1664,243 @@ def t1():
             comm.Barrier()
             return tps.getStatus()
 
+def driver_wo_parla(comm):
+    
+    rank = comm.Get_rank()
+    npes = comm.Get_size()
+
+    dev_id = rank % (cp.cuda.runtime.getDeviceCount())
+    
+    
+    with cp.cuda.Device(dev_id):
+        def __main__():
+            # TPS solver
+            profile_tt[pp.TPS_SETUP].start()
+            tps = libtps.Tps(comm)
+            tps.parseCommandLineArgs(sys.argv)
+            tps.parseInput()
+            tps.chooseDevices()
+            tps.chooseSolver()
+            tps.initialize()
+            profile_tt[pp.TPS_SETUP].stop()
+            
+            interface = libtps.Tps2Boltzmann(tps)
+            tps.initInterface(interface)
+            tps.solveBegin()
+            # --- first TPS step is needed to initialize the EM fields
+            tps.solveStep()
+            tps.push(interface)
+            
+            boltzmann = Boltzmann0D2VBactchedSolver(tps, comm)
+            rank      = boltzmann.comm.Get_rank()
+            npes      = boltzmann.comm.Get_size()
+            
+            profile_tt[pp.BTE_SETUP].start()
+            boltzmann.grid_setup(interface)
+            profile_tt[pp.BTE_SETUP].stop()
+            
+            boltzmann.solve_init()
+            xp        = boltzmann.bte_solver.xp_module
+            max_iters = boltzmann.param.tps_bte_max_iter
+            iter      = 0
+            tt        = 0
+            tau       = (1/boltzmann.param.Efreq)
+            dt_tps    = interface.timeStep()
+            dt_bte    = boltzmann.param.dt * tau 
+            bte_steps = int(dt_tps/dt_bte)
+            n_grids   = boltzmann.param.n_grids
+            
+            cycle_freq           = 1 #int(xp.ceil(tau/dt_tps))
+            terminal_output_freq = -1
+            gidx_to_device_map = boltzmann.gidx_to_device_map
+            
+            tps_sper_cycle = int(xp.ceil(tau/dt_tps))
+            bte_sper_cycle = int(xp.ceil(tau/dt_bte))
+            bte_max_cycles = int(boltzmann.param.cycles)
+            tps_max_cycles = boltzmann.param.bte_solve_freq
+            
+            if (boltzmann.rankG==0):
+                print("tps steps per cycle : ", tps_sper_cycle, "bte_steps per cycle", bte_sper_cycle, flush=True)
+                
+            while (iter<max_iters):
+                if (iter%cycle_freq==0):
+                    interface.saveDataCollection(cycle=(iter//cycle_freq), time=iter)
+                
+                # ########################## BTE solve ##################################################
+                profile_tt[pp.BTE_FETCH].start()
+                boltzmann.fetch(interface)
+                profile_tt[pp.BTE_FETCH].stop()
+                
+                if (boltzmann.param.solver_type=="steady-state"):
+                    
+                    profile_tt[pp.BTE_SOLVE].start()
+                    boltzmann.solve()
+                    profile_tt[pp.BTE_SOLVE].stop()
+                    
+                    profile_tt[pp.BTE_PUSH].start()
+                    
+                    for grid_idx in boltzmann.active_grid_idx:
+                        def t1():
+                            boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", boltzmann.ff[grid_idx])
+                        t1()
+                        
+                    boltzmann.push(interface)
+                    profile_tt[pp.BTE_PUSH].stop()
+                    
+                    if boltzmann.param.export_csv ==1:
+                        for grid_idx in boltzmann.active_grid_idx:
+                            dev_id  = gidx_to_device_map(grid_idx,n_grids)
+                            with cp.cuda.Device(dev_id):
+                                u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                                boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d_rank_%d_npes_%d"%(grid_idx, rank, npes))
+                
+                else:
+                    
+                    assert boltzmann.param.solver_type == "transient", "unknown BTE solver type"
+                    tt_bte       = 0
+                    bte_u        = [0 for i in range(n_grids)]
+                    bte_v        = [0 for i in range(n_grids)]
+                    
+                    u_avg        = [0 for i in range(n_grids)]
+                    
+                    abs_error    = [0 for i in range(n_grids)]
+                    rel_error    = [0 for i in range(n_grids)]
+                    cycle_f1     = (0.5 * dt_bte/ (bte_sper_cycle * dt_bte))
+                    
+                    for bte_idx in range(bte_sper_cycle * bte_max_cycles +1):
+                        if (bte_idx % bte_sper_cycle == 0):
+                            for grid_idx in boltzmann.active_grid_idx:
+                                dev_id = gidx_to_device_map(grid_idx,n_grids)
+                                def t1():
+                                    u0      = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
+                                    fname   = "%s_iter%04d_grid_%04d_cycle_%0d"%(boltzmann.param.out_fname, iter, grid_idx, bte_idx//bte_sper_cycle)
+                                    
+                                    abs_error[grid_idx] = xp.max(xp.abs(bte_v[grid_idx]-u0))
+                                    rel_error[grid_idx] = abs_error[grid_idx] / xp.max(xp.abs(u0))
+                                    #print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E"%(bte_idx, tt_bte, abs_error[grid_idx], rel_error[grid_idx]))
+                                    
+                                    # if(bte_idx >0):
+                                    #     print(grid_idx, " u_ptr ", u_avg[grid_idx].data, " v_ptr " , v_avg[grid_idx].data)
+                                    
+                                    bte_v[grid_idx] = xp.copy(u0)
+                                    
+                                with cp.cuda.Device(dev_id):
+                                    t1()
+
+                            p_t3 = min_mean_max(profile_tt[pp.BTE_SOLVE].snap, comm)
+                            print("[BTE] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E --- runtime = %.4E (s) "%(bte_idx, tt_bte, max(abs_error), max(rel_error), p_t3[2]), flush=True)
+                            
+                            if max(abs_error) < boltzmann.param.atol or max(rel_error)< boltzmann.param.rtol:
+                                break
+                            
+                            if bte_idx < bte_sper_cycle * bte_max_cycles:
+                                u_avg  = [0 for i in range(n_grids)]
+                                
+                        if bte_idx == bte_sper_cycle * bte_max_cycles :
+                            break    
+                            
+                        for grid_idx in boltzmann.active_grid_idx:
+                            dev_id = gidx_to_device_map(grid_idx,n_grids)
+                            def t1():
+                                u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
+                                
+                            with cp.cuda.Device(dev_id):
+                                t1()
+                        
+                        profile_tt[pp.BTE_SOLVE].start()
+                        boltzmann.solve_step(tt_bte, dt_bte)
+                        profile_tt[pp.BTE_SOLVE].stop()
+                        
+                        if(terminal_output_freq > 0 and bte_idx % terminal_output_freq ==0):
+                            p_t3 = min_mean_max(profile_tt[pp.BTE_SOLVE].snap, comm)
+                            print("[BTE] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (bte_idx, tt_bte, p_t3[0], p_t3[1], p_t3[2]))
+                        
+                        for grid_idx in boltzmann.active_grid_idx:
+                            dev_id = gidx_to_device_map(grid_idx,n_grids)
+                            def t1():
+                                u_avg[grid_idx] += cycle_f1 * boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1")
+                                boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u0", boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u1"))
+                            
+                            with cp.cuda.Device(dev_id):
+                                t1()
+                        
+                        tt_bte += dt_bte
+                    
+                    profile_tt[pp.BTE_PUSH].start()
+                    for grid_idx in boltzmann.active_grid_idx:
+                        dev_id = gidx_to_device_map(grid_idx,n_grids)
+                        
+                        def t1():
+                            xp               = boltzmann.xp_module
+                            qA               = boltzmann.bte_solver._op_diag_dg[grid_idx]
+                            u_avg[grid_idx]  = xp.dot(qA, u_avg[grid_idx])
+                            boltzmann.bte_solver.set_boltzmann_parameter(grid_idx, "u_avg", u_avg[grid_idx])
+                        
+                        with cp.cuda.Device(dev_id):
+                            t1()
+                    
+                    boltzmann.push(interface)
+                    profile_tt[pp.BTE_PUSH].stop()
+                    
+                    if boltzmann.param.export_csv ==1:
+                        for grid_idx in boltzmann.active_grid_idx:
+                            dev_id  = gidx_to_device_map(grid_idx,n_grids)
+                            with cp.cuda.Device(dev_id):
+                                u_vec   = boltzmann.bte_solver.get_boltzmann_parameter(grid_idx, "u_avg")
+                                boltzmann.io_output_data(grid_idx, u_vec, plot_data=True, export_csv=True, fname=boltzmann.param.out_fname+"_grid_%02d_rank_%d_npes_%d"%(grid_idx, rank, npes))
+                
+                ################### tps solve ######################################
+                profile_tt[pp.TPS_FETCH].start()
+                tps.fetch(interface)
+                profile_tt[pp.TPS_FETCH].stop()
+                
+                tps_u  = 0
+                tps_v  = 0
+                tt_tps = 0
+                for tps_idx in range(tps_sper_cycle * tps_max_cycles + 1):
+                    if (tps_idx % tps_sper_cycle == 0):
+                        tps.push(interface)
+                        nspecies            = interface.Nspecies()
+                        heavy_temp          = np.array(interface.HostRead(libtps.t2bIndex.HeavyTemperature), copy=False)
+                        tps_npts            = len(heavy_temp)
+                        tps_u               = np.array(interface.HostRead(libtps.t2bIndex.SpeciesDensities), copy=False).reshape(nspecies, tps_npts)
+                        # rates               = np.array(interface.HostRead(libtps.t2bIndex.ReactionRates), copy=False).reshape((1, tps_npts))
+                        # print("rates", np.min(rates[0]), np.max(rates[0]))
+                        
+                        abs_error           = np.linalg.norm(tps_u - tps_v, axis=1)
+                        rel_error           = abs_error / np.linalg.norm(tps_u, axis=1)
+                        tps_v               = np.copy(tps_u)
+                        
+                        p_t3 = min_mean_max(profile_tt[pp.TPS_SOLVE].snap, comm)
+                        print("[TPS] step = %04d time = %.4E ||u1 - u0|| = %.4E ||u0 - u1|| / ||u0|| = %.4E -- runtime = %.4E (s)"%(tps_idx, tt_tps, np.max(abs_error), np.max(rel_error), p_t3[2]), flush=True)
+                        # if (np.max(abs_error) < boltzmann.param.atol or np.max(rel_error) < max(1e-6,boltzmann.param.rtol)):
+                        #     break
+                    
+                    if (tps_idx == tps_sper_cycle * tps_max_cycles):
+                        break
+                    
+                    profile_tt[pp.TPS_SOLVE].start()
+                    tps.solveStep()
+                    profile_tt[pp.TPS_SOLVE].stop()
+                    if(terminal_output_freq > 0 and tps_idx % terminal_output_freq ==0):
+                        p_t3 = min_mean_max(profile_tt[pp.TPS_SOLVE].snap, comm)
+                        print("[TPS] %04d simulation time = %.4E cycle step (min) = %.4E (s) step (mean) = %.4E (s) step (max) = %.4E (s)" % (tps_idx,tt_tps, p_t3[0],p_t3[1],p_t3[2]), flush=True)
+                    tt_tps +=dt_tps
+                
+                profile_tt[pp.TPS_PUSH].start()
+                tps.push(interface)
+                profile_tt[pp.TPS_PUSH].stop()
+                
+                tt += dt_tps * tps_idx
+                iter+=1
+            
+            profile_stats(boltzmann, profile_tt, profile_nn, boltzmann.param.out_fname+"_profile.csv" , comm)
+            tps.solveEnd()
+            comm.Barrier()
+            return tps.getStatus()
+
+        __main__()
+
 if __name__=="__main__":
     comm = MPI.COMM_WORLD
     driver_w_parla(comm)

From 3e180b8cbb1b7db571f1c5f2192f020ac3573705 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Tue, 19 Mar 2024 10:21:29 -0500
Subject: [PATCH 54/75] datetime stamp and parameters dump added to the timing
 output file

---
 src/tps-bte_0d3v.py | 48 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index 123763d6b..fb4608ce8 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -14,6 +14,9 @@
 import scipy.interpolate
 import scipy.cluster
 import threading
+import datetime
+# Optional: use CuPy's asynchronous stream-ordered memory pool (currently disabled)
+#cp.cuda.set_allocator(cp.cuda.MemoryAsyncPool().malloc)
 
 class profile_t:
     def __init__(self,name):
@@ -1205,11 +1208,13 @@ async def solve_async(self):
             @spawn(ts[grid_idx], placement=[parla_placement[grid_idx]], vcus=0.0)
             def t1():
                 try:
+                    cp.cuda.nvtx.RangePush("bte_solve")
                     print("rank [%d/%d] BTE launching grid %d on %s"%(rank, npes, grid_idx, parla_placement[grid_idx]), flush=True)
                     f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
                     ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
                     self.ff[grid_idx]  = ff
                     self.qoi[grid_idx] = qoi
+                    cp.cuda.nvtx.RangePop()
                 except:
                     print("rank [%d/%d] solver failed for v-space gird no %d"%(self.rankG, self.npesG, grid_idx), flush=True)
                     sys.exit(-1)
@@ -1375,6 +1380,42 @@ def asnumpy(a):
             
         return        
 
+    def params_dump(self):
+        params_dict = dict()
+        params_dict["sp_order"]         = self.param.sp_order
+        params_dict["spline_qpts"]      = self.param.spline_qpts
+        params_dict["Nr"]               = self.param.Nr
+        params_dict["l_max"]            = self.param.l_max
+        params_dict["ev_max"]           = self.param.ev_max
+        params_dict["n_grids"]          = self.param.n_grids
+        params_dict["n_sub_clusters"]   = self.param.n_sub_clusters
+        params_dict["dt"]               = self.param.dt
+        params_dict["cycles"]           = self.param.cycles
+        params_dict["solver_type"]      = self.param.solver_type
+        params_dict["atol"]             = self.param.atol
+        params_dict["rtol"]             = self.param.rtol
+        params_dict["max_iter"]         = self.param.max_iter
+        params_dict["tps_bte_max_iter"] = self.param.tps_bte_max_iter
+        params_dict["bte_solve_freq"]   = self.param.bte_solve_freq
+        params_dict["ee_collisions"]    = self.param.ee_collisions
+        params_dict["use_gpu"]          = self.param.use_gpu
+        params_dict["dev_id"]           = self.param.dev_id
+        params_dict["collisions"]       = self.param.collisions
+        params_dict["export_csv"]       = self.param.export_csv
+        params_dict["plot_data"]        = self.param.plot_data
+        params_dict["Efreq"]            = self.param.Efreq
+        params_dict["verbose"]          = self.param.verbose
+        params_dict["Te"]               = self.param.Te
+        params_dict["threads"]          = self.param.threads
+        params_dict["grid_idx"]         = self.param.grid_idx
+        params_dict["output_dir"]       = self.param.output_dir
+        params_dict["out_fname"]        = self.param.out_fname
+        params_dict["rand_seed"]        = self.param.rand_seed
+        params_dict["use_clstr_inp"]    = self.param.use_clstr_inp
+        
+        return params_dict
+        
+    
 class pp(enum.IntEnum):
     BTE_SETUP     = 0
     BTE_FETCH     = 1
@@ -1429,8 +1470,11 @@ def profile_stats(boltzmann:Boltzmann0D2VBactchedSolver, p_tt: profile_t, p_nn,
     if rank ==0 :
         if fname!="":
             with open(fname, "a") as f:
+                f.write(datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")+"\n")
+                f.write(""+str(boltzmann.params_dump())+"\n")
                 f.write(",".join(header)+"\n")
                 f.write(",".join(data_str)+"\n")
+                f.write("---" + "\n")
                 f.close()
         else:
             print(",".join(header))
@@ -1903,6 +1947,10 @@ def t1():
 
 if __name__=="__main__":
     comm = MPI.COMM_WORLD
+    # print("running without parla")
+    # driver_wo_parla(comm)
+    
+    print("running with parla")
     driver_w_parla(comm)
     
             

From acfcab5c335c3d2d3052b68f15e50a0d988cc7be Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Tue, 19 Mar 2024 22:58:27 -0500
Subject: [PATCH 55/75] disable Parla by default and report BTE solver errors

---
 src/tps-bte_0d3v.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index fb4608ce8..b00705cc6 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -63,7 +63,7 @@ def min_mean_max(a, comm: MPI.Comm):
 from   bte_0d3v_batched import bte_0d3v_batched as BoltzmannSolver
 import utils as bte_utils
 
-WITH_PARLA = 1
+WITH_PARLA = 0
 if WITH_PARLA:
     try:
         from parla import Parla
@@ -1208,15 +1208,15 @@ async def solve_async(self):
             @spawn(ts[grid_idx], placement=[parla_placement[grid_idx]], vcus=0.0)
             def t1():
                 try:
-                    cp.cuda.nvtx.RangePush("bte_solve")
+                    #cp.cuda.nvtx.RangePush("bte_solve")
                     print("rank [%d/%d] BTE launching grid %d on %s"%(rank, npes, grid_idx, parla_placement[grid_idx]), flush=True)
                     f0 = self.bte_solver.get_boltzmann_parameter(grid_idx, "u0")
                     ff , qoi = self.bte_solver.solve(grid_idx, f0, self.param.atol, self.param.rtol, self.param.max_iter, self.param.solver_type)
                     self.ff[grid_idx]  = ff
                     self.qoi[grid_idx] = qoi
-                    cp.cuda.nvtx.RangePop()
-                except:
-                    print("rank [%d/%d] solver failed for v-space gird no %d"%(self.rankG, self.npesG, grid_idx), flush=True)
+                    #cp.cuda.nvtx.RangePop()
+                except Exception as e:
+                    print("rank [%d/%d] solver failed for v-space gird no %d with error = %s"%(self.rankG, self.npesG, grid_idx, str(e)), flush=True)
                     sys.exit(-1)
                     
         await ts
@@ -1947,11 +1947,11 @@ def t1():
 
 if __name__=="__main__":
     comm = MPI.COMM_WORLD
-    # print("running without parla")
-    # driver_wo_parla(comm)
+    print("running without parla")
+    driver_wo_parla(comm)
     
-    print("running with parla")
-    driver_w_parla(comm)
+    # print("running with parla")
+    # driver_w_parla(comm)
     
             
             

From 38b69be020d9d902843c5b8326e11538c35d520b Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Thu, 21 Mar 2024 09:16:24 -0500
Subject: [PATCH 56/75] nprocs added to the profile file

---
 src/tps-bte_0d3v.py | 56 +++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index b00705cc6..f00aa7dcc 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -121,8 +121,10 @@ class BoltzmannSolverParams():
     
     n0            = 3.22e22 #[m^{-3}]
     
-    rand_seed     = 0
-    use_clstr_inp = True
+    rand_seed       = 0
+    use_clstr_inp   = True
+    clstr_maxiter   = 10
+    clstr_threshold = 1e-3
     
 class TPSINDEX():
     """
@@ -525,7 +527,7 @@ def t1():
                     # xp                           = cp
                     # m                            = xp.array(m_bte[gidx_to_pidx[grid_idx]])
                     # mw , mw_std                  = normalize(m, xp)
-                    # mcw, membership_m            = k_means(mw, num_clusters=self.param.n_sub_clusters, max_iter=1000, thresh=1e-8, rand_seed=self.param.rand_seed, xp=xp)
+                    # mcw, membership_m            = k_means(mw, num_clusters=self.param.n_sub_clusters, max_iter=self.param.clstr_maxiter, thresh=self.param.clstr_threshold, rand_seed=self.param.rand_seed, xp=xp)
                     
                     # to repoduce clusters
                     xp                           = np
@@ -533,7 +535,7 @@ def t1():
                     m                            = m_bte[gidx_to_pidx[grid_idx]]
                     mw , mw_std                  = normalize(m, xp)
                     mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
-                    mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=1000, thresh=1e-8, check_finite=False)[0]
+                    mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=self.param.clstr_maxiter, thresh=self.param.clstr_threshold, check_finite=False)[0]
                     mcw0[0:mcw.shape[0], :]      = mcw[:,:]
                     mcw                          = mcw0
                     dist_mat                     = xp.linalg.norm(mw[:, None, :] - mcw[None, : , :], axis=2)
@@ -635,16 +637,16 @@ def t1():
                             print("ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(xp.min(ns_by_n0[i]) , xp.max(ns_by_n0[i])), flush=True)
             
             
-                    # if (use_gpu == 1):
-                    #     with cp.cuda.Device(dev_id):
-                    #         n0   = cp.array(n0) 
-                    #         ne   = cp.array(ne)
-                    #         ni   = cp.array(ni)
-                    #         Ex   = cp.array(Ex)
-                    #         Ey   = cp.array(Ey)
-                    #         Tg   = cp.array(Tg)
-                    #         EMag = cp.sqrt(Ex**2 + Ey**2)
-                    #         ns_by_n0 = cp.array(ns_by_n0)
+                    if (use_gpu == 1):
+                        with cp.cuda.Device(dev_id):
+                            n0   = cp.array(n0) 
+                            ne   = cp.array(ne)
+                            ni   = cp.array(ni)
+                            Ex   = cp.array(Ex)
+                            Ey   = cp.array(Ey)
+                            Tg   = cp.array(Tg)
+                            EMag = cp.sqrt(Ex**2 + Ey**2)
+                            ns_by_n0 = cp.array(ns_by_n0)
                     
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "ns_by_n0", ns_by_n0)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
@@ -932,7 +934,7 @@ def t1():
                     # xp                           = cp
                     # m                            = xp.array(m_bte[gidx_to_pidx[grid_idx]])
                     # mw , mw_std                  = normalize(m, xp)
-                    # mcw, membership_m            = k_means(mw, num_clusters=self.param.n_sub_clusters, max_iter=1000, thresh=1e-8, rand_seed=self.param.rand_seed, xp=xp)
+                    # mcw, membership_m            = k_means(mw, num_clusters=self.param.n_sub_clusters, max_iter=self.param.clstr_maxiter, thresh=self.param.clstr_threshold, rand_seed=self.param.rand_seed, xp=xp)
                     
                     # to repoduce clusters
                     xp                           = np
@@ -940,7 +942,7 @@ def t1():
                     m                            = m_bte[gidx_to_pidx[grid_idx]]
                     mw , mw_std                  = normalize(m, xp)
                     mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
-                    mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=1000, thresh=1e-8, check_finite=False)[0]
+                    mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=self.param.clstr_maxiter, thresh=self.param.clstr_threshold, check_finite=False)[0]
                     mcw0[0:mcw.shape[0], :]      = mcw[:,:]
                     mcw                          = mcw0
                     dist_mat                     = xp.linalg.norm(mw[:, None, :] - mcw[None, : , :], axis=2)
@@ -1041,16 +1043,16 @@ def t1():
                             print("ns/n0 (min)               = %.12E              \t ns/n0(max) = %.12E         "%(xp.min(ns_by_n0[i]) , xp.max(ns_by_n0[i])), flush=True)
             
             
-                    # if (use_gpu == 1):
-                    #     with cp.cuda.Device(dev_id):
-                    #         n0   = cp.array(n0) 
-                    #         ne   = cp.array(ne)
-                    #         ni   = cp.array(ni)
-                    #         Ex   = cp.array(Ex)
-                    #         Ey   = cp.array(Ey)
-                    #         Tg   = cp.array(Tg)
-                    #         EMag = cp.sqrt(Ex**2 + Ey**2)
-                    #         ns_by_n0 = cp.array(ns_by_n0)
+                    if (use_gpu == 1):
+                        with cp.cuda.Device(dev_id):
+                            n0   = cp.array(n0) 
+                            ne   = cp.array(ne)
+                            ni   = cp.array(ni)
+                            Ex   = cp.array(Ex)
+                            Ey   = cp.array(Ey)
+                            Tg   = cp.array(Tg)
+                            EMag = cp.sqrt(Ex**2 + Ey**2)
+                            ns_by_n0 = cp.array(ns_by_n0)
                     
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "ns_by_n0", ns_by_n0)
                     self.bte_solver.set_boltzmann_parameter(grid_idx, "n0" , n0)
@@ -1470,7 +1472,7 @@ def profile_stats(boltzmann:Boltzmann0D2VBactchedSolver, p_tt: profile_t, p_nn,
     if rank ==0 :
         if fname!="":
             with open(fname, "a") as f:
-                f.write(datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")+"\n")
+                f.write("nprocs: %d timestamp %s \n"%(npes, datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")))
                 f.write(""+str(boltzmann.params_dump())+"\n")
                 f.write(",".join(header)+"\n")
                 f.write(",".join(data_str)+"\n")

From ba72be043e80134763c4dfd6572658fa532ac015 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Thu, 21 Mar 2024 12:30:48 -0500
Subject: [PATCH 57/75] bte fetch: when total tps points < cluster samples we
 need to do sampling with replacement

---
 src/tps-bte_0d3v.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index f00aa7dcc..d6c596a9f 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -534,7 +534,12 @@ def t1():
                     np.random.seed(self.param.rand_seed)
                     m                            = m_bte[gidx_to_pidx[grid_idx]]
                     mw , mw_std                  = normalize(m, xp)
-                    mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
+                    
+                    if mw.shape[0] >= self.param.n_sub_clusters:
+                        mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
+                    else:
+                        mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=True)]
+                    
                     mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=self.param.clstr_maxiter, thresh=self.param.clstr_threshold, check_finite=False)[0]
                     mcw0[0:mcw.shape[0], :]      = mcw[:,:]
                     mcw                          = mcw0
@@ -941,7 +946,12 @@ def t1():
                     np.random.seed(self.param.rand_seed)
                     m                            = m_bte[gidx_to_pidx[grid_idx]]
                     mw , mw_std                  = normalize(m, xp)
-                    mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
+                    
+                    if mw.shape[0] >= self.param.n_sub_clusters:
+                        mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=False)]
+                    else:
+                        mcw0                         = mw[np.random.choice(mw.shape[0], self.param.n_sub_clusters, replace=True)]
+                        
                     mcw                          = scipy.cluster.vq.kmeans(mw, mcw0, iter=self.param.clstr_maxiter, thresh=self.param.clstr_threshold, check_finite=False)[0]
                     mcw0[0:mcw.shape[0], :]      = mcw[:,:]
                     mcw                          = mcw0

From 94d3b9a30a08774918252b7dda179fa7e33e1341 Mon Sep 17 00:00:00 2001
From: "Todd A. Oliver" <oliver@oden.utexas.edu>
Date: Wed, 20 Mar 2024 21:23:36 -0500
Subject: [PATCH 58/75] Add logic to read distance fcn from restart if
 requested

---
 src/M2ulPhyS.cpp          | 15 ++++++++++++++-
 src/run_configuration.hpp |  1 +
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/M2ulPhyS.cpp b/src/M2ulPhyS.cpp
index 99c56f144..c67746904 100644
--- a/src/M2ulPhyS.cpp
+++ b/src/M2ulPhyS.cpp
@@ -405,7 +405,14 @@ void M2ulPhyS::initVariables() {
     serial_mesh->GetNodes(coordinates);
 
     // Evaluate the distance function
-    evaluateDistanceSerial(*serial_mesh, wall_patch_list, coordinates, *serial_distance);
+    if (!config.read_distance || !config.GetRestartCycle()) {
+      if (rank0_) grvy_printf(ginfo, "Computing distance function\n");
+      evaluateDistanceSerial(*serial_mesh, wall_patch_list, coordinates, *serial_distance);
+    } else {
+      // If distance function is read from restart, this will be overwritten later
+      if (rank0_) grvy_printf(ginfo, "Distance function to be read from restart\n");
+      *serial_distance = 0.0;
+    }
     delete tmp_dfes;
   }
 
@@ -617,6 +624,11 @@ void M2ulPhyS::initVariables() {
 #endif
   initSolutionAndVisualizationVectors();
 
+  if (distance_ != NULL) {
+    ioData.registerIOFamily("Distance function", "/distance", distance_, false, config.read_distance);
+    ioData.registerIOVar("/distance", "distance", 0, config.read_distance);
+  }
+
   average = new Averaging(Up, mesh, fec, fes, dfes, vfes, eqSystem, d_mixture, num_equation, dim, config, groupsMPI);
   average->read_meanANDrms_restart_files();
 
@@ -2657,6 +2669,7 @@ void M2ulPhyS::parseFlowOptions() {
   }
   tpsP->getInput("flow/refinement_levels", config.ref_levels, 0);
   tpsP->getInput("flow/computeDistance", config.compute_distance, false);
+  tpsP->getInput("flow/readDistance", config.read_distance, false);
 
   std::string type;
   tpsP->getInput("flow/sgsModel", type, std::string("none"));
diff --git a/src/run_configuration.hpp b/src/run_configuration.hpp
index 6cd5e0e5f..1a6aa00c5 100644
--- a/src/run_configuration.hpp
+++ b/src/run_configuration.hpp
@@ -286,6 +286,7 @@ class RunConfiguration {
   PostProcessInput postprocessInput;
 
   bool compute_distance;
+  bool read_distance;
 
   RunConfiguration();
   ~RunConfiguration();

From 9611699dad4c37e17bd11343cda74a682870165e Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Thu, 21 Mar 2024 16:20:56 -0500
Subject: [PATCH 59/75] nvtx tags added for ncu profiling

---
 src/tps.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/tps.py b/src/tps.py
index 46b69aa14..e9281a23b 100755
--- a/src/tps.py
+++ b/src/tps.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 import sys
 import os
-
+import cupy as cp
 from mpi4py import MPI
 
 # set path to C++ TPS library
@@ -18,6 +18,12 @@
 tps.chooseDevices()
 tps.chooseSolver()
 tps.initialize()
-tps.solve()
+tps.solveStep()
+
+#cp.profiler.start()
+cp.cuda.nvtx.RangePush("tpsStep")
+tps.solveStep()
+cp.cuda.nvtx.RangePop()
+#tps.solve()
 
 sys.exit (tps.getStatus())

From 1a3f5b45597ff521a871227019e22a031f79b9f0 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 22 Mar 2024 14:56:01 -0500
Subject: [PATCH 60/75] Bugfix in test/vpath.sh so that tps-bte_0d3v.py is now
 also dynamically linked

---
 test/vpath.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/vpath.sh b/test/vpath.sh
index 26d070bc1..af1e5eead 100755
--- a/test/vpath.sh
+++ b/test/vpath.sh
@@ -31,7 +31,7 @@ fi
 
 # necessary binaries
 binaries="bats die.sh soln_differ count_gpus.sh sniff_mpirun.sh "
-binaries+="../src/tps.py ../src/tps-time-loop.py ../cdsrc/tps-bte_0d3v.py ../test/test_tps_splitcomm.py"
+binaries+="../src/tps.py ../src/tps-time-loop.py ../src/tps-bte_0d3v.py ../test/test_tps_splitcomm.py"
 for binary in $binaries; do
     if [ ! -x $binary ];then
         if [ -x $testDir/$binary ];then

From 239a90442ae8ca59fda168ea3a18293b6b9395fd Mon Sep 17 00:00:00 2001
From: "Todd A. Oliver" <oliver@oden.utexas.edu>
Date: Sat, 23 Mar 2024 14:43:55 -0500
Subject: [PATCH 61/75] Exchange face neighbor data before we compute
 primitives

After restart, make sure all data is valid before using it.  I don't
think this could ever cause an incorrect result for a time step b/c
the info should be exchanged prior to all calculations that influence
the step.  But, it can lead to (and has recently led to) failing
asserts in the primitive variable calculations.
---
 src/M2ulPhyS.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/M2ulPhyS.cpp b/src/M2ulPhyS.cpp
index c67746904..73febee17 100644
--- a/src/M2ulPhyS.cpp
+++ b/src/M2ulPhyS.cpp
@@ -1942,6 +1942,10 @@ void M2ulPhyS::projectInitialSolution() {
 
   initGradUp();
 
+  // Exchange before computing primitives
+  U->ParFESpace()->ExchangeFaceNbrData();
+  U->ExchangeFaceNbrData();
+
   updatePrimitives();
 
   // update pressure grid function

From f3b5a0ed257f7cbf50d0db9cc5566b9c72d6b6ad Mon Sep 17 00:00:00 2001
From: "Todd A. Oliver" <oliver@oden.utexas.edu>
Date: Sat, 23 Mar 2024 14:49:00 -0500
Subject: [PATCH 62/75] Bug fix: move mesh->ExchangeFaceNbrNodes() and
 mesh->ExchangeFaceNbrData() out of conditional

These are necessary in the boundary data section of
M2ulPhyS::initIndirectionArrays().  However, previously they were
called inside of the conditional if (NumBCelems > 0), which leads to a
problem when there are some MPI ranks that have no boundary elements.
---
 src/M2ulPhyS.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/M2ulPhyS.cpp b/src/M2ulPhyS.cpp
index 73febee17..f9d48b304 100644
--- a/src/M2ulPhyS.cpp
+++ b/src/M2ulPhyS.cpp
@@ -1077,6 +1077,12 @@ void M2ulPhyS::initIndirectionArrays() {
   // See #199 for more info.
   const int NumBCelems = fes->GetNBE();
 
+  // NB: *Must* call this here, as otherwise some faces are
+  // erroneously included as boundary faces and asserts below may
+  // fail
+  mesh->ExchangeFaceNbrNodes();
+  mesh->ExchangeFaceNbrData();
+
   if (NumBCelems > 0) {
     bdry_face_data.shape.UseDevice(true);
     bdry_face_data.shape.SetSize(NumBCelems * maxIntPoints * maxDofs);
@@ -1120,12 +1126,6 @@ void M2ulPhyS::initIndirectionArrays() {
     FaceElementTransformations *tr;
     // Mesh *mesh = fes->GetMesh();
 
-    // NB: *Must* call this here, as otherwise some faces are
-    // erroneously included as boundary faces and asserts below may
-    // fail
-    mesh->ExchangeFaceNbrNodes();
-    mesh->ExchangeFaceNbrData();
-
     std::vector<int> uniqueElems;
     uniqueElems.clear();
 

From d8df437f115ad4b1c8b8543133d92c31aa7da371 Mon Sep 17 00:00:00 2001
From: "Todd A. Oliver" <oliver@oden.utexas.edu>
Date: Sat, 23 Mar 2024 14:52:20 -0500
Subject: [PATCH 63/75] Fix U->HostWrite that should be U->HostRead in
 M2ulPhyS::updatePrimitives

---
 src/M2ulPhyS.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/M2ulPhyS.cpp b/src/M2ulPhyS.cpp
index f9d48b304..78d411d2b 100644
--- a/src/M2ulPhyS.cpp
+++ b/src/M2ulPhyS.cpp
@@ -4019,8 +4019,7 @@ void M2ulPhyS::checkSolverOptions() const {
 }
 
 void M2ulPhyS::updatePrimitives() {
-  // U.V.: should this be U->HostRead() instead? U->HostWrite() does not sync memory before returning the pointer.
-  double *data = U->HostWrite();
+  const double *data = U->HostRead();
   double *dataUp = Up->HostWrite();
   int dof = vfes->GetNDofs();
 

From b3e801cc72953e0723b1a9bbb37eca12f9dbdf55 Mon Sep 17 00:00:00 2001
From: "Todd A. Oliver" <oliver@oden.utexas.edu>
Date: Sat, 23 Mar 2024 17:06:53 -0500
Subject: [PATCH 64/75] Fix uninitialized species in outlet BC ctor

Can cause problems when mixture->GetConservativesFromPrimitives(iUp,
iState) gets called below.
---
 src/outletBC.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/outletBC.cpp b/src/outletBC.cpp
index 705a33598..e7d50ea39 100644
--- a/src/outletBC.cpp
+++ b/src/outletBC.cpp
@@ -74,6 +74,12 @@ OutletBC::OutletBC(MPI_Groups *_groupsMPI, Equations _eqSystem, RiemannSolver *_
   hmeanUp[1 + nvel_] = 300.0;  // 101300;
   if (eqSystem == NS_PASSIVE) hmeanUp[num_equation_ - 1] = 0.;
 
+  if (mixture->GetNumActiveSpecies() > 0) {
+    for (int sp = 0; sp < mixture->GetNumActiveSpecies() > 0; sp++) {
+      hmeanUp[nvel_ + 2 + sp] = 0.0;
+    }
+  }
+
   area_ = 0.;
   parallelAreaComputed = false;
 

From d772a0248577f8d2d36b64b0817b99ce6f934988 Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Sun, 24 Mar 2024 18:04:38 -0500
Subject: [PATCH 65/75] thresholding E to avoid E=0 case for the steady-state
 solver, because when E=0 the steady-state solution would be the delta
 function

---
 src/tps-bte_0d3v.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index d6c596a9f..f6d1e4e13 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -126,6 +126,8 @@ class BoltzmannSolverParams():
     clstr_maxiter   = 10
     clstr_threshold = 1e-3
     
+    EMag_threshold  = 1e-10
+    
 class TPSINDEX():
     """
     simple index map to differnt fields, from the TPS arrays
@@ -493,10 +495,16 @@ def fetch(self, interface):
         
         Ex                      = efield[0]
         Ey                      = efield[1]
-    
+        
+        EMag                    = np.sqrt(Ex**2 + Ey**2)
+        e_idx                   = EMag<self.param.EMag_threshold
+        
         ExbyN                   = Ex/n0/self.param.Td_fac
         EybyN                   = Ey/n0/self.param.Td_fac
         
+        ExbyN[e_idx]            = (self.param.EMag_threshold/np.sqrt(2)) / n0[e_idx] / self.param.Td_fac
+        EybyN[e_idx]            = (self.param.EMag_threshold/np.sqrt(2)) / n0[e_idx] / self.param.Td_fac
+        
         Ex                      = ExbyN * self.param.n0 * self.param.Td_fac
         Ey                      = EybyN * self.param.n0 * self.param.Td_fac
         
@@ -904,13 +912,19 @@ async def fetch_asnyc(self, interface):
         
         Ex                      = efield[0]
         Ey                      = efield[1]
-    
+        
+        EMag                    = np.sqrt(Ex**2 + Ey**2)
+        e_idx                   = EMag<self.param.EMag_threshold
+        
         ExbyN                   = Ex/n0/self.param.Td_fac
         EybyN                   = Ey/n0/self.param.Td_fac
         
+        ExbyN[e_idx]            = (self.param.EMag_threshold/np.sqrt(2)) / n0[e_idx] / self.param.Td_fac
+        EybyN[e_idx]            = (self.param.EMag_threshold/np.sqrt(2)) / n0[e_idx] / self.param.Td_fac
+        
         Ex                      = ExbyN * self.param.n0 * self.param.Td_fac
         Ey                      = EybyN * self.param.n0 * self.param.Td_fac
-        
+    
         ion_deg                 = species_densities[TPSINDEX.ELE_IDX]/n0
         ion_deg[ion_deg<=0]     = 1e-16
         ns_by_n0[ns_by_n0<=0]   = 0
@@ -1377,7 +1391,7 @@ def asnumpy(a):
                 for ii in range(0, n_pts, n_pts_step):
                     fr     = np.abs(ff_r[ii, lm_idx, :])
                     mf_str = " ".join([r"$%s/n0$=%.2E"%(s, ns_by_n0[ii, s_idx]) for s_idx, s in enumerate(cs_species)])
-                    plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td]"%(Tg[ii], eMag[ii]/n0[ii]/1e-21) + " " +mf_str)
+                    plt.semilogy(ev, fr, label=r"$T_g$=%.2E [K], $E/n_0$=%.2E [Td] $n_e/n_0$=%.2E "%(Tg[ii], eMag[ii]/n0[ii]/1e-21, ne[ii]/n0[ii]) + " " +mf_str)
                 
                 plt.xlabel(r"energy (eV)")
                 plt.ylabel(r"$f_%d$"%(lm[0]))

From 85d2f4964103cfb5c90139319af0d8a45ffa1d17 Mon Sep 17 00:00:00 2001
From: Umberto Emanuele Villa <villa13@tioga11.llnl.gov>
Date: Mon, 25 Mar 2024 09:25:10 -0700
Subject: [PATCH 66/75] Restore tps.py and add tps-ntvx-profile.py

---
 src/tps-ntvx-profile.py | 29 +++++++++++++++++++++++++++++
 src/tps.py              |  8 +-------
 2 files changed, 30 insertions(+), 7 deletions(-)
 create mode 100755 src/tps-ntvx-profile.py

diff --git a/src/tps-ntvx-profile.py b/src/tps-ntvx-profile.py
new file mode 100755
index 000000000..e9281a23b
--- /dev/null
+++ b/src/tps-ntvx-profile.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+import sys
+import os
+import cupy as cp
+from mpi4py import MPI
+
+# set path to C++ TPS library
+path = os.path.abspath(os.path.dirname(sys.argv[0]))
+sys.path.append(path + "/.libs")
+import libtps as tps
+
+comm = MPI.COMM_WORLD
+# TPS solver
+tps = tps.Tps(comm)
+
+tps.parseCommandLineArgs(sys.argv)
+tps.parseInput()
+tps.chooseDevices()
+tps.chooseSolver()
+tps.initialize()
+tps.solveStep()
+
+#cp.profiler.start()
+cp.cuda.nvtx.RangePush("tpsStep")
+tps.solveStep()
+cp.cuda.nvtx.RangePop()
+#tps.solve()
+
+sys.exit (tps.getStatus())
diff --git a/src/tps.py b/src/tps.py
index e9281a23b..c92f658ef 100755
--- a/src/tps.py
+++ b/src/tps.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 import sys
 import os
-import cupy as cp
 from mpi4py import MPI
 
 # set path to C++ TPS library
@@ -19,11 +18,6 @@
 tps.chooseSolver()
 tps.initialize()
 tps.solveStep()
-
-#cp.profiler.start()
-cp.cuda.nvtx.RangePush("tpsStep")
-tps.solveStep()
-cp.cuda.nvtx.RangePop()
-#tps.solve()
+tps.solve()
 
 sys.exit (tps.getStatus())

From f7b63eec8d88a4a58afeaa37fdb63fd204c8c874 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Mon, 25 Mar 2024 11:38:44 -0500
Subject: [PATCH 67/75] Update gpu_constructor.cpp

spelling
---
 src/gpu_constructor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gpu_constructor.cpp b/src/gpu_constructor.cpp
index 6dbb5de1e..bf58b88a4 100644
--- a/src/gpu_constructor.cpp
+++ b/src/gpu_constructor.cpp
@@ -122,7 +122,7 @@ __global__ void freeDeviceRadiation(Radiation *radiation) {
 }
 
 //---------------------------------------------------
-// And finally devise setters
+// And finally device setters
 //---------------------------------------------------
 __global__ void deviceSetGridFunctionReactionData(const double * data, int size, GridFunctionReaction * reaction) {
   reaction->setData(data, size);

From aafa6da3bbf7a85f4e153708272d20a2d61c70c0 Mon Sep 17 00:00:00 2001
From: Umberto Emanuele Villa <villa13@tioga11.llnl.gov>
Date: Mon, 25 Mar 2024 11:02:57 -0700
Subject: [PATCH 68/75] enable mpirun

---
 test/mms.euler.test            | 24 ++++++++++++++++++++++--
 test/mms.ternary_2d.test       | 13 ++++++++++++-
 test/mms.ternary_2d_inout.test | 15 ++++++++++++++-
 test/mms.ternary_2d_wall.test  |  4 +++-
 4 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/test/mms.euler.test b/test/mms.euler.test
index 4e0ec5a08..dd5e6bd6f 100755
--- a/test/mms.euler.test
+++ b/test/mms.euler.test
@@ -5,17 +5,31 @@ TEST="mms/euler"
 RUNFILE_1="inputs/mms.euler.3d.r1.ini"
 RUNFILE_2="inputs/mms.euler.3d.r2.ini"
 
+setup() {
+    NUM_GPUS=`./count_gpus.sh`
+    MPIRUN=`./sniff_mpirun.sh`
+}
+
 @test "[$TEST] run tps with input -> $RUNFILE_1" {
-    mpirun -np 2 ../src/tps --runFile $RUNFILE_1 >& euler_mms_r1.log
+    [ $NUM_GPUS -ge 2 ] || skip "Two GPUs not available"
+    [ "x$MPIRUN" != "x" ] || skip "Cannot launch parallel job"
+
+    $MPIRUN -n 2 ../src/tps --runFile $RUNFILE_1 >& euler_mms_r1.log
 }
 
 @test "[$TEST] run tps with input -> $RUNFILE_2" {
+    [ $NUM_GPUS -ge 2 ] || skip "Two GPUs not available"
+    [ "x$MPIRUN" != "x" ] || skip "Cannot launch parallel job"
+
     rm -f $SOLN_FILE
     touch DIE
-    mpirun -np 2 ../src/tps --runFile $RUNFILE_2 >& euler_mms_r2.log
+    $MPIRUN -n 2 ../src/tps --runFile $RUNFILE_2 >& euler_mms_r2.log
 }
 
 @test "[$TEST] verify tps density convergence rate for Euler MMS with linear elems" {
+    [ $NUM_GPUS -ge 2 ] || skip "Two GPUs not available"
+    [ "x$MPIRUN" != "x" ] || skip "Cannot launch parallel job"
+
     rho_err_r1=$(cat euler_mms_r1.log | grep "time step: 300" | head -1 | awk '{print $9}')
     rho_err_r2=$(cat euler_mms_r2.log | grep "time step: 600" | head -1 | awk '{print $9}')
 
@@ -35,6 +49,9 @@ RUNFILE_2="inputs/mms.euler.3d.r2.ini"
 }
 
 @test "[$TEST] verify tps velocity convergence rate for Euler MMS with linear elems" {
+    [ $NUM_GPUS -ge 2 ] || skip "Two GPUs not available"
+    [ "x$MPIRUN" != "x" ] || skip "Cannot launch parallel job"
+
     vel_err_r1=$(cat euler_mms_r1.log | grep "time step: 300" | head -1 | awk '{print $11}')
     vel_err_r2=$(cat euler_mms_r2.log | grep "time step: 600" | head -1 | awk '{print $11}')
 
@@ -54,6 +71,9 @@ RUNFILE_2="inputs/mms.euler.3d.r2.ini"
 }
 
 @test "[$TEST] verify tps pressure convergence rate for Euler MMS with linear elems" {
+    [ $NUM_GPUS -ge 2 ] || skip "Two GPUs not available"
+    [ "x$MPIRUN" != "x" ] || skip "Cannot launch parallel job"
+
     pre_err_r1=$(cat euler_mms_r1.log | grep "time step: 300" | head -1 | awk '{print $13}')
     pre_err_r2=$(cat euler_mms_r2.log | grep "time step: 600" | head -1 | awk '{print $13}')
 
diff --git a/test/mms.ternary_2d.test b/test/mms.ternary_2d.test
index c2654e2bd..e14120010 100755
--- a/test/mms.ternary_2d.test
+++ b/test/mms.ternary_2d.test
@@ -8,6 +8,9 @@ setup() {
     SOLN_FILE=restart_argon_output.sol.h5
     MESH_FILE=beam-quad-o3-s1-r1-p.mesh
     TOL=2e-4
+
+    NUM_GPUS=`./count_gpus.sh`
+    MPIRUN=`./sniff_mpirun.sh`
 }
 
 @test "[$TEST] check for input file $RUNFILE" {
@@ -21,10 +24,18 @@ setup() {
 }
 
 @test "[$TEST] run tps with input -> $RUNFILE" {
-    mpirun -np 2 ../src/tps --runFile $RUNFILE >& plasma_ternary_mms.log
+
+    [ $NUM_GPUS -ge 2 ] || skip "Two GPUs not available"
+    [ "x$MPIRUN" != "x" ] || skip "Cannot launch parallel job"
+
+    $MPIRUN -n 2 ../src/tps --runFile $RUNFILE >& plasma_ternary_mms.log
 }
 
 @test "[$TEST] check if the relative error is similar to the reported value" {
+
+    [ $NUM_GPUS -ge 2 ] || skip "Two GPUs not available"
+    [ "x$MPIRUN" != "x" ] || skip "Cannot launch parallel job"
+
     while IFS=$'\t' read -r nx e0 e1 e2 e3 e4 e5;
     do
       test $nx -eq 100
diff --git a/test/mms.ternary_2d_inout.test b/test/mms.ternary_2d_inout.test
index 361cf354a..fce5de441 100755
--- a/test/mms.ternary_2d_inout.test
+++ b/test/mms.ternary_2d_inout.test
@@ -7,6 +7,13 @@ RUNFILE="inputs/mms.ternary_plasma.2d.inout.ini"
 setup() {
     SOLN_FILE=restart_argon_output.sol.h5
     MESH_FILE=beam-quad-o3-s1-r1-yp.mesh
+
+    SKIP="ASPEED"  
+    NUM_GPUS=`./count_gpus.sh`
+    MPIRUN=`./sniff_mpirun.sh`
+
+    echo Number of GPUS: $NUM_GPUS
+    echo mpirun: $MPIRUN
 }
 
 @test "[$TEST] check for input file $RUNFILE" {
@@ -20,10 +27,16 @@ setup() {
 }
 
 @test "[$TEST] run tps with input -> $RUNFILE" {
-    mpirun -np 2 ../src/tps --runFile $RUNFILE >& plasma_ternary_mms.inout.log
+
+    [ "x$MPIRUN" != "x" ] || skip "Cannot launch parallel job"
+
+    $MPIRUN -n 2 ../src/tps --runFile $RUNFILE >& plasma_ternary_mms.inout.log
 }
 
 @test "[$TEST] check if the relative error is similar to the reported value" {
+
+    [ "x$MPIRUN" != "x" ] || skip "Cannot launch parallel job"
+
     while IFS=$'\t' read -r nx e0 e1 e2 e3 e4 e5;
     do
       test $nx -eq 100
diff --git a/test/mms.ternary_2d_wall.test b/test/mms.ternary_2d_wall.test
index e71c962c1..8902fbe6a 100755
--- a/test/mms.ternary_2d_wall.test
+++ b/test/mms.ternary_2d_wall.test
@@ -7,6 +7,8 @@ RUNFILE="inputs/mms.ternary_plasma.2d.wall.ini"
 setup() {
     SOLN_FILE=restart_argon_output.sol.h5
     MESH_FILE=beam-quad-o3-s1-r1-xp.mesh
+
+    MPIRUN=`./sniff_mpirun.sh`
 }
 
 @test "[$TEST] check for input file $RUNFILE" {
@@ -20,7 +22,7 @@ setup() {
 }
 
 @test "[$TEST] run tps with input -> $RUNFILE" {
-    mpirun -np 2 ../src/tps --runFile $RUNFILE >& plasma_ternary_mms.wall.log
+    $MPIRUN -n 2 ../src/tps --runFile $RUNFILE >& plasma_ternary_mms.wall.log
 }
 
 @test "[$TEST] check if the relative error is similar to the reported value" {

From e45aaf3dab2934d23dcf96470a9846d079b06829 Mon Sep 17 00:00:00 2001
From: Umberto Emanuele Villa <villa13@tioga11.llnl.gov>
Date: Mon, 25 Mar 2024 11:56:19 -0700
Subject: [PATCH 69/75] Add a command line parameter for GPU-aware MPI

---
 src/tps.cpp | 24 ++++++++++++++++++++----
 src/tps.hpp |  1 +
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/tps.cpp b/src/tps.cpp
index 35bdcff95..8691dfd92 100644
--- a/src/tps.cpp
+++ b/src/tps.cpp
@@ -138,7 +138,9 @@ void Tps::parseCommandLineArgs(int argc, char *argv[]) {
   args.AddOption(&debugMode, "-d", "--debug", "", "--no-debug", "Launch in debug mode for gdb attach.");
   args.AddOption(&visualMode, "-visual", "--visualization", "", "--no-visualization",
                  "Launch post-process visualization.");
-
+  gpu_aware_mpi_=false;
+  args.AddOption(&gpu_aware_mpi_, "-ga", "--gpu-aware-mpi", "", "--no-gpu-aware-mpi",
+		 "Set GPU-aware MPI."); 
   args.Parse();
 
   if (!args.Good()) {
@@ -193,11 +195,25 @@ void Tps::chooseDevices() {
 
   int mpi_gpu_aware = 0;  // false;
 #if _CUDA_ && defined(MPIX_CUDA_AWARE_SUPPORT)
-  // check for cuda-aware mpi (if possible)
+  // check for cuda-aware mpi (if possible) and overwrite flag if needed
+  // Trust the command line flag if  MPIX_Query_cuda_support is not available
   mpi_gpu_aware = MPIX_Query_cuda_support();
+
+  if (mpi_gpu_aware == 1 && gpu_aware_mpi_ == false) {
+    if (isRank0_) {
+      grvy_printf(GRVY_WARNING, "Cuda-aware MPI detected, but flag is false")
+    }
+    gpu_aware_mpi_=true;
+  } else if (mpi_gpu_aware == 0 && gpu_aware_mpi_ == true) {
+    if (isRank0_) {
+      grvy_printf(GRVY_WARNING, "No cuda-aware MPI detected, but flag is true")
+    }
+    gpu_aware_mpi_=false;
+  }
+
 #endif
 
-  device_.SetGPUAwareMPI(mpi_gpu_aware);
+  device_.SetGPUAwareMPI(gpu_aware_mpi_);
 #endif
 
   if (isRank0_) {
@@ -209,7 +225,7 @@ void Tps::chooseDevices() {
 
 #ifdef _GPU_
   if (isRank0_) {
-    if (mpi_gpu_aware) {
+    if (gpu_aware_mpi_) {
       grvy_printf(GRVY_INFO, "\nTPS is using GPU-aware MPI.\n");
     } else {
       grvy_printf(GRVY_INFO, "\nTPS is using non-GPU-aware MPI.\n");
diff --git a/src/tps.hpp b/src/tps.hpp
index 544c21fe8..9b40ff6aa 100644
--- a/src/tps.hpp
+++ b/src/tps.hpp
@@ -81,6 +81,7 @@ class Tps {
   std::string iFile_;              // name of runtime input file (new ini format)
   std::string input_solver_type_;  // choice of desired solver
   int numGpusPerRank_;             // number of GPUs to use per MPI rank
+  bool gpu_aware_mpi_;             // whether MPI is gpu-aware (default: false)
 
   // execution device controls
   std::string deviceConfig_;

From c3c0e6c0c42e79f9e63890e5fd95f08a4f4fb46d Mon Sep 17 00:00:00 2001
From: "Todd A. Oliver" <oliver@oden.utexas.edu>
Date: Thu, 28 Mar 2024 19:41:06 -0700
Subject: [PATCH 70/75] Ensure SpongeZone::sigma is properly initialized

In the SpongeZone ctor we have

sigma = new ParGridFunction(&fes);
*sigma = 0.;
double *hSigma = sigma->HostWrite();

and the subsequent loop (prior to this commit) only set some entries
in hSigma.  On systems where a device is available, this process can
lead to uninitialized values in the sigma field, b/c *sigma = 0
initializes the device memory and `sigma->HostWrite` immediately
invalidates that (i.e., no copy is done) but then hSigma isn't fully
initialized.

In this commit, we eliminate the *sigma = 0. initialization in favor
of setting all entries within the loop.
---
 src/forcing_terms.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/forcing_terms.cpp b/src/forcing_terms.cpp
index 3bd2da71a..c7f1603c7 100644
--- a/src/forcing_terms.cpp
+++ b/src/forcing_terms.cpp
@@ -540,7 +540,6 @@ SpongeZone::SpongeZone(const int &_dim, const int &_num_equation, const int &_or
   ParFiniteElementSpace fes(mesh, fec);
 
   sigma = new ParGridFunction(&fes);
-  *sigma = 0.;
   double *hSigma = sigma->HostWrite();
 
   ParGridFunction coords(&dfes);
@@ -560,6 +559,7 @@ SpongeZone::SpongeZone(const int &_dim, const int &_num_equation, const int &_or
     Vector Xn(dim);
     for (int d = 0; d < dim; d++) Xn[d] = coords[n + d * ndofs];
 
+    hSigma[n] = 0.0;
     if (szData.szType == SpongeZoneType::PLANAR) {
       // distance to the mix-out plane
       double distInit = 0.;

From 64e0d1e791cf07fdde77203df2b365636d28075a Mon Sep 17 00:00:00 2001
From: milindasf <milinda@cs.utah.edu>
Date: Tue, 9 Apr 2024 13:09:46 -0500
Subject: [PATCH 71/75] Ex, Ey component output for csv files.

---
 src/tps-bte_0d3v.py | 45 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/src/tps-bte_0d3v.py b/src/tps-bte_0d3v.py
index f6d1e4e13..bc1976955 100755
--- a/src/tps-bte_0d3v.py
+++ b/src/tps-bte_0d3v.py
@@ -122,7 +122,7 @@ class BoltzmannSolverParams():
     n0            = 3.22e22 #[m^{-3}]
     
     rand_seed       = 0
-    use_clstr_inp   = True
+    use_clstr_inp   = False
     clstr_maxiter   = 10
     clstr_threshold = 1e-3
     
@@ -344,7 +344,7 @@ def grid_setup(self, interface):
         #         active_grid_idx.append(grid_idx)
         
         # self.active_grid_idx  = active_grid_idx #[i for i in range(self.param.n_grids)]
-        self.active_grid_idx  = [i for i in range(self.param.n_grids)]
+        self.active_grid_idx  = [2,3]#[i for i in range(self.param.n_grids)]
         self.sub_clusters_run = False
         return
 
@@ -496,6 +496,29 @@ def fetch(self, interface):
         Ex                      = efield[0]
         Ey                      = efield[1]
         
+        ne                      = species_densities[TPSINDEX.ELE_IDX]
+        coll_list               = self.bte_solver.get_collision_list()
+        coll_names              = self.bte_solver.get_collision_names()
+        cs_data                 = self.bte_solver.get_cross_section_data()
+        
+        cs_species              = list()
+        for col_idx, (k,v) in enumerate(cs_data.items()):
+            cs_species.append(v["species"])
+        
+        cs_species = list(sorted(set(cs_species), key=cs_species.index))
+        data_csv   = np.concatenate([(Ex).reshape((-1, 1)),
+                                     (Ey).reshape((-1, 1)),
+                                     (Tg).reshape((-1, 1)),
+                                     (ne/n0).reshape((-1, 1))] + [ns_by_n0[i].reshape((-1, 1)) for i in range(ns_by_n0.shape[0])] + [n0.reshape(-1, 1)], axis=1)
+        
+        for grid_idx in self.active_grid_idx:
+            with open("%s/%s.csv"%(self.param.output_dir, "tps_fetch_grid_%02d_rank_%02d_npes_%02d"%(grid_idx, self.rankG, self.npesG)), 'w', encoding='UTF8') as f:
+                writer = csv.writer(f,delimiter=',')
+                # write the header
+                header = ["eRe", "eIm", "Tg", "ne/n0"] + ["(%s)/n0"%(s) for s in cs_species] + ["n0"]
+                writer.writerow(header)
+                writer.writerows(data_csv[gidx_to_pidx[grid_idx]])
+                
         EMag                    = np.sqrt(Ex**2 + Ey**2)
         e_idx                   = EMag<self.param.EMag_threshold
         
@@ -618,7 +641,6 @@ def t1():
                     t1()
             
         else:
-            ts = TaskSpace("T")
             for grid_idx in self.active_grid_idx:
                 dev_id            = self.gidx_to_device_map(grid_idx, n_grids)
                 
@@ -1348,7 +1370,7 @@ def asnumpy(a):
         eIm      = asnumpy(self.bte_solver.get_boltzmann_parameter(grid_idx, "eIm"))
         eMag     = np.sqrt(eRe**2 + eIm**2)
         
-        data_csv = np.zeros((ne.shape[0], 7 + ns_by_n0.shape[1] + len((coll_list))))    
+        data_csv = np.zeros((ne.shape[0], 7 + ns_by_n0.shape[1] + len((coll_list)) + 2))    
 
         if export_csv:
             data_csv[: , 0]    = n0
@@ -1357,19 +1379,20 @@ def asnumpy(a):
             data_csv[: ,2:idx] = ns_by_n0[:,:]
             
             data_csv[: , idx]      = Tg
-            data_csv[: , idx+1]    = eMag
-            data_csv[: , idx+2]    = asnumpy(qoi["energy"])
-            data_csv[: , idx+3]    = asnumpy(qoi["mobility"])
-            data_csv[: , idx+4]    = asnumpy(qoi["diffusion"])
+            data_csv[: , idx+1]    = eRe
+            data_csv[: , idx+2]    = eIm
+            data_csv[: , idx+3]    = eMag
+            data_csv[: , idx+4]    = asnumpy(qoi["energy"])
+            data_csv[: , idx+5]    = asnumpy(qoi["mobility"])
+            data_csv[: , idx+6]    = asnumpy(qoi["diffusion"])
                 
             for col_idx, g in enumerate(coll_list):
-                data_csv[: , idx+5 + col_idx]    = asnumpy(qoi["rates"][col_idx])
+                data_csv[: , idx + 7 + col_idx]    = asnumpy(qoi["rates"][col_idx])
                 
-            
             with open("%s_qoi.csv"%(fname), 'w', encoding='UTF8') as f:
                 writer = csv.writer(f,delimiter=',')
                 # write the header
-                header = ["n0", "ne/n0"] + ["(%s)/n0"%(s) for s in cs_species] + ["Tg", "E",  "energy", "mobility", "diffusion"]
+                header = ["n0", "ne/n0"] + ["(%s)/n0"%(s) for s in cs_species] + ["Tg", "eRe", "eIm", "E",  "energy", "mobility", "diffusion"]
                 for col_idx, g in enumerate(coll_list):
                     header.append(str(coll_names[col_idx]))
                 

From 55b8bfe899c742c165650ca1757c7242c4a95c16 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 12 Apr 2024 10:30:35 -0500
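Subject: [PATCH 72/75] Update outletBC.cpp

Remove the stray `> 0` from the active-species loop condition. The old
condition parsed as `(sp < GetNumActiveSpecies()) > 0`, which iterates
over the same range but reads like a mistake.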

---
 src/outletBC.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/outletBC.cpp b/src/outletBC.cpp
index 47299cb65..26293c996 100644
--- a/src/outletBC.cpp
+++ b/src/outletBC.cpp
@@ -75,7 +75,7 @@ OutletBC::OutletBC(MPI_Groups *_groupsMPI, Equations _eqSystem, RiemannSolver *_
   if (eqSystem == NS_PASSIVE) hmeanUp[num_equation_ - 1] = 0.;
 
   if (mixture->GetNumActiveSpecies() > 0) {
-    for (int sp = 0; sp < mixture->GetNumActiveSpecies() > 0; sp++) {
+    for (int sp = 0; sp < mixture->GetNumActiveSpecies(); sp++) {
       hmeanUp[nvel_ + 2 + sp] = 0.0;
     }
   }

From dc6e8dcf245447df082dd1fdf55988f164413b71 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 12 Apr 2024 10:36:12 -0500
Subject: [PATCH 73/75] Update tps.cpp

Formatting
---
 src/tps.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/tps.cpp b/src/tps.cpp
index 5f5ab7c9d..1db059897 100644
--- a/src/tps.cpp
+++ b/src/tps.cpp
@@ -139,9 +139,9 @@ void Tps::parseCommandLineArgs(int argc, char *argv[]) {
   args.AddOption(&debugMode, "-d", "--debug", "", "--no-debug", "Launch in debug mode for gdb attach.");
   args.AddOption(&visualMode, "-visual", "--visualization", "", "--no-visualization",
                  "Launch post-process visualization.");
-  gpu_aware_mpi_=false;
+  gpu_aware_mpi_ = false;
   args.AddOption(&gpu_aware_mpi_, "-ga", "--gpu-aware-mpi", "", "--no-gpu-aware-mpi",
-		 "Set GPU-aware MPI."); 
+                 "Set GPU-aware MPI.");
   args.Parse();
 
   if (!args.Good()) {
@@ -204,12 +204,12 @@ void Tps::chooseDevices() {
     if (isRank0_) {
       grvy_printf(GRVY_WARNING, "Cuda-aware MPI detected, but flag is false")
     }
-    gpu_aware_mpi_=true;
+    gpu_aware_mpi_ = true;
   } else if (mpi_gpu_aware == 0 && gpu_aware_mpi_ == true) {
     if (isRank0_) {
       grvy_printf(GRVY_WARNING, "No cuda-aware MPI detected, but flag is true")
     }
-    gpu_aware_mpi_=false;
+    gpu_aware_mpi_ = false;
   }
 
 #endif

From 1030494dfd468b2e73cfe69e85dbe4c573a41335 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 12 Apr 2024 10:51:05 -0500
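Subject: [PATCH 74/75] Update tps.cpp

Declare mpi_gpu_aware where it is assigned from
MPIX_Query_cuda_support(), inside the
`#if _CUDA_ && defined(MPIX_CUDA_AWARE_SUPPORT)` guard, instead of
declaring it unconditionally before the guard.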

---
 src/tps.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/tps.cpp b/src/tps.cpp
index 1db059897..81636fc4a 100644
--- a/src/tps.cpp
+++ b/src/tps.cpp
@@ -194,11 +194,10 @@ void Tps::chooseDevices() {
 #ifdef _GPU_
   device_.Configure(deviceConfig_, rank_ % numGpusPerRank_);
 
-  int mpi_gpu_aware = 0;  // false;
 #if _CUDA_ && defined(MPIX_CUDA_AWARE_SUPPORT)
   // check for cuda-aware mpi (if possible) and overwrite flag if needed
   // Trust the command line flag if  MPIX_Query_cuda_support is not available
-  mpi_gpu_aware = MPIX_Query_cuda_support();
+  int mpi_gpu_aware = MPIX_Query_cuda_support();
 
   if (mpi_gpu_aware == 1 && gpu_aware_mpi_ == false) {
     if (isRank0_) {

From 75938601cfeefcc69145ec77a4de06379eb6eb63 Mon Sep 17 00:00:00 2001
From: Umberto Villa <uvilla@oden.utexas.edu>
Date: Fri, 12 Apr 2024 11:59:20 -0500
Subject: [PATCH 75/75] Update tps.cpp

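Addresses compilation error on Lassen: use the GRVY_WARN log level
instead of GRVY_WARNING and add the missing semicolons after the
grvy_printf calls.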
---
 src/tps.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tps.cpp b/src/tps.cpp
index 81636fc4a..b82b7caf3 100644
--- a/src/tps.cpp
+++ b/src/tps.cpp
@@ -201,12 +201,12 @@ void Tps::chooseDevices() {
 
   if (mpi_gpu_aware == 1 && gpu_aware_mpi_ == false) {
     if (isRank0_) {
-      grvy_printf(GRVY_WARNING, "Cuda-aware MPI detected, but flag is false")
+      grvy_printf(GRVY_WARN, "Cuda-aware MPI detected, but flag is false");
     }
     gpu_aware_mpi_ = true;
   } else if (mpi_gpu_aware == 0 && gpu_aware_mpi_ == true) {
     if (isRank0_) {
-      grvy_printf(GRVY_WARNING, "No cuda-aware MPI detected, but flag is true")
+      grvy_printf(GRVY_WARN, "No cuda-aware MPI detected, but flag is true");
     }
     gpu_aware_mpi_ = false;
   }