From b3ec68b57936407e1de7fd3114992c68db0d4817 Mon Sep 17 00:00:00 2001
From: "Rob Knop (Nersc)" <raknop@lbl.gov>
Date: Tue, 22 Oct 2024 09:49:04 -0700
Subject: [PATCH 1/3] Workaround in PYSEx.py to deal with race condition
 writing and reading fits; debug logging.

---
 sfft/utils/SExSkySubtract.py     |  26 ++-
 sfft/utils/pyAstroMatic/PYSEx.py | 340 ++++++++++++++++++-------------
 2 files changed, 223 insertions(+), 143 deletions(-)

diff --git a/sfft/utils/SExSkySubtract.py b/sfft/utils/SExSkySubtract.py
index d1d4939..d3530b2 100644
--- a/sfft/utils/SExSkySubtract.py
+++ b/sfft/utils/SExSkySubtract.py
@@ -6,6 +6,17 @@
 from sfft.utils.pyAstroMatic.PYSEx import PY_SEx
 # version: Apr 22, 2024
 
+import sys
+import logging
+import multiprocessing
+_logger = logging.getLogger(f'sfft')
+if not _logger.hasHandlers():
+    log_out = logging.StreamHandler(sys.stderr)
+    formatter = logging.Formatter(f'[%(asctime)s - sfft - %(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+    log_out.setFormatter(formatter)
+    _logger.addHandler(log_out)
+    _logger.setLevel(logging.DEBUG) # ERROR, WARNING, INFO, or DEBUG (in that order by increasing detail)
+
 # improved by Lauren Aldoroty (Duke Univ.)
 __author__ = "Lei Hu <leihu@andrew.cmu.edu>"
 __version__ = "v1.4"
@@ -72,14 +83,21 @@ def SSS(FITS_obj, FITS_skysub=None, FITS_sky=None, FITS_skyrms=None, FITS_detmas
         # * Generate SExtractor OBJECT-MASK
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
+            _logger.debug( f"Process {multiprocessing.current_process().pid} running PY_SEx.PS on {FITS_obj}..." )
             # NOTE: GAIN, SATURATE, ANALYSIS_THRESH, DEBLEND_MINCONT, BACKPHOTO_TYPE do not affect the detection mask.
-            DETECT_MASK = PY_SEx.PS(FITS_obj=FITS_obj, SExParam=['X_IMAGE', 'Y_IMAGE'], GAIN_KEY='PHGAIN', SATUR_KEY=SATUR_KEY, \
-                BACK_TYPE='AUTO', BACK_SIZE=BACK_SIZE, BACK_FILTERSIZE=BACK_FILTERSIZE, DETECT_THRESH=DETECT_THRESH, \
-                ANALYSIS_THRESH=1.5, DETECT_MINAREA=DETECT_MINAREA, DETECT_MAXAREA=DETECT_MAXAREA, DEBLEND_MINCONT=0.005, \
-                BACKPHOTO_TYPE='GLOBAL', CHECKIMAGE_TYPE='OBJECTS', MDIR=MDIR, VERBOSE_LEVEL=VERBOSE_LEVEL)[1][0].astype(bool)
+            DETECT_MASK = PY_SEx.PS(FITS_obj=FITS_obj, SExParam=['X_IMAGE', 'Y_IMAGE'], GAIN_KEY='PHGAIN', SATUR_KEY=SATUR_KEY,
+                                    BACK_TYPE='AUTO', BACK_SIZE=BACK_SIZE, BACK_FILTERSIZE=BACK_FILTERSIZE,
+                                    DETECT_THRESH=DETECT_THRESH, ANALYSIS_THRESH=1.5, DETECT_MINAREA=DETECT_MINAREA,
+                                    DETECT_MAXAREA=DETECT_MAXAREA, DEBLEND_MINCONT=0.005, BACKPHOTO_TYPE='GLOBAL',
+                                    CHECKIMAGE_TYPE='OBJECTS', MDIR=MDIR, VERBOSE_LEVEL=VERBOSE_LEVEL,
+                                    logger=_logger
+                                    )[1][0].astype(bool)
+            _logger.debug( f"...process {multiprocessing.current_process().pid} done running PY_SEx.PS on {FITS_obj}..." )
 
         # * Extract SExtractor SKY-MAP from the Unmasked Image
+        _logger.debug( f"Running fits.getdata({FITS_obj}, ext=0)..." )
         PixA_obj = fits.getdata(FITS_obj, ext=0).T
+        _logger.debug( f"...done running fits.getdata({FITS_obj}, ext=0)." )
         _PixA = PixA_obj.astype(np.float64, copy=True)    # default copy=True, just to emphasize
         _PixA[DETECT_MASK] = np.nan
         if not _PixA.flags['C_CONTIGUOUS']: _PixA = np.ascontiguousarray(_PixA)
diff --git a/sfft/utils/pyAstroMatic/PYSEx.py b/sfft/utils/pyAstroMatic/PYSEx.py
index c58af2b..875145f 100644
--- a/sfft/utils/pyAstroMatic/PYSEx.py
+++ b/sfft/utils/pyAstroMatic/PYSEx.py
@@ -1,7 +1,12 @@
 import re
+import sys
 import os
+import io
+import time
 import warnings
+import multiprocessing
 import subprocess
+import logging
 import numpy as np
 import os.path as pa
 from astropy.io import fits
@@ -16,6 +21,16 @@
 __author__ = "Lei Hu <leihu@andrew.cmu.edu>"
 __version__ = "v1.4"
 
+_logger = logging.getLogger('PY_SEx')
+if not _logger.hasHandlers():
+    log_out = logging.StreamHandler(sys.stderr)
+    formatter = logging.Formatter(f'[%(asctime)s - PY_SEx - %(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+    log_out.setFormatter(formatter)
+    _logger.addHandler(log_out)
+    _logger.setLevel(logging.DEBUG) # ERROR, WARNING, INFO, or DEBUG (in that order by increasing detail)
+
+
+
 class PY_SEx:
     @staticmethod
     def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_LDAC', \
@@ -25,17 +40,18 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         BACKPHOTO_TYPE='LOCAL', PHOT_APERTURES=5.0, NegativeCorr=True, CHECKIMAGE_TYPE='NONE', \
         VIGNET=None, STAMP_IMGSIZE=None, AddRD=False, ONLY_FLAGS=None, XBoundary=0.0, YBoundary=0.0, \
         Coor4Match='XY_', XY_Quest=None, Match_xytol=2.0, RD_Quest=None, Match_rdtol=1.0, \
-        Preserve_NoMatch=False, MDIR=None, VERBOSE_TYPE='QUIET', VERBOSE_LEVEL=2):
+        Preserve_NoMatch=False, MDIR=None, VERBOSE_TYPE='QUIET', VERBOSE_LEVEL=2,
+        logger=_logger):
 
         """
         # Inputs & Outputs:
 
         -FITS_obj []                    # FITS file path of the input image for photometry
- 
+
         -PSF_obj [None]                 # PSFEx .psf file path for PSF photometry
 
-        -FITS_ref [None]                # FITS file path of the input image for detection 
-                                        # (a) -FITS_ref = None means single image mode:  
+        -FITS_ref [None]                # FITS file path of the input image for detection
+                                        # (a) -FITS_ref = None means single image mode:
                                         #     SEx detection & SEx photometry on same image -FITS_obj.
                                         # (b) -FITS_ref != None mean dual image mode:
                                         #     SEx detection on -FITS_ref & SEx photometry on -FITS_obj
@@ -57,7 +73,7 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
 
         -PIXEL_SCALE [1.0]              # SExtractor Parameter PIXEL_SCALE
                                         # size of pixel in arcsec (0=use FITS WCS info)
-                                        # P.S. it only works for surface brightness parameters, 
+                                        # P.S. it only works for surface brightness parameters,
                                         #      FWHM_WORLD and star/galaxy separation.
 
         -SEEING_FWHM [1.2]              # SExtractor Parameter SEEING_FWHM
@@ -65,7 +81,7 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
                                         # P.S. it only works for star/galaxy separation.
 
         -BACK_TYPE ['AUTO']             # SExtractor Parameter BACK_TYPE = [AUTO or MANUAL].
-         
+
         -BACK_VALUE [0.0]               # SExtractor Parameter BACK_VALUE (only work for BACK_TYPE='MANUAL')
 
         -BACK_SIZE [64]                 # SExtractor Parameter BACK_SIZE
@@ -81,12 +97,12 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
 
         -DETECT_MINAREA [5]             # SExtractor Parameter DETECT_MINAREA
                                         # min. # of pixels above threshold
-        
+
         -DETECT_MAXAREA [0]             # SExtractor Parameter DETECT_MAXAREA
                                         # max. # of pixels above threshold (0=unlimited)
 
         -DEBLEND_MINCONT [0.005]        # SExtractor Parameter DEBLEND_MINCONT (typically, 0.001 - 0.005)
-                                        # Minimum contrast parameter for deblending 
+                                        # Minimum contrast parameter for deblending
 
         -CLEAN ['Y]                     # SExtractor Parameter CLEAN
                                         # Clean spurious detections? (Y or N)?
@@ -100,7 +116,7 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
                                         # P.S. Here it can be a Python list of apertures
 
         -CHECKIMAGE_TYPE ['NONE']       # SExtractor Parameter CHECKIMAGE_TYPE
-                                        # can be NONE, BACKGROUND, BACKGROUND_RMS, MINIBACKGROUND, MINIBACK_RMS, 
+                                        # can be NONE, BACKGROUND, BACKGROUND_RMS, MINIBACKGROUND, MINIBACK_RMS,
                                         # -BACKGROUND, FILTERED, OBJECTS, -OBJECTS, SEGMENTATION, or APERTURES
 
         -VERBOSE_TYPE ['QUIET']         # SExtractor Parameter VERBOSE_TYPE
@@ -108,12 +124,12 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
 
         # Other parameters
 
-        -NegativeCorr [True]            # In SExtractor, MAG_* = 99. and MAGERR_* = 99. for FLUX_* < 0.0 
-                                        # If -NegativeCorr = True, PYSEx will correct MAG_* and MAGERR_* 
+        -NegativeCorr [True]            # In SExtractor, MAG_* = 99. and MAGERR_* = 99. for FLUX_* < 0.0
+                                        # If -NegativeCorr = True, PYSEx will correct MAG_* and MAGERR_*
                                         # to be a valid values using abs(FLUX_*) and FLUXERR_*.
 
         -VIGNET [None]                  # VIGNET for generating PSFEx input catalog
-                                        # e.g., set -VIGNET = (51, 51), PYSEx will add 'VIGNET(51, 51)' into 
+                                        # e.g., set -VIGNET = (51, 51), PYSEx will add 'VIGNET(51, 51)' into
                                         #       the SExtractor output parameter list.
 
         -STAMP_IMGSIZE [None]            # PYSEx allows for making a stamp for each detected source on -FITS_obj,
@@ -122,14 +138,14 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
 
         -AddRD [False]                  # Add columns for Ra. and Decl. in the output catalog?
                                         # P.S. The columns are X_WORLD, Y_WORLD or XWIN_WORLD, YWIN_WORLD.
-                                        #      Although SExtractor itself can generate these columns, here we use 
+                                        #      Although SExtractor itself can generate these columns, here we use
                                         #      astropy.wcs to convert image coordinates to world coordinates instead.
                                         #      (Because I feel like that astropy has better WCS compatibility than SExtractor)
 
         -ONLY_FLAGS [None]              # Do you put any constrain on the SExtractor output parameter FLAGS
                                         #
                                         # FLAGS description
-                                        # 1   aperture photometry is likely to be biased by neighboring sources 
+                                        # 1   aperture photometry is likely to be biased by neighboring sources
                                         #     or by more than 10% of bad pixels in any aperture
                                         # 2	  the object has been deblended
                                         # 4	  at least one object pixel is saturated
@@ -164,7 +180,7 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         -Preserve_NoMatch [False]       # Preserve the detected sources in SExtractor photometry catalog without cross match counterpart
 
         -MDIR [None]                    # Parent Directory for output files
-                                        # PYSEx will generate a child directory with a random name under the paraent directory 
+                                        # PYSEx will generate a child directory with a random name under the parent directory
                                         # all output files are stored in the child directory
 
         -VERBOSE_LEVEL [2]              # The level of verbosity, can be [0, 1, 2]
@@ -177,34 +193,34 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
 
             PixA_SExCheckLst            # List of Pixel arrays for SExtractor check images
                                         # P.S. PixA = fits.getdata(FITS, ext=0).T
-            
+
             FITS_SExCat                 # File path of SExtractor photometry catalog
                                         # P.S. only for -MDIR is not None
 
             FITS_SExCheckLst            # List of file path of SExtractor check images
                                         # P.S. only for -MDIR is not None
-        
-        # ---------------- MORE DESCRIPTION ON HOW SEXTRACTOR WORK ---------------- 
+
+        # ---------------- MORE DESCRIPTION ON HOW SEXTRACTOR WORK ----------------
         #
         # * SExtractor Inputs
         #    ** Array-Inputs:
-        #       SEx works on one image for signal detection and another image for photometry 
+        #       SEx works on one image for signal detection and another image for photometry
         #       @ Individual Mode (Common): Image4detect and Image4phot are the same image (-FITS_obj).
         #       @ Dual Mode: Image4detect (-FITS_ref) and Image4phot (-FITS_obj) are different images .
         #    ** PSF-Input:
         #       SEx can accept given PSF model for PSF-Photometry (-PSF_obj).
-        #    ** Parameter-Inputs: 
+        #    ** Parameter-Inputs:
         #       a. Basic keywords in FITS header of Image4detect:
         #          (-GAIN_KEY, -SATUR_KEY).
         #       b. How to generate Global Background Map:
         #          (-BACK_TYPE, -BACK_VALUE, -BACK_SIZE, -BACK_FILTERSIZE).
         #       c. Give the criteria for SExtractor Source Detection
         #          (-DETECT_THRESH, -DETECT_MINAREA, -DETECT_MAXAREA, -DEBLEND_NTHRESH, -DEBLEND_MINCONT, -CLEAN).
-        #       d. Which photometry method(s) used by SExtractor: 
+        #       d. Which photometry method(s) used by SExtractor:
         #          (parameters in -SExParam, e.g., FLUX_AUTO, FLUX_APER, FLUX_PSF).
-        #       e. Specify output Check-Images: 
+        #       e. Specify output Check-Images:
         #          (-CHECKIMAGE_TYPE).
-        #       f. Specify output columns in output SExtractor photometry table: 
+        #       f. Specify output columns in output SExtractor photometry table:
         #          (-SExParam).
         #
         #    Remarks on Weight-Map:
@@ -212,50 +228,50 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         #       vairable effective GAIN and background noise level across the field due to different Num_Exposure.
         #       Weight-map would help SExtractor to calculate errors, such as, FLUXERR and MAGERR, more accurately.
         #
-        #       WARNING: For current version, PYSEx does not include this feature, and I would recommend users  
-        #                to set an average effective GAIN for mosaic image, and keep in mind it may, to some extent, 
+        #       WARNING: For current version, PYSEx does not include this feature, and I would recommend users
+        #                to set an average effective GAIN for mosaic image, and keep in mind it may, to some extent,
         #                cause inaccurate error estimations.
         #
         # * SExtractor Workflow (Background)
         #    ** Extract Global_Background_Map (GBMap) and its RMS (GBRMap) from Image4detect & Image4phot
         #       Control Parameters: BACK_TYPE, BACK_VALUE, BACK_SIZE and BACK_FILTERSIZE.
-        #       @ Produce GBMap 
+        #       @ Produce GBMap
         #         a. Manual-FLAT (e.g. BACK_TYPE='MANUAL', BACK_VALUE=100.0)
         #            SEx directly define GBMap as a FLAT image with given constant BACK_VALUE.
         #         b. Auto (e.g. BACK_TYPE='AUTO', BACK_SIZE=64, BACK_FILTERSIZE=3)
         #            SEx defines a mesh of a grid that covers the whole frame by [BACK_SIZE].
         #            i. Convergence-based sigma-clipping method on the flux histogram of each tile.
-        #               More specificly, the flux histogram is clipped iteratively until convergence 
+        #               More specifically, the flux histogram is clipped iteratively until convergence
         #               at +/- 3sigma around its median.
         #            ii. SEx compute local background estimator from the clipped histogram of each tile.
-        #                If sigma is changed by less than 20% during clipping process (the tile is not crowded), 
+        #                If sigma is changed by less than 20% during clipping process (the tile is not crowded),
         #                use the mean of the clipped histogram as estimator
         #                otherwise (the tile is crowded), mode = 2.5*median - 1.5*mean is employed instead.
-        #            iii. Once the estimate grid is calculated, a median filter [BACK_FILTERSIZE] can be applied 
+        #            iii. Once the estimate grid is calculated, a median filter [BACK_FILTERSIZE] can be applied
         #                 to suppress possible local overestimations.
-        #            iv. The resulting background map is them simply a bicubic-spline interpolation 
+        #            iv. The resulting background map is then simply a bicubic-spline interpolation
         #                between the meshes of the grid.
         #
         #        @ Generate GBRMap
         #          only Auto (e.g, BACK_SIZE=64, BACK_FILTERSIZE=3)
-        #          SEx produces the noise map by the same approach of Auto style of GBMap, where the only 
-        #          difference is that SEx is [probably] use standard deviation as estimator of the 
+        #          SEx produces the noise map by the same approach of Auto style of GBMap, where the only
+        #          difference is that SEx [probably] uses the standard deviation as the estimator of the
         #          clipped flux historgram, other than mean or mode.
-        #        
+        #
         #        NOTE Abbr. GBMap / GBRMap from Image4detect: GBMap_4d / GBRMap_4d
         #             Abbr. GBMap / GBRMap from Image4phot: GBMap_4p / GBRMap_4p
         #
-        #        NOTE WARNING: Dual Mode have to use consistent control parameters for Image4detect & Image4phot, 
-        #                      as it is not allowed to set some secondary configuration in SEx software framework, 
+        #        NOTE WARNING: Dual Mode has to use consistent control parameters for Image4detect & Image4phot,
+        #                      as it is not allowed to set some secondary configuration in SEx software framework,
         #                      despite that it  is not necessarily reasonable in some cases.
         #
         # * SExtractor Workflow (Detection)
         #    ** a. SEx-Detect on Image4detect: SkySubtraction & Filtering & Thresholding & Deblending & AreaConstrain & Clean
         #          @ SkySubtraction & Filtering Process (e.g. FILTER='Y', FILTER_NAME='default.conv')
-        #            SEx Remove GBMap_4d from Image4detect and then perform a convolution to maximizes detectability. 
+        #            SEx Remove GBMap_4d from Image4detect and then perform a convolution to maximize detectability.
         #            NOTE The power-spectrum of the noise and that of the superimposed signal can be significantly different.
         #            NOTE Although Filtering is a benefit for detection, it distorts profiles and correlates the noise.
-        #            NOTE Filtering is applied 'on the fly' to the image, and directly affects only the following 
+        #            NOTE Filtering is applied 'on the fly' to the image, and directly affects only the following
         #                 Thresholding process and Isophotal parameters.
         #
         #          @ Thresholding Process (e.g. DETECT_THRESH=1.5, THRESH_TYPE='RELATIVE')
@@ -263,20 +279,20 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         #            We would be bettter to imagine SEx actually make a hovering mask.
         #
         #          @ Deblending Process (e.g. DEBLEND_MINCONT=0.005, DEBLEND_NTHRESH=32)
-        #            SEx triggers a deblending process to dentify signal islands from the hovering 
+        #            SEx triggers a deblending process to identify signal islands from the hovering
         #            mask [probably] on Filtered_Image,
         #            which converts the hovering mask to be a hovering label map.
         #
         #          @ Put AreaConstrain (e.g. DETECT_MINAREA=5, DETECT_MAXAREA=0)
-        #            MinMax AreaContrain is applied and then iolated cases then lose their hovering labels. 
+        #            MinMax AreaConstrain is applied, and the violating cases then lose their hovering labels.
         #
         #          @ Clean Process (e.g. CLEAN='YES')
         #            SEx will clean the list of objects of artifacts caused by bright objects.
         #            As a correction process, all cleaned objects are subsequently removed from the hovering label map.
-        #            NOTE Now the hovering label map is the SEGMENTATION check image. 
+        #            NOTE Now the hovering label map is the SEGMENTATION check image.
         #                 One may refer to such label island as ISOIsland
         #            NOTE These labels are consistent with the indices in the ouput photometry table.
-        #            NOTE SEx will report something like this: Objects: detected 514 / sextracted 397 
+        #            NOTE SEx will report something like this: Objects: detected 514 / sextracted 397
         #                 SET CLEAN='N', you could find detected == sextracted.
         #
         #    ** b. Generate Positional & BasicShape Paramters from isophotal profile
@@ -289,7 +305,7 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         #               (if a parameter can be fully expressed by them, we will use $ to indicate).
         #          iv. Describe ISOIsand as an elliptical shape, centred at Barycenter.
         #               $A_IMAGE, $B_IMAGE are ellipse semi-major and semi-minor axis lengths, respectively.
-        #               The ellipse is uniquely determined by $CXX_IMAGE, $CYY_IMAGE, $CXY_IMAGE with KRON_RADIUS, 
+        #               The ellipse is uniquely determined by $CXX_IMAGE, $CYY_IMAGE, $CXY_IMAGE with KRON_RADIUS,
         #               where KRON_RADIUS is independently calculated in a routine inspired by Kron's 'first moment' algorithm.
         #               NOTE By-products: $ELONGATION = A / B and $ELLIPTICITY = 1 - B / A
         #
@@ -300,19 +316,19 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         #          METHOD: The computations involved are roughly the same except that the domain is a circular Gaussian window,
         #                  (I don't know what is the window radius) as opposed to the object's isophotal footprint (ISOIsland).
         #
-        #          MOTIVATION: Parameters measured within an object's isophotal limit are sensitive to two main factors: 
-        #                      + Changes in the detection threshold, which create a variable bias 
-        #                      + Irregularities in the object's isophotal boundaries, which act as 
+        #          MOTIVATION: Parameters measured within an object's isophotal limit are sensitive to two main factors:
+        #                      + Changes in the detection threshold, which create a variable bias
+        #                      + Irregularities in the object's isophotal boundaries, which act as
         #                        additional 'noise' in the measurements.
         #
-        #          @Positional: This is an iterative process. The computation starts by initializing 
+        #          @Positional: This is an iterative process. The computation starts by initializing
         #                       the windowed centroid coordinates to the Barycenter.
-        #                       The process will adjust window and finally its centroid converges 
+        #                       The process will adjust window and finally its centroid converges
         #                       at some point: XWIN_IMAGE, YWIN_IMAGE.
         #                       (If the process is failed then XWIN_IMAGE, YWIN_IMAGE = X_IMAGE, Y_IMAGE)
-        #                       It has been verified that for isolated, Gaussian-like PSFs, its accuracy is close to 
-        #                       the theoretical limit set by image noise. 
-        #                       NOTE: We preferably use it for point sources, like in transient detection. 
+        #                       It has been verified that for isolated, Gaussian-like PSFs, its accuracy is close to
+        #                       the theoretical limit set by image noise.
+        #                       NOTE: We preferably use it for point sources, like in transient detection.
         #                             However it may not optimal for extended sources like glaxies.
         #                       NOTE: X_IMAGE & Y_IMAGE seems to be a good compromise choice.
         #
@@ -320,9 +336,9 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         #                       X2WIN_IMAGE, Y2WIN_IMAGE, XY2WIN_IMAGE
         #                       AWIN_IMAGE, BWIN_IMAGE, CXXWIN_IMAGE, CYYWIN_IMAGE, CXYWIN_IMAGE
         #
-        #          NOTE: Positional XWIN_IMAGE and YWIN_IMAGE are quite useful to 
-        #                provide a refined Object coordinate (of gaussian-like point sources). 
-        #                However we seldom use Window version of BasicShape parameters 
+        #          NOTE: Positional XWIN_IMAGE and YWIN_IMAGE are quite useful to
+        #                provide a refined Object coordinate (of gaussian-like point sources).
+        #                However we seldom use Window version of BasicShape parameters
         #                to describe the shape of the approaximated ellipse.
         #
         #    ** d. Generate Positional Paramters from PSF Fitting
@@ -334,13 +350,13 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         #       NOTE This process always works on Image4phot.
         #       @ISO: SEx simply count flux according to the hovering label map (ISOIsland).
         #       @APER: SEx count flux on a Circular Aperture with given PHOT_APERTURES (centred at Barycenter X_IMAGE, Y_IMAGE).
-        #       @AUTO: SEx count flux on a Elliptic Aperture, which is determined by the hovering isophotal ellipse 
+        #       @AUTO: SEx count flux on an Elliptic Aperture, which is determined by the hovering isophotal ellipse
         #              (centred at Barycenter X_IMAGE, Y_IMAGE). The leaked light fraction is typically less than 10%.
         #       @PSF: SEx count flux according to the PSF fitting results (centred at XPSF_IMAGE and YPSF_IMAGE).
         #             This is optimal for pointsource but fairly wrong for extended objects.
         #
         #    ** Peel background contribution from Counted Flux
-        #       @BACKPHOTO_TYPE='LOCAL': background will take a rectangular annulus into account, 
+        #       @BACKPHOTO_TYPE='LOCAL': background will take a rectangular annulus into account,
         #                                which has a donnut shape around the object, measured on Image4phot.
         #       @BACKPHOTO_TYPE='GLOBAL': background will directly use GBMap_4p,
         #                                 which can be Manual-Flat or Auto.
@@ -356,54 +372,54 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         #    @BACKGROUND : GBMap_4p
         #    @BACKGROUND_RMS : GBRMap_4d
         #    @-BACKGROUND : Image4phot - GBMap_4p
-        #    @FILTERED: Filtered_Image 
+        #    @FILTERED: Filtered_Image
         #    @SEGMENTATION: ISOIsland Label Map
         #    @OBJECTS: ISOIslands use flux in -BACKGROUND, zero otherwise
-        #    @APERTURES: -BACKGROUND with brighten apertures to show the isophotal ellipses. 
+        #    @APERTURES: -BACKGROUND with brighten apertures to show the isophotal ellipses.
         #
         # * SExtractor Workflow (Formula)
         #    a. MAG = -2.5 * log10(FLUX) + MAG_ZEROPOINT
-        #    b. MAGERR = 1.0857 * FLUXERR / FLUX = 1.0857 / SNR 
+        #    b. MAGERR = 1.0857 * FLUXERR / FLUX = 1.0857 / SNR
         #       NOTE: a common crude approxmation SNR ~ 1/MAGERR.
-        #       NOTE: It is [probably] derived from MAGERR = 2.5*np.log10(1.0+1.0/SNR) 
+        #       NOTE: It is [probably] derived from MAGERR = 2.5*np.log10(1.0+1.0/SNR)
         #                   Ref: http://www.ucolick.org/~bolte/AY257/s_n.pdf
         #                   ZTF use a more precise factor 1.085736, whatever, not a big deal.
         #
         # * Additional Clarification
         #    a. SExtractor can read GAIN & SATURATION & MAGZERO from FITS header of Image4phot by their keys.
-        #    b. If SExtractor is configured with FITS_LDAC, the FITS header of Image4phot will be delivered 
-        #       into output FITS file saved at output-FITS[1].data in table format 
+        #    b. If SExtractor is configured with FITS_LDAC, the FITS header of Image4phot will be delivered
+        #       into output FITS file saved at output-FITS[1].data in table format
         #       (only one element: a long integrated header text).
-        #    c. If some ISOIsland has saturated pixel value on Image4phot, you can still find it 
+        #    c. If some ISOIsland has saturated pixel value on Image4phot, you can still find it
         #       on SEGMENTATION / OBJECTS, and it will be marked by FLAGS=4 in output catalog.
         #    d. Windowed Coordinate has higher priority in the function
         #       i. Make Stamps ii. Convert to RD iii. Symmetric-Match
-        #       First use XWIN_IMAGE & YWIN_IMAGE (if exist), 
+        #       First use XWIN_IMAGE & YWIN_IMAGE (if exist),
         #       otherwise, employ X_IMAGE & Y_IMAGE instead.
-        #    e. SExtractor allows to submit request for multi-check images 
+        #    e. SExtractor allows requesting multiple check images at once
         #       e.g. CHECKIMAGE_TYPE = "BACKGROUND,SEGMENTATION,..."
         #    f. Although SExtractor can directly provide sky coordinates in output table,
+        #       We always obtain them independently by converting XY using astropy.
-        #    g. If VIGNET is called in SExtractor, stamps will be extracted around the targets, 
+        #       We always independently to get them by convert XY using astropy.
+        #    g. If VIGNET is called in SExtractor, stamps will be extracted around the targets,
         #       centred at their Barycenter X_IMAGE, Y_IMAGE, from Image4phot.
         #
         # * Additional Tips
         #    a. SNR_WIN: Window-based Gaussian-weighted SNR estimate
-        #       Although SNR_WIN is empirically tend to slightly underestimate noise, 
+        #       Although SNR_WIN empirically tends to slightly underestimate noise,
         #       this useful parameter is calculated independently from the employed phot-method.
         #
+        #    b. If you get a long runtime, it may be caused by
-        #       i. Low DETECT_THRESH 
+        #    b. If you got a long runtime, it may caused by
+        #       i. Low DETECT_THRESH
         #       ii. Request XWIN_IMAGE, YWIN_IMAGE, SNR_WIN
         #       iii. BACKPHOTO_TYPE == 'LOCAL'
         #
         #    c. WARNINGS
+        #       i. You may encounter a bug if sethead is run after fits.getdata
+        #       i. You may encounter bug if sethead after fits.getdata
         #       ii. SExtractor do not support string > 256 as argument in command line.
         #           Use 'cd dir && sex ...' to avoid segmentation fault.
         #       iii. FITS_LDAC --- TABLE-HDU 2    |   FITS_1.0 ---  TABLE-HDU 1
         #       iv. In Debian operation system, please correct 'sex' as 'sextractor'
-        # 
+        #
         #    d. A coarse look-up table between DETECT_THRESH and MINIMAL SNR_WIN
         #       DETECT_THRESH = 1.0 ---> minimal SNR_WIN = 3-4
         #       DETECT_THRESH = 1.2 ---> minimal SNR_WIN = 4-5
@@ -419,46 +435,49 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         #      ** When this mode is called, please read above comments very carefully.
         #
         #    @ Single-Image Mode: FITS_obj is Image4detect & Image4phot.
-        #      ** When sky background has been well subtracted. 
+        #      ** When sky background has been well subtracted.
         #         Typically Set: BACK_TYPE='MANUAL', BACK_VALUE=0.0, BACK_SIZE=64, BACK_FILTERSIZE=3, BACKPHOTO_TYPE='LOCAL'
         #         a. BACK_TYPE & BACK_VALUE: Use Flat-Zero as Global_Background_Map
         #         b. BACK_SIZE & BACK_FILTERSIZE: Produce RMS of Global_Background_Map [AUTO]
         #         c. BACKPHOTO_TYPE: Use LOCAL / GLOBAL (zero) background to count sky flux contribution in photometry.
         #
         # * Additional Remarks on Background
-        #   @ well-subtracted sky just means it probably outperforms SEx GLOBAL-Sky, 
-        #     we should not presume the underlying true sky is really subtracted, therefore, 
+        #   @ well-subtracted sky just means it probably outperforms SEx GLOBAL-Sky,
+        #     we should not presume the underlying true sky is really subtracted, therefore,
         #     BACKPHOTO_TYPE = 'LOCAL' is in general necessary !
         #
-        #   @ Just like sky subtraction, the sky term in image subtraction can only handle the 
-        #     low-spatial-frequency trend of background. The image has flat zero background as an ideal expectation, 
+        #   @ Just like sky subtraction, the sky term in image subtraction can only handle the
+        #     low-spatial-frequency trend of background. The image has flat zero background as an ideal expectation,
         #     we still need set BACKPHOTO_TYPE = 'LOCAL' on difference photometry.
         #
-        #   @ 'LOCAL' method itself can be biased too. We get the conclusion when we try to perform 
-        #     aperture photometry on two psf-homogenized DECam images. Recall any error on matched kernel is some 
-        #     linear effect for aperture photometry, however, we found a bias in FLUX_APER which is independent 
-        #     with the target birghtness. E.g. FLUX_APER_SCI is always smaller than FLUX_APER_REF with 
-        #     a nearly constant value, say 10.0 ADU, no matter the target is 19.0 mag or 22.0 mag, as if there is 
-        #     some constant leaked light when we measure on SCI. Equivalently, DMAG = MAG_APER_SCI - MAG_APER_REF 
+        #   @ 'LOCAL' method itself can be biased too. We get the conclusion when we try to perform
+        #     aperture photometry on two psf-homogenized DECam images. Recall any error on matched kernel is some
+        #     linear effect for aperture photometry, however, we found a bias in FLUX_APER which is independent
+        #     with the target brightness. E.g. FLUX_APER_SCI is always smaller than FLUX_APER_REF with
+        #     a nearly constant value, say 10.0 ADU, no matter the target is 19.0 mag or 22.0 mag, as if there is
+        #     some constant leaked light when we measure on SCI. Equivalently, DMAG = MAG_APER_SCI - MAG_APER_REF
         #     deviate zero-baseline, and it becomes increasingly serious (towards to the faint end).
         #
-        #     ------------------------------------ 
-        #     We guess the problem is caused by the fact: the background value calculated from the annulus 
-        #     around target might be a biased (over/under-) estimation. One observation supports our argument: 
+        #     ------------------------------------
+        #     We guess the problem is caused by the fact: the background value calculated from the annulus
+        #     around target might be a biased (over/under-) estimation. One observation supports our argument:
         #     the flux bias is much more evident when we increase the aperture size.
-        #     NOTE: As we have found the flux bias is basically a constant, we can do relative-calibration by 
-        #           calculating the compensation offset FLUX_APER_REF - FLUX_APER_SCI for a collection of sationary stars. 
-        #           It is 'relative' since we have just assumed background estimation of REF is correct, 
+        #     NOTE: As we have found the flux bias is basically a constant, we can do relative-calibration by
+        #           calculating the compensation offset FLUX_APER_REF - FLUX_APER_SCI for a collection of stationary stars.
+        #           It is 'relative' since we have just assumed background estimation of REF is correct,
         #           which is proper if we are going to derive the variability (light curve).
         #
         #   @ In which cases, Re-Run SExtractor can get the same coordinate list?
-        #     a. Same configurations but only change photometric method, e.g. from AUTO to APER 
+        #     a. Same configurations but only change photometric method, e.g. from AUTO to APER
         #     b. Same configurations but from Single-Image Mode to Dual-Image Mode
         #     NOTE: FLAGS is correlated to object image, if we add constraint FLAG=0
         #           then we fail to get the same coordinate list.
         #
         """
 
+        pid = multiprocessing.current_process().pid
+        logger.debug( f"PY_SEx.PS process {pid} starting {FITS_obj}" )
+
         # * sex or sextractor?
         for cmd in ['sex', 'sextractor']:
             try:
@@ -479,34 +498,60 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
             _message = 'Run Python Wrapper of SExtractor!'
             print('\nMeLOn CheckPoint [%s]: %s' %(objname, _message))
 
-        # * Keyword Configurations 
-        phr_obj = fits.getheader(FITS_obj, ext=0)       
+        # * Keyword Configurations
+        logger.debug( f"PY_SEx process {pid} reading header from {FITS_obj}..." )
+        retries = 5
+        while retries > 0:
+            # NOTE -- we were getting weird race condition issues where sometimes
+            #   the FITS header reading failed.  We aren't sure exactly what's going
+            #   on, but the hypothesis is bad filesystem behavior.  The earlier
+            #   writefits returns, but somehow the file wasn't really fully written
+            #   to either disk, or to the memory buffers that the fits.getheader
+            #   here read.  Put in an ugly hack of retrying and sleeping to work
+            #   around this.
+            try:
+                phr_obj = fits.getheader(FITS_obj, ext=0)
+                retries = 0
+            except Exception as ex:
+                retries -= 1
+                strio = io.StringIO()
+                strio.write( f"PY_SEx process {pid} got exception trying to read fits header: {ex}; " )
+                if retries > 0:
+                    strio.write( f"waiting 1s and trying again ({retries} tries left)" )
+                    logger.warning( strio.getvalue() )
+                    time.sleep( 1 )
+                else:
+                    strio.write( "repeatedly failed, giving up" )
+                    logger.error( strio.getvalue() )
+                    raise RuntimeError( f"Failed to read header from {FITS_obj}" )
+
+        logger.debug( f"... DONE: PY_SEx process {pid} reading header from {FITS_obj}" )
         if GAIN_KEY in phr_obj:
             GAIN = phr_obj[GAIN_KEY]
             if VERBOSE_LEVEL in [1, 2]:
                 _message = 'SExtractor uses GAIN = [%s] from keyword [%s]!' %(GAIN, GAIN_KEY)
                 print('MeLOn CheckPoint [%s]: %s' %(objname, _message))
-        else: 
+        else:
             GAIN = 0.0  # infinite GAIN, Poission noise ignored
             if VERBOSE_LEVEL in [0, 1, 2]:
                 _warn_message = 'SExtractor has to use default GAIN = 0!'
                 warnings.warn('MeLOn WARNING [%s]: %s' %(objname, _warn_message))
-        
+
         if SATUR_KEY in phr_obj:
             SATURATION = phr_obj[SATUR_KEY]
             if VERBOSE_LEVEL in [1, 2]:
                 _message = 'SExtractor uses SATURATION = [%s] from keyword [%s]!' %(SATURATION, SATUR_KEY)
                 print('MeLOn CheckPoint [%s]: %s' %(objname, _message))
-        else: 
+        else:
             SATURATION = 50000.0
             if VERBOSE_LEVEL in [0, 1, 2]:
                 _warn_message = 'SExtractor has to use default SATURATION = 50000.0!'
                 warnings.warn('MeLOn WARNING [%s]: %s' %(objname, _warn_message))
-        
+
         """
         # A few additional remarks
         # [1] MAGERR/FLUXERR/SNR are very sensitive to GAIN value.
-        # [2] PIXEL_SCALE (unit: arcsec) only works for surface brightness parameters, 
+        # [2] PIXEL_SCALE (unit: arcsec) only works for surface brightness parameters,
         #     FWHM (FWHM_WORLD) and star/galaxy separation. PIXEL_SCALE=0 uses FITS WCS info.
         # [3] SEEING_FWHM (unit: arcsec) is only for star/galaxy separation.
         #     VIP: You'd better to give a good estimate if star/galaxy separation is needed.
@@ -522,10 +567,10 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
 
         ConfigDict['GAIN_KEY'] = '%s' %GAIN_KEY
         ConfigDict['SATUR_KEY'] = '%s' %SATUR_KEY
-        ConfigDict['MAG_ZEROPOINT'] = '0.0'        
+        ConfigDict['MAG_ZEROPOINT'] = '0.0'
         ConfigDict['PIXEL_SCALE'] = '%s' %PIXEL_SCALE
         ConfigDict['SEEING_FWHM'] = '%s' %SEEING_FWHM
-        
+
         ConfigDict['BACK_TYPE'] = '%s' %BACK_TYPE
         ConfigDict['BACK_VALUE'] = '%s' %BACK_VALUE
         ConfigDict['BACK_SIZE'] = '%s' %BACK_SIZE
@@ -538,14 +583,14 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         ConfigDict['DEBLEND_NTHRESH'] = '%s' %DEBLEND_NTHRESH
         ConfigDict['DEBLEND_MINCONT'] = '%s' %DEBLEND_MINCONT
         ConfigDict['CLEAN'] = '%s' %CLEAN
-        
+
         ConfigDict['CHECKIMAGE_TYPE'] = '%s' %CHECKIMAGE_TYPE
         ConfigDict['BACKPHOTO_TYPE'] = '%s' %BACKPHOTO_TYPE
         if not isinstance(PHOT_APERTURES, (int, float)):
             ConfigDict['PHOT_APERTURES'] = '%s' %(','.join(np.array(PHOT_APERTURES).astype(str)))
         else: ConfigDict['PHOT_APERTURES'] = '%s' %PHOT_APERTURES
         if PSF_obj is not None: ConfigDict['PSF_NAME'] = '%s' %PSF_obj
-        
+
         # create configuration file .conv
         if USE_FILT:
             # see https://github.com/astromatic/sextractor/blob/master/config/default.conv
@@ -558,11 +603,11 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
             _cfile = open(conv_path, 'w')
             _cfile.write(conv_text)
             _cfile.close()
-        
+
         USE_NNW = False
-        if 'CLASS_STAR' in SExParam: 
+        if 'CLASS_STAR' in SExParam:
             USE_NNW = True
-        
+
         if USE_NNW:
             # see https://github.com/astromatic/sextractor/blob/master/config/default.nnw
             nnw_text = r"""
@@ -572,12 +617,12 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
             # outputs:	``Stellarity index'' (0.0 to 1.0)
             # Seeing FWHM range: from 0.025 to 5.5'' (images must have 1.5 < FWHM < 5 pixels)
             # Optimized for Moffat profiles with 2<= beta <= 4.
-            
+
              3 10 10  1
-            
+
             -1.56604e+00 -2.48265e+00 -1.44564e+00 -1.24675e+00 -9.44913e-01 -5.22453e-01  4.61342e-02  8.31957e-01  2.15505e+00  2.64769e-01
              3.03477e+00  2.69561e+00  3.16188e+00  3.34497e+00  3.51885e+00  3.65570e+00  3.74856e+00  3.84541e+00  4.22811e+00  3.27734e+00
-            
+
             -3.22480e-01 -2.12804e+00  6.50750e-01 -1.11242e+00 -1.40683e+00 -1.55944e+00 -1.84558e+00 -1.18946e-01  5.52395e-01 -4.36564e-01 -5.30052e+00
              4.62594e-01 -3.29127e+00  1.10950e+00 -6.01857e-01  1.29492e-01  1.42290e+00  2.90741e+00  2.44058e+00 -9.19118e-01  8.42851e-01 -4.69824e+00
             -2.57424e+00  8.96469e-01  8.34775e-01  2.18845e+00  2.46526e+00  8.60878e-02 -6.88080e-01 -1.33623e-02  9.30403e-02  1.64942e+00 -1.01231e+00
@@ -588,11 +633,11 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
              3.75075e+00  7.25399e+00 -1.75325e+00 -2.68814e+00 -3.71128e+00 -4.62933e+00 -2.13747e+00 -1.89186e-01  1.29122e+00 -7.49380e-01  6.71712e-01
             -8.41923e-01  4.64997e+00  5.65808e-01 -3.08277e-01 -1.01687e+00  1.73127e-01 -8.92130e-01  1.89044e+00 -2.75543e-01 -7.72828e-01  5.36745e-01
             -3.65598e+00  7.56997e+00 -3.76373e+00 -1.74542e+00 -1.37540e-01 -5.55400e-01 -1.59195e-01  1.27910e-01  1.91906e+00  1.42119e+00 -4.35502e+00
-            
+
             -1.70059e+00 -3.65695e+00  1.22367e+00 -5.74367e-01 -3.29571e+00  2.46316e+00  5.22353e+00  2.42038e+00  1.22919e+00 -9.22250e-01 -2.32028e+00
-            
-            
-             0.00000e+00 
+
+
+             0.00000e+00
              1.00000e+00 """
 
             nintent = len(re.split('NNW', re.split('\n', nnw_text)[1])[0])
@@ -605,9 +650,9 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
         # create configuration file .param
         USExParam = SExParam.copy()
         if SExParam is None: USExParam = []
-        if 'X_IMAGE' not in USExParam: 
+        if 'X_IMAGE' not in USExParam:
             USExParam.append('X_IMAGE')
-        if 'Y_IMAGE' not in USExParam: 
+        if 'Y_IMAGE' not in USExParam:
             USExParam.append('Y_IMAGE')
 
         Param_path = ''.join([TDIR, "/PYSEx.param"])
@@ -630,19 +675,36 @@ def PS(FITS_obj, PSF_obj=None, FITS_ref=None, SExParam=None, CATALOG_TYPE='FITS_
 
         sex_config_path = AMConfig_Maker.AMCM(MDIR=TDIR, AstroMatic_KEY=SEx_KEY, \
             ConfigDict=ConfigDict, tag='PYSEx')
-            
+
         # * Trigger SExtractor
         if FITS_ref is None:
-            os.system("cd %s && %s %s -c %s -CATALOG_NAME %s -CHECKIMAGE_NAME %s" \
-                %(pa.dirname(FITS_obj), SEx_KEY, FNAME, sex_config_path, \
-                  FITS_SExCat, ','.join(FITS_SExCheckLst)))
-        
+            cmd_to_run = [ SEx_KEY, FNAME, '-c', sex_config_path,
+                           '-CATALOG_NAME', FITS_SExCat,
+                           '-CHECKIMAGE_NAME', ','.join(FITS_SExCheckLst) ]
+            logger.debug( f"PY_SEx process {pid} running: {cmd_to_run}..." )
+            res = subprocess.run( cmd_to_run,
+                                  cwd=pa.dirname(FITS_obj),
+                                  capture_output=True )
+            if res.returncode != 0:
+                barf = ( f"*** PY_SEx.PS process {pid} {SEx_KEY} subprocess return code {res.returncode}\n"
+                         f"*** stdout ***\n{res.stdout}\n*** stderr ***\n{res.stderr}\n" )
+                logger.error( barf )
+                raise RuntimeError( f"{SEx_KEY} failed: {barf}" )
+            logger.debug( f"...DONE: PY_SEx process {pid} running: {cmd_to_run}" )
+
+            # os.system("cd %s && %s %s -c %s -CATALOG_NAME %s -CHECKIMAGE_NAME %s" \
+            #     %(pa.dirname(FITS_obj), SEx_KEY, FNAME, sex_config_path, \
+            #       FITS_SExCat, ','.join(FITS_SExCheckLst)))
+
         if FITS_ref is not None:
             # WARNING: more risky to fail due to the too long string.
+
+            raise RuntimeError( "FITS_ref was not None, fix the code." )
+
             os.system("%s %s,%s -c %s -CATALOG_NAME %s -CHECKIMAGE_NAME %s" \
                 %(SEx_KEY, FITS_ref, FITS_obj, sex_config_path, \
-                  FITS_SExCat, ','.join(FITS_SExCheckLst)))    
-        
+                  FITS_SExCat, ','.join(FITS_SExCheckLst)))
+
         """
         # Deprecated as it requires WCSTools, and seems not very useful
         def record(FITS):
@@ -653,7 +715,7 @@ def record(FITS):
                 pack = ' : '.join(['Sex Parameters', key, value])
                 os.system('sethead %s HISTORY="%s"' %(FITS, pack))
             return None
-        
+
         """
 
         if CATALOG_TYPE == 'FITS_LDAC': tbhdu = 2
@@ -662,7 +724,7 @@ def record(FITS):
             #if MDIR is not None: record(FITS_SExCat)
             AstSEx = Table.read(FITS_SExCat, hdu=tbhdu)
         else: FITS_SExCat, AstSEx = None, None
-        
+
         PixA_SExCheckLst = []
         FtmpLst = FITS_SExCheckLst.copy()
         for k, FITS_SExCheck in enumerate(FITS_SExCheckLst):
@@ -691,15 +753,15 @@ def record(FITS):
                     MAGERR_TYPES = ['MAGERR_' + MAGT[4:] for MAGT in MAG_TYPES]
                     FLUX_TYPES = ['FLUX_' + MAGT[4:] for MAGT in MAG_TYPES]
                     FLUXERR_TYPES = ['FLUXERR_' + MAGT[4:] for MAGT in MAG_TYPES]
-                    
+
                     for i in range(len(MAG_TYPES)):
 
                         pcomplete = (MAGERR_TYPES[i] in USExParam) & \
                                     (FLUX_TYPES[i] in USExParam) & \
                                     (FLUXERR_TYPES[i] in USExParam)
-                        
+
                         if not pcomplete:
-                            _error_message = 'Please use complete FLUX FLUXERR MAG MAGERR ' 
+                            _error_message = 'Please use complete FLUX FLUXERR MAG MAGERR '
                             _error_message += 'in SExParam for Negative Flux Correction!'
                             raise Exception('MeLOn ERROR [%s]: %s' %(objname, _error_message))
 
@@ -708,13 +770,13 @@ def record(FITS):
                         Mask_NC = FLUX < 0.0
                         MAG_NC = -2.5*np.log10(np.abs(FLUX[Mask_NC]))
                         MAGERR_NC = 1.0857 * np.abs(FLUXERR[Mask_NC] / FLUX[Mask_NC])
-                        
+
                         AstSEx[MAG_TYPES[i]][Mask_NC] = MAG_NC
                         AstSEx[MAGERR_TYPES[i]][Mask_NC] = MAGERR_NC
                     Modify_AstSEx = True
 
             # ** b. ADD-COLUMN SEGLABEL if SEGMENTATION requested.
-            #       If some lines are discarded later, corresponding 
+            #       If some lines are discarded later, corresponding
             #       SEGLABEL will be lost in the output table.
 
             if 'SEGMENTATION' in CHECKIMAGE_TYPE:
@@ -730,17 +792,17 @@ def record(FITS):
                 w_obj = Read_WCS.RW(phr_obj, VERBOSE_LEVEL=VERBOSE_LEVEL)
                 _XY = np.array([AstSEx['X_IMAGE'], AstSEx['Y_IMAGE']]).T
                 _RD = w_obj.all_pix2world(_XY, 1)
-                AstSEx.add_column(Column(_RD[:, 0], name='X_WORLD'))          
+                AstSEx.add_column(Column(_RD[:, 0], name='X_WORLD'))
                 AstSEx.add_column(Column(_RD[:, 1], name='Y_WORLD'))
 
                 if 'XWIN_IMAGE' in USExParam:
                     if 'YWIN_IMAGE' in USExParam:
                         _XY = np.array([AstSEx['XWIN_IMAGE'], AstSEx['YWIN_IMAGE']]).T
                         _RD = w_obj.all_pix2world(_XY, 1)
-                        AstSEx.add_column(Column(_RD[:, 0], name='XWIN_WORLD'))          
+                        AstSEx.add_column(Column(_RD[:, 0], name='XWIN_WORLD'))
                         AstSEx.add_column(Column(_RD[:, 1], name='YWIN_WORLD'))
                 Modify_AstSEx = True
-            
+
             # ** d. Restriction on FLAGS
             if ONLY_FLAGS is not None:
                 if 'FLAGS' not in SExParam:
@@ -748,13 +810,13 @@ def record(FITS):
                     raise Exception('MeLOn ERROR [%s]: %s' %(objname, _error_message))
                 else:
                     _OLEN = len(AstSEx)
-                    AstSEx = AstSEx[np.in1d(AstSEx['FLAGS'], ONLY_FLAGS)]    
+                    AstSEx = AstSEx[np.in1d(AstSEx['FLAGS'], ONLY_FLAGS)]
                     Modify_AstSEx = True
 
                     if VERBOSE_LEVEL in [2]:
                         _message = 'PYSEx excludes [%d / %d] sources by FLAGS restriction!' %(_OLEN - len(AstSEx), _OLEN)
                         print('MeLOn CheckPoint [%s]: %s' %(objname, _message))
-            
+
             # ** e. Remove Boundary Sources
             if XBoundary != 0.0 or XBoundary != 0.0:
                 NX, NY = int(phr_obj['NAXIS1']), int(phr_obj['NAXIS2'])
@@ -764,7 +826,7 @@ def record(FITS):
                                                    _XY[:, 0] < NX - XBoundary + 0.5, \
                                                    _XY[:, 1] > YBoundary + 0.5, \
                                                    _XY[:, 1] < NY - YBoundary + 0.5))
-                
+
                 _OLEN = len(AstSEx)
                 AstSEx = AstSEx[InnerMask]
                 Modify_AstSEx = True
@@ -796,11 +858,11 @@ def record(FITS):
             if RD_Quest is not None:
                 _RD = np.array([AstSEx[RAcoln_4Match], AstSEx[DECcoln_4Match]]).T
                 Symm = Sky_Symmetric_Match.SSM(RD_A=RD_Quest, RD_B=_RD, tol=Match_rdtol, return_distance=False)
-            
+
             if Symm is not None:
                 Modify_AstSEx = True
                 if Preserve_NoMatch:
-                    QuestMATCH = np.zeros(len(AstSEx)).astype(bool)        
+                    QuestMATCH = np.zeros(len(AstSEx)).astype(bool)
                     QuestMATCH[Symm[:, 1]] = True
                     AstSEx.add_column(Column(QuestMATCH, name='QuestMATCH'))
 
@@ -810,13 +872,13 @@ def record(FITS):
                 else:
                     _OLEN = len(AstSEx)
                     AstSEx = AstSEx[Symm[:, 1]]
-                    
+
                     QuestMATCH = np.ones(len(AstSEx)).astype(bool)
                     AstSEx.add_column(Column(QuestMATCH, name='QuestMATCH'))
-                    
+
                     QuestINDEX = Symm[:, 0]
                     AstSEx.add_column(Column(QuestINDEX, name='QuestINDEX'))
-                    
+
                     if VERBOSE_LEVEL in [2]:
                         _message = 'PYSEx excludes [%d / %d] sources by symmetric matching!' %(_OLEN - len(AstSEx), _OLEN)
                         print('MeLOn CheckPoint [%s]: %s' %(objname, _message))
@@ -833,7 +895,7 @@ def record(FITS):
                     FILL_VALUE=np.nan, FITS_StpLst=None, VERBOSE_LEVEL=VERBOSE_LEVEL)
                 AstSEx.add_column(Column(PixA_StpLst, name='Stamp'))
                 Modify_AstSEx = True
-            
+
             # ** UPDATE the file FITS_SExCat
             if MDIR is not None and Modify_AstSEx:
                 tFITS_SExCat = ''.join([TDIR, '/TMPCAT_%s' %FNAME])
@@ -848,7 +910,7 @@ def record(FITS):
                 os.system('rm %s' %tFITS_SExCat)
 
         # ** REMOVE temporary directory
-        if MDIR is None: 
+        if MDIR is None:
             os.system('rm -rf %s' %TDIR)
 
         return AstSEx, PixA_SExCheckLst, FITS_SExCat, FITS_SExCheckLst

From 4791ea08d8affe8d8630964fc079b57e7502de9d Mon Sep 17 00:00:00 2001
From: "Rob Knop (Nersc)" <raknop@lbl.gov>
Date: Mon, 6 Jan 2025 11:12:43 -0800
Subject: [PATCH 2/3] Add missing CudaResampling.py

---
 sfft/utils/CudaResampling.py | 410 +++++++++++++++++++++++++++++++++++
 1 file changed, 410 insertions(+)
 create mode 100644 sfft/utils/CudaResampling.py

diff --git a/sfft/utils/CudaResampling.py b/sfft/utils/CudaResampling.py
new file mode 100644
index 0000000..20a738b
--- /dev/null
+++ b/sfft/utils/CudaResampling.py
@@ -0,0 +1,410 @@
+import time
+import warnings
+import cupy as cp
+import numpy as np
+from math import floor
+from astropy.wcs import WCS, FITSFixedWarning
+from sfft.utils.CupyWCSTransform import Cupy_WCS_Transform
+
+__last_update__ = "2024-09-19"
+__author__ = "Lei Hu <leihu@andrew.cmu.edu>"
+
+class Cuda_Resampling:
+    def __init__(self, RESAMP_METHOD="BILINEAR", VERBOSE_LEVEL=2):
+        """
+        Image resampling using CUDA.
+        """
+        self.RESAMP_METHOD = RESAMP_METHOD
+        self.VERBOSE_LEVEL = VERBOSE_LEVEL
+        return None
+    
+    def projection_cd(self, hdr_obj, hdr_targ, CDKEY="CD"):
+        """Mapping the target pixel centers to the object frame using Cupy for CD WCS (NO distortion)"""
+        NTX = int(hdr_targ["NAXIS1"]) 
+        NTY = int(hdr_targ["NAXIS2"])
+
+        XX_targ_GPU, YY_targ_GPU = cp.meshgrid(
+            cp.arange(0, NTX) + 1., 
+            cp.arange(0, NTY) + 1., 
+            indexing='ij'
+        )
+        
+        CWT = Cupy_WCS_Transform()
+        # * perform CD transformation through target WCS
+        if True:
+            # read header [target]
+            KEYDICT, CD_GPU = CWT.read_cd_wcs(hdr_wcs=hdr_targ, CDKEY=CDKEY)
+            CRPIX1_targ, CRPIX2_targ = KEYDICT["CRPIX1"], KEYDICT["CRPIX2"]
+            
+            # relative to reference point [target]
+            u_GPU, v_GPU = XX_targ_GPU.flatten() - CRPIX1_targ, YY_targ_GPU.flatten() - CRPIX2_targ
+            
+            # CD transformation [target]
+            x_GPU, y_GPU = CWT.cd_transform(IMAGE_X_GPU=u_GPU, IMAGE_Y_GPU=v_GPU, CD_GPU=CD_GPU)
+
+        # * perform CD^-1 transformation through object WCS
+        if True:
+            # read header [object]
+            KEYDICT, CD_GPU = CWT.read_cd_wcs(hdr_wcs=hdr_obj, CDKEY=CDKEY)
+            CRPIX1_obj, CRPIX2_obj = KEYDICT["CRPIX1"], KEYDICT["CRPIX2"]
+
+            # relative to reference point, consider offset between the WCS reference points [object]
+            # WARNING: the offset calculations may be wrong when the WCS reference points are very close to the N/S Poles!
+            x_GPU += (CRPIX1_targ - CRPIX1_obj + 180.0) % 360.0 - 180.0   # from using target reference point to object
+            y_GPU += CRPIX2_targ - CRPIX2_obj                             # ~
+
+            # inverse CD transformation [object]
+            u_GPU, v_GPU = CWT.cd_transform_inv(WORLD_X_GPU=x_GPU, WORLD_Y_GPU=y_GPU, CD_GPU=CD_GPU)
+            
+            # relative to image origin [object]
+            XX_proj_GPU = CRPIX1_obj + u_GPU.reshape((NTX, NTY))
+            YY_proj_GPU = CRPIX2_obj + v_GPU.reshape((NTX, NTY))
+        
+        return XX_proj_GPU, YY_proj_GPU
+
+    def projection_sip(self, hdr_obj, hdr_targ, Nsamp=1024, RANDOM_SEED=10086):
+        """Mapping the target pixel centers to the object frame using Cupy for SIP WCS"""
+        NTX = int(hdr_targ["NAXIS1"]) 
+        NTY = int(hdr_targ["NAXIS2"])
+
+        XX_targ_GPU, YY_targ_GPU = cp.meshgrid(
+            cp.arange(0, NTX) + 1., 
+            cp.arange(0, NTY) + 1., 
+            indexing='ij'
+        )
+        
+        CWT = Cupy_WCS_Transform()
+        # * perform forward transformation (+CD) through target WCS
+        if True:
+            # read header [target]
+            KEYDICT, CD_GPU, A_SIP_GPU, B_SIP_GPU = CWT.read_sip_wcs(hdr_wcs=hdr_targ)
+            CRPIX1_targ, CRPIX2_targ = KEYDICT["CRPIX1"], KEYDICT["CRPIX2"]
+            CRVAL1_targ, CRVAL2_targ = KEYDICT["CRVAL1"], KEYDICT["CRVAL2"]
+            
+            # relative to reference point [target]
+            u_GPU, v_GPU = XX_targ_GPU.flatten() - CRPIX1_targ, YY_targ_GPU.flatten() - CRPIX2_targ
+            
+            # forward transformation for target grid of pixel centers [target]
+            U_GPU, V_GPU = CWT.sip_forward_transform(u_GPU=u_GPU, v_GPU=v_GPU, A_SIP_GPU=A_SIP_GPU, B_SIP_GPU=B_SIP_GPU)
+            
+            # plus CD transformation [target]
+            x_GPU, y_GPU = CWT.cd_transform(IMAGE_X_GPU=U_GPU, IMAGE_Y_GPU=V_GPU, CD_GPU=CD_GPU)
+        
+        # * perform backward transformation (+CD^-1) through object WCS
+        if True:
+            # read header [object]
+            KEYDICT, CD_GPU, A_SIP_GPU, B_SIP_GPU = CWT.read_sip_wcs(hdr_wcs=hdr_obj)
+            CRPIX1_obj, CRPIX2_obj = KEYDICT["CRPIX1"], KEYDICT["CRPIX2"]
+            CRVAL1_obj, CRVAL2_obj = KEYDICT["CRVAL1"], KEYDICT["CRVAL2"]
+
+            # sampling random coordinates [object]
+            u_GPU, v_GPU = CWT.random_coord_sampling(N0=KEYDICT["N0"], N1=KEYDICT["N1"], 
+                CRPIX1=CRPIX1_obj, CRPIX2=CRPIX2_obj, Nsamp=Nsamp, RANDOM_SEED=RANDOM_SEED)
+            
+            # fit polynomial form backward transformation [object]
+            U_GPU, V_GPU = CWT.sip_forward_transform(u_GPU=u_GPU, v_GPU=v_GPU, A_SIP_GPU=A_SIP_GPU, B_SIP_GPU=B_SIP_GPU)
+            AP_lstsq_GPU, BP_lstsq_GPU = CWT.lstsq_sip_backward_transform(u_GPU=u_GPU, v_GPU=v_GPU, 
+                U_GPU=U_GPU, V_GPU=V_GPU, A_ORDER=KEYDICT["A_ORDER"], B_ORDER=KEYDICT["B_ORDER"])[2:4]
+
+            # relative to reference point, consider offset between the WCS reference points [object]
+            # WARNING: the offset calculations may be wrong when the WCS reference points are very close to the N/S Poles!
+            # TODO: this is not accurate, ~ 1pix.
+            offset1 = ((CRVAL1_targ - CRVAL1_obj + 180.0) % 360.0 - 180.0) * cp.cos(cp.deg2rad((CRVAL2_targ + CRVAL2_obj)/2.))
+            offset2 = CRVAL2_targ - CRVAL2_obj
+            x_GPU += offset1
+            y_GPU += offset2
+
+            # inverse CD transformation [object]
+            U_GPU, V_GPU = CWT.cd_transform_inv(WORLD_X_GPU=x_GPU, WORLD_Y_GPU=y_GPU, CD_GPU=CD_GPU)
+            
+            # backward transformation [object]
+            FP_UV_GPU = CWT.sip_backward_matrix(U_GPU=U_GPU, V_GPU=V_GPU, ORDER=KEYDICT["A_ORDER"])
+            GP_UV_GPU = CWT.sip_backward_matrix(U_GPU=U_GPU, V_GPU=V_GPU, ORDER=KEYDICT["B_ORDER"])
+
+            u_GPU, v_GPU = CWT.sip_backward_transform(U_GPU=U_GPU, V_GPU=V_GPU, 
+                FP_UV_GPU=FP_UV_GPU, GP_UV_GPU=GP_UV_GPU, AP_GPU=AP_lstsq_GPU, BP_GPU=BP_lstsq_GPU)
+            
+            # relative to image origin [object]
+            XX_proj_GPU = CRPIX1_obj + u_GPU.reshape((NTX, NTY))
+            YY_proj_GPU = CRPIX2_obj + v_GPU.reshape((NTX, NTY))
+
+        return XX_proj_GPU, YY_proj_GPU
+
+    def projection_astropy(self, hdr_obj, hdr_targ):
+        """Mapping the target pixel centers to the object frame using Astropy"""
+        # * read object WCS and target WCS
+        def _readWCS(hdr, VERBOSE_LEVEL=2):
+            with warnings.catch_warnings():
+                if VERBOSE_LEVEL in [0, 1]: behavior = 'ignore'
+                if VERBOSE_LEVEL in [2]: behavior = 'default'
+                warnings.filterwarnings(behavior, category=FITSFixedWarning)
+
+                if hdr['CTYPE1'] == 'RA---TAN' and 'PV1_0' in hdr:
+                    _hdr = hdr.copy()
+                    _hdr['CTYPE1'] = 'RA---TPV'
+                    _hdr['CTYPE2'] = 'DEC--TPV'
+                else: _hdr = hdr
+                w = WCS(_hdr)
+            return w
+
+        w_obj = _readWCS(hdr=hdr_obj, VERBOSE_LEVEL=self.VERBOSE_LEVEL)
+        w_targ = _readWCS(hdr=hdr_targ, VERBOSE_LEVEL=self.VERBOSE_LEVEL)
+
+        # * mapping target pixel centers to the object frame
+        NTX = int(hdr_targ["NAXIS1"]) 
+        NTY = int(hdr_targ["NAXIS2"])
+        
+        XX_targ, YY_targ = np.meshgrid(np.arange(0, NTX)+1., np.arange(0, NTY)+1., indexing='ij')
+        XY_targ = np.array([XX_targ.flatten(), YY_targ.flatten()]).T
+        XY_proj = w_obj.all_world2pix(w_targ.all_pix2world(XY_targ, 1), 1)
+
+        XX_proj = XY_proj[:, 0].reshape((NTX, NTY))
+        YY_proj = XY_proj[:, 1].reshape((NTX, NTY))
+
+        XX_proj_GPU = cp.array(XX_proj, dtype=np.float64)
+        YY_proj_GPU = cp.array(YY_proj, dtype=np.float64)
+
+        return XX_proj_GPU, YY_proj_GPU
+
+    def frame_extension(self, XX_proj_GPU, YY_proj_GPU, PixA_obj_GPU):
+        """Zero-pad the object frame so that every pixel the resampling kernel may read exists.
+
+        XX_proj_GPU, YY_proj_GPU: 2D (NTX, NTY) arrays of projected coordinates in the object frame.
+        PixA_obj_GPU: 2D (NOX, NOY) pixel array of the object frame.
+        Returns (PixA_Eobj_GPU, EProjDict): the zero-padded frame and a dict with the frame sizes
+        (NTX, NTY, NOX, NOY, NEOX, NEOY) plus the coordinates shifted into the padded frame.
+        Raises ValueError if self.RESAMP_METHOD is neither 'BILINEAR' nor 'LANCZOS3'.
+        """
+        NTX, NTY = XX_proj_GPU.shape
+        NOX, NOY = PixA_obj_GPU.shape
+
+        # * padding the object frame; the kernel half-width depends on the resampling method
+        if self.RESAMP_METHOD == 'BILINEAR':
+            KERHW = (1, 1)
+        elif self.RESAMP_METHOD == 'LANCZOS3':
+            KERHW = (3, 3)
+        else:
+            raise ValueError("Unsupported RESAMP_METHOD: %r (expected 'BILINEAR' or 'LANCZOS3')" % (self.RESAMP_METHOD,))
+
+        # find the root index and shift with maximal kernel halfwidth (KERHW)
+        # Note: the index ranges (RMIN --- RMAX) and (CMIN --- CMAX) cover
+        #       all pixels that the interpolation may use.
+
+        RMIN = (floor(XX_proj_GPU.min().item()) - 1) - KERHW[0]
+        RMAX = (floor(XX_proj_GPU.max().item()) - 1) + KERHW[0]
+        RPAD = (-np.min([RMIN, 0]), np.max([RMAX - (NOX - 1), 0]))  # (pad before, pad after) on axis 0; 0 when already inside
+
+        CMIN = (floor(YY_proj_GPU.min().item()) - 1) - KERHW[1]
+        CMAX = (floor(YY_proj_GPU.max().item()) - 1) + KERHW[1]
+        CPAD = (-np.min([CMIN, 0]), np.max([CMAX - (NOY - 1), 0]))  # same for axis 1
+
+        PAD_WIDTH = (RPAD, CPAD)
+        PixA_Eobj_GPU = cp.pad(PixA_obj_GPU, PAD_WIDTH, mode='constant', constant_values=0.)
+        NEOX, NEOY = PixA_Eobj_GPU.shape
+
+        XX_Eproj_GPU = XX_proj_GPU + PAD_WIDTH[0][0]  # shift the coordinates by the leading pad
+        YY_Eproj_GPU = YY_proj_GPU + PAD_WIDTH[1][0]
+
+        RMIN_E = (floor(XX_Eproj_GPU.min().item()) - 1) - KERHW[0]
+        RMAX_E = (floor(XX_Eproj_GPU.max().item()) - 1) + KERHW[0]
+        CMIN_E = (floor(YY_Eproj_GPU.min().item()) - 1) - KERHW[1]
+        CMAX_E = (floor(YY_Eproj_GPU.max().item()) - 1) + KERHW[1]
+        assert RMIN_E >= 0 and CMIN_E >= 0  # internal invariant: the index range starts inside the padded frame
+        assert RMAX_E < NEOX and CMAX_E < NEOY  # ... and ends inside it
+
+        EProjDict = {
+            'NTX': NTX, 'NTY': NTY,
+            'NOX': NOX, 'NOY': NOY,
+            'NEOX': NEOX, 'NEOY': NEOY,
+            'XX_Eproj_GPU': XX_Eproj_GPU, 'YY_Eproj_GPU': YY_Eproj_GPU,
+        }
+
+        return PixA_Eobj_GPU, EProjDict
+    
+    def resampling(self, PixA_Eobj_GPU, EProjDict):
+        """Resample the extended object frame onto the target grid with CUDA kernels; returns PixA_resamp_GPU of shape (NTX, NTY)."""
+        NTX = EProjDict['NTX']
+        NTY = EProjDict['NTY']
+
+        NEOX = EProjDict['NEOX']
+        NEOY = EProjDict['NEOY']
+
+        XX_Eproj_GPU = EProjDict['XX_Eproj_GPU']
+        YY_Eproj_GPU = EProjDict['YY_Eproj_GPU']
+
+        # * Cupy configuration: 2D launch, at most MaxThreadPerB threads per block along each axis
+        MaxThreadPerB = 8
+        GPUManage = lambda NT: ((NT-1)//MaxThreadPerB + 1, min(NT, MaxThreadPerB))  # -> (number of blocks, threads per block)
+        BpG_PIX0, TpB_PIX0 = GPUManage(NTX)
+        BpG_PIX1, TpB_PIX1 = GPUManage(NTY)
+        BpG_PIX, TpB_PIX = (BpG_PIX0, BpG_PIX1), (TpB_PIX0, TpB_PIX1, 1)
+
+        PixA_resamp_GPU = cp.zeros((NTX, NTY), dtype=np.float64)  # output buffer; must start at zero (the LANCZOS3 kernel accumulates with +=)
+
+        if self.RESAMP_METHOD == "BILINEAR":
+        
+            # * perform bilinear resampling using CUDA
+            # input: PixA_Eobj | (NEOX, NEOY)
+            # input: XX_Eproj, YY_Eproj | (NTX, NTY)
+            # output: PixA_resamp | (NTX, NTY)
+            
+            _refdict = {'NTX': NTX, 'NTY': NTY, 'NEOX': NEOX, 'NEOY': NEOY}
+            _funcstr = r"""
+            extern "C" __global__ void kmain(double XX_Eproj_GPU[%(NTX)s][%(NTY)s], double YY_Eproj_GPU[%(NTX)s][%(NTY)s], 
+                double PixA_Eobj_GPU[%(NEOX)s][%(NEOY)s], double PixA_resamp_GPU[%(NTX)s][%(NTY)s])
+            {
+                int ROW = blockIdx.x*blockDim.x+threadIdx.x;
+                int COL = blockIdx.y*blockDim.y+threadIdx.y;
+
+                int NTX = %(NTX)s;
+                int NTY = %(NTY)s;
+                int NEOX = %(NEOX)s;
+                int NEOY = %(NEOY)s;
+
+                if (ROW < NTX && COL < NTY) {
+                
+                    double x = XX_Eproj_GPU[ROW][COL];
+                    double y = YY_Eproj_GPU[ROW][COL];
+
+                    int r1 = floor(x) - 1;
+                    int c1 = floor(y) - 1;
+                    int r2 = r1 + 1;
+                    int c2 = c1 + 1;
+
+                    double dx = x - floor(x); 
+                    double dy = y - floor(y);
+
+                    double w11 = (1-dx) * (1-dy);
+                    double w12 = (1-dx) * dy;
+                    double w21 = dx * (1-dy);
+                    double w22 = dx * dy;
+
+                    PixA_resamp_GPU[ROW][COL] = w11 * PixA_Eobj_GPU[r1][c1] + w12 * PixA_Eobj_GPU[r1][c2] + 
+                        w21 * PixA_Eobj_GPU[r2][c1] + w22 * PixA_Eobj_GPU[r2][c2];
+                }
+            }
+            """
+            _code = _funcstr % _refdict  # array sizes are baked into the kernel source as compile-time constants
+            _module = cp.RawModule(code=_code, backend=u'nvcc', translate_cucomplex=False)
+            resamp_func = _module.get_function('kmain')
+            
+            t0 = time.time()  # NOTE(review): no explicit device synchronisation before the timer is read below -- confirm what the printed time covers
+            resamp_func(args=(XX_Eproj_GPU, YY_Eproj_GPU, PixA_Eobj_GPU, PixA_resamp_GPU), block=TpB_PIX, grid=BpG_PIX)
+            if self.VERBOSE_LEVEL in [1, 2]:
+                print('MeLOn CheckPoint: Cuda resampling takes [%.6f s]' %(time.time() - t0))
+            
+        if self.RESAMP_METHOD == "LANCZOS3":
+            
+            # * perform LANCZOS-3 resampling using CUDA in two kernels: (1) per-pixel 6-tap weights, (2) 6x6 weighted sum
+            # input: PixA_Eobj | (NEOX, NEOY); XX_Eproj, YY_Eproj | (NTX, NTY)
+            # output: PixA_resamp | (NTX, NTY)
+            
+            _refdict = {'NTX': NTX, 'NTY': NTY}
+            _funcstr = r"""
+            extern "C" __global__ void kmain(double XX_Eproj_GPU[%(NTX)s][%(NTY)s], double YY_Eproj_GPU[%(NTX)s][%(NTY)s], 
+                double LKERNEL_X_GPU[6][%(NTX)s][%(NTY)s], double LKERNEL_Y_GPU[6][%(NTX)s][%(NTY)s])
+            {
+                int ROW = blockIdx.x*blockDim.x+threadIdx.x;
+                int COL = blockIdx.y*blockDim.y+threadIdx.y;
+
+                int NTX = %(NTX)s;
+                int NTY = %(NTY)s;
+                
+                double PI = 3.141592653589793;
+                double PIS = 9.869604401089358;
+                
+                if (ROW < NTX && COL < NTY) {
+                    
+                    double x = XX_Eproj_GPU[ROW][COL];
+                    double y = YY_Eproj_GPU[ROW][COL];
+                    
+                    double dx = x - floor(x); 
+                    double dy = y - floor(y);
+                    
+                    // LANCZOS3 weights in x axis
+                    double wx0 = 3.0 * sin(PI*(-2.0 - dx)) * sin(PI*(-2.0 - dx)/3.0) / (PIS*(-2.0 - dx) * (-2.0 - dx));
+                    double wx1 = 3.0 * sin(PI*(-1.0 - dx)) * sin(PI*(-1.0 - dx)/3.0) / (PIS*(-1.0 - dx) * (-1.0 - dx));
+                    double wx2 = 1.0;
+                    if (fabs(dx) > 1e-4) {
+                        wx2 = 3.0 * sin(PI*(-dx)) * sin(PI*(-dx)/3.0) / (PIS*(-dx) * (-dx));
+                    }
+                    double wx3 = 3.0 * sin(PI*(1.0 - dx)) * sin(PI*(1.0 - dx)/3.0) / (PIS*(1.0 - dx) * (1.0 - dx));
+                    double wx4 = 3.0 * sin(PI*(2.0 - dx)) * sin(PI*(2.0 - dx)/3.0) / (PIS*(2.0 - dx) * (2.0 - dx));
+                    double wx5 = 3.0 * sin(PI*(3.0 - dx)) * sin(PI*(3.0 - dx)/3.0) / (PIS*(3.0 - dx) * (3.0 - dx));
+                    
+                    LKERNEL_X_GPU[0][ROW][COL] = wx0;
+                    LKERNEL_X_GPU[1][ROW][COL] = wx1;
+                    LKERNEL_X_GPU[2][ROW][COL] = wx2;
+                    LKERNEL_X_GPU[3][ROW][COL] = wx3;
+                    LKERNEL_X_GPU[4][ROW][COL] = wx4;
+                    LKERNEL_X_GPU[5][ROW][COL] = wx5;
+                    
+                    // LANCZOS3 weights in y axis
+                    double wy0 = 3.0 * sin(PI*(-2.0 - dy)) * sin(PI*(-2.0 - dy)/3.0) / (PIS*(-2.0 - dy) * (-2.0 - dy));
+                    double wy1 = 3.0 * sin(PI*(-1.0 - dy)) * sin(PI*(-1.0 - dy)/3.0) / (PIS*(-1.0 - dy) * (-1.0 - dy));
+                    double wy2 = 1.0;
+                    if (fabs(dy) > 1e-4) {
+                        wy2 = 3.0 * sin(PI*(-dy)) * sin(PI*(-dy)/3.0) / (PIS*(-dy) * (-dy));
+                    }
+                    double wy3 = 3.0 * sin(PI*(1.0 - dy)) * sin(PI*(1.0 - dy)/3.0) / (PIS*(1.0 - dy) * (1.0 - dy));
+                    double wy4 = 3.0 * sin(PI*(2.0 - dy)) * sin(PI*(2.0 - dy)/3.0) / (PIS*(2.0 - dy) * (2.0 - dy));
+                    double wy5 = 3.0 * sin(PI*(3.0 - dy)) * sin(PI*(3.0 - dy)/3.0) / (PIS*(3.0 - dy) * (3.0 - dy));
+                    
+                    LKERNEL_Y_GPU[0][ROW][COL] = wy0;
+                    LKERNEL_Y_GPU[1][ROW][COL] = wy1;
+                    LKERNEL_Y_GPU[2][ROW][COL] = wy2;
+                    LKERNEL_Y_GPU[3][ROW][COL] = wy3;
+                    LKERNEL_Y_GPU[4][ROW][COL] = wy4;
+                    LKERNEL_Y_GPU[5][ROW][COL] = wy5;
+                }
+            }
+            """
+            _code = _funcstr % _refdict
+            _module = cp.RawModule(code=_code, backend=u'nvcc', translate_cucomplex=False)
+            weightkernel_func = _module.get_function('kmain')  # kernel 1: fills LKERNEL_X / LKERNEL_Y
+            
+            _refdict = {'NTX': NTX, 'NTY': NTY, 'NEOX': NEOX, 'NEOY': NEOY}
+            _funcstr = r"""
+            extern "C" __global__ void kmain(double XX_Eproj_GPU[%(NTX)s][%(NTY)s], double YY_Eproj_GPU[%(NTX)s][%(NTY)s], 
+                double LKERNEL_X_GPU[6][%(NTX)s][%(NTY)s], double LKERNEL_Y_GPU[6][%(NTX)s][%(NTY)s], 
+                double PixA_Eobj_GPU[%(NEOX)s][%(NEOY)s], double PixA_resamp_GPU[%(NTX)s][%(NTY)s])
+            {
+                int ROW = blockIdx.x*blockDim.x+threadIdx.x;
+                int COL = blockIdx.y*blockDim.y+threadIdx.y;
+
+                int NTX = %(NTX)s;
+                int NTY = %(NTY)s;
+                int NEOX = %(NEOX)s;
+                int NEOY = %(NEOY)s;
+
+                if (ROW < NTX && COL < NTY) {
+                
+                    double x = XX_Eproj_GPU[ROW][COL];
+                    double y = YY_Eproj_GPU[ROW][COL];
+                    
+                    int r0 = floor(x) - 3;
+                    int c0 = floor(y) - 3;
+                    
+                    for(int i = 0; i < 6; ++i){
+                        for(int j = 0; j < 6; ++j){
+                            double w = LKERNEL_X_GPU[i][ROW][COL] * LKERNEL_Y_GPU[j][ROW][COL];
+                            PixA_resamp_GPU[ROW][COL] += w * PixA_Eobj_GPU[r0 + i][c0 + j];
+                        }
+                    }
+                }
+            }
+            """
+            _code = _funcstr % _refdict
+            _module = cp.RawModule(code=_code, backend=u'nvcc', translate_cucomplex=False)
+            resamp_func = _module.get_function('kmain')  # kernel 2: accumulates the weighted 6x6 neighbourhood
+            
+            t0 = time.time()  # NOTE(review): no explicit device synchronisation before the timer is read below -- confirm what the printed time covers
+            LKERNEL_X_GPU = cp.zeros((6, NTX, NTY), dtype=np.float64)  # six x-axis weights per target pixel
+            LKERNEL_Y_GPU = cp.zeros((6, NTX, NTY), dtype=np.float64)  # six y-axis weights per target pixel
+            weightkernel_func(args=(XX_Eproj_GPU, YY_Eproj_GPU, LKERNEL_X_GPU, LKERNEL_Y_GPU), block=TpB_PIX, grid=BpG_PIX)
+            resamp_func(args=(XX_Eproj_GPU, YY_Eproj_GPU, LKERNEL_X_GPU, LKERNEL_Y_GPU, 
+                PixA_Eobj_GPU, PixA_resamp_GPU), block=TpB_PIX, grid=BpG_PIX)
+            if self.VERBOSE_LEVEL in [1, 2]:
+                print('MeLOn CheckPoint: Cuda resampling takes [%.6f s]' %(time.time() - t0))
+
+        return PixA_resamp_GPU  # NOTE: still all zeros if RESAMP_METHOD matched neither branch above

From eee96453cd40e12e62c5251d69fd43ae533a8c95 Mon Sep 17 00:00:00 2001
From: "Rob Knop (Nersc)" <raknop@lbl.gov>
Date: Tue, 7 Jan 2025 10:05:58 -0800
Subject: [PATCH 3/3] Add missing CupyWCSTransform.py

---
 sfft/utils/CupyWCSTransform.py | 241 +++++++++++++++++++++++++++++++++
 1 file changed, 241 insertions(+)
 create mode 100644 sfft/utils/CupyWCSTransform.py

diff --git a/sfft/utils/CupyWCSTransform.py b/sfft/utils/CupyWCSTransform.py
new file mode 100644
index 0000000..ceb47b0
--- /dev/null
+++ b/sfft/utils/CupyWCSTransform.py
@@ -0,0 +1,241 @@
+import cupy as cp
+
+__last_update__ = "2024-09-19"
+__author__ = "Shu Liu <shl159@pitt.edu> & Lei Hu <leihu@andrew.cmu.edu>"
+
+class Cupy_WCS_Transform:
+    def __init__(self):
+        return None  # nothing to initialise: no attributes are set on the instance
+
+    def read_cd_wcs(self, hdr_wcs, CDKEY="CD"):
+        """Read a plain TAN WCS (frame size, reference point, 2x2 CD matrix) from the header hdr_wcs.
+        # * Note on the CD matrix transformation:
+        The sky coordinate (x, y) relative to reference point can be connected with 
+        the image coordinate (u, v) relative to reference point by the CD matrix:
+            [x]   [CD1_1 CD1_2] [u]
+            [y] = [CD2_1 CD2_2] [v]
+        where CD1_1, CD1_2, CD2_1, CD2_2 are stored in the FITS header.
+        Returns (KEYDICT, CD_GPU): a dict with N0, N1, CRPIX1/2, CRVAL1/2 and the CD matrix as a (2, 2) float64 cupy array.
+        """
+        assert hdr_wcs["CTYPE1"] == "RA---TAN"  # only plain TAN headers are accepted (AssertionError otherwise)
+        assert hdr_wcs["CTYPE2"] == "DEC--TAN"
+
+        N0 = int(hdr_wcs["NAXIS1"])
+        N1 = int(hdr_wcs["NAXIS2"])
+
+        CRPIX1 = float(hdr_wcs["CRPIX1"])
+        CRPIX2 = float(hdr_wcs["CRPIX2"])
+
+        CRVAL1 = float(hdr_wcs["CRVAL1"])
+        CRVAL2 = float(hdr_wcs["CRVAL2"])
+
+        KEYDICT = {
+            "N0": N0, "N1": N1, 
+            "CRPIX1": CRPIX1, "CRPIX2": CRPIX2,
+            "CRVAL1": CRVAL1, "CRVAL2": CRVAL2
+        }
+
+        CD1_1 = hdr_wcs[f"{CDKEY}1_1"]  # CDKEY is the keyword prefix of the matrix elements, e.g. "CD" -> "CD1_1"
+        CD1_2 = hdr_wcs[f"{CDKEY}1_2"] if f"{CDKEY}1_2" in hdr_wcs else 0.  # off-diagonal terms default to 0 when absent
+        CD2_1 = hdr_wcs[f"{CDKEY}2_1"] if f"{CDKEY}2_1" in hdr_wcs else 0.
+        CD2_2 = hdr_wcs[f"{CDKEY}2_2"]  # diagonal terms are required (lookup fails if missing)
+
+        CD_GPU = cp.array([
+            [CD1_1, CD1_2], 
+            [CD2_1, CD2_2]
+        ], dtype=cp.float64)
+        
+        return KEYDICT, CD_GPU
+
+    def read_sip_wcs(self, hdr_wcs):
+        """Read a TAN-SIP WCS (frame size, reference point, CD matrix, forward SIP coefficients) from the header hdr_wcs.
+        # * Note on the SIP transformation:
+        Let (u, v) be the image coordinate relative to the reference point; the undistorted image coordinate (U, V) is:
+            U = u + f(u, v)
+            V = v + g(u, v)
+        where f(u, v) and g(u, v) are the SIP distortion functions with polynomial form:
+            f(u, v) = sum_{p, q} A_{pq} u^p v^q, p + q <= A_ORDER
+            g(u, v) = sum_{p, q} B_{pq} u^p v^q, p + q <= B_ORDER
+        These coefficients are stored in the FITS header and define a forward transformation from (u, v) to (U, V). 
+        
+        The sky coordinate (x, y) relative to reference point can be connected with (U, V) by the invertible CD matrix:
+            [x]   [CD1_1 CD1_2] [U]
+            [y] = [CD2_1 CD2_2] [V]
+        So the forward transformation & CD matrix (taking the reference point into account) is equivalent to a pix2world function.
+        
+        The reverse question is how to obtain the backward transformation from undistorted (U, V) to distorted (u, v).
+        Once we have the backward transformation, we can combine it with the inverted CD matrix 
+        (taking the reference point into account) to get the world2pix function.
+
+        A simple approach (not accurate) is to assume the backward transformation also has a polynomial form:
+            u = U + fp(U, V)
+            v = V + gp(U, V)
+        where 
+            fp(U, V) = sum_{p, q} AP_{pq} U^p V^q, p + q <= A_ORDER
+            gp(U, V) = sum_{p, q} BP_{pq} U^p V^q, p + q <= B_ORDER
+        
+        The coefficients AP_{pq} and BP_{pq} can be obtained by solving the linear equations separately.
+            [u - U] = [U^p*V^q] [AP_{pq}]
+            
+            shapes: 
+            b: [u - U]   | (N, 1), 
+            A: [U^p*V^q] | (N, (A_ORDER+1)*(A_ORDER+2)/2)
+            x: [AP_{pq}] | ((A_ORDER+1)*(A_ORDER+2)/2, 1)
+            
+            [v - V] = [U^p*V^q] [BP_{pq}]
+            b: [v - V]   | (N, 1),
+            A: [U^p*V^q] | (N, (B_ORDER+1)*(B_ORDER+2)/2)
+            x: [BP_{pq}] | ((B_ORDER+1)*(B_ORDER+2)/2, 1)
+
+            The two matrices encode the "backward" transformation in fp(U, V) and gp(U, V).
+            Let's denote the two matrices as FP_UV and GP_UV, respectively.
+
+        # * A WCS example with SIP distortion
+        CTYPE1  = 'RA---TAN-SIP'
+        CTYPE2  = 'DEC--TAN-SIP'
+        
+        CRPIX1  =               2044.0 
+        CRPIX2  =               2044.0 
+        
+        CD1_1   = 2.65458074767927E-05
+        CD1_2   = -1.2630331175158E-05 
+        CD2_1   = 1.38917634264203E-05 
+        CD2_2   = 2.57785917553667E-05
+        
+        CRVAL1  =    9.299707734492053 
+        CRVAL2  =  -43.984663860136145
+
+        A_ORDER =                    4 
+        A_0_2   =     -4.774423702E-10 
+        A_0_3   =     -2.462744787E-14 
+        A_0_4   =       2.28913156E-17 
+        A_1_1   =      1.076615801E-09 
+        A_1_2   =     -9.939904553E-14 
+        A_1_3   =     -5.845336863E-17 
+        A_2_0   =     -3.717865118E-10 
+        A_2_1   =     -4.966118494E-15 
+        A_2_2   =      6.615859199E-17 
+        A_3_0   =     -3.817793356E-15 
+        A_3_1   =      3.310020049E-17 
+        A_4_0   =     -5.166048303E-19 
+        
+        B_ORDER =                    4 
+        B_0_2   =      1.504792792E-09 
+        B_0_3   =     -2.662833066E-15 
+        B_0_4   =     -8.383877471E-20 
+        B_1_1   =      1.204553316E-10 
+        B_1_2   =       6.57748238E-14 
+        B_1_3   =     -6.998508554E-18 
+        B_2_0   =      4.136013664E-10 
+        B_2_1   =      5.339208582E-14 
+        B_2_2   =      6.063403412E-17 
+        B_3_0   =      1.486316541E-14 
+        B_3_1   =     -9.102668806E-17 
+        B_4_0   =     -5.174323631E-17
+        Returns (KEYDICT, CD_GPU, A_SIP_GPU, B_SIP_GPU); A_SIP_GPU[p, q] / B_SIP_GPU[p, q] hold A_p_q / B_p_q (0 where absent).
+        """
+        assert hdr_wcs["CTYPE1"] == "RA---TAN-SIP"  # only TAN-SIP headers are accepted (AssertionError otherwise)
+        assert hdr_wcs["CTYPE2"] == "DEC--TAN-SIP"
+
+        N0 = int(hdr_wcs["NAXIS1"])
+        N1 = int(hdr_wcs["NAXIS2"])
+
+        CRPIX1 = float(hdr_wcs["CRPIX1"])
+        CRPIX2 = float(hdr_wcs["CRPIX2"])
+
+        CRVAL1 = float(hdr_wcs["CRVAL1"])
+        CRVAL2 = float(hdr_wcs["CRVAL2"])
+
+        A_ORDER = int(hdr_wcs["A_ORDER"])
+        B_ORDER = int(hdr_wcs["B_ORDER"])
+
+        KEYDICT = {
+            "N0": N0, "N1": N1, 
+            "CRPIX1": CRPIX1, "CRPIX2": CRPIX2,
+            "CRVAL1": CRVAL1, "CRVAL2": CRVAL2, 
+            "A_ORDER": A_ORDER, "B_ORDER": B_ORDER
+        }
+
+        CD_GPU = cp.array([
+            [hdr_wcs["CD1_1"], hdr_wcs["CD1_2"]], 
+            [hdr_wcs["CD2_1"], hdr_wcs["CD2_2"]]
+        ], dtype=cp.float64)  # unlike read_cd_wcs, all four CD keywords are required here
+
+        A_SIP_GPU = cp.zeros((A_ORDER+1, A_ORDER+1), dtype=cp.float64)  # entries with p + q > A_ORDER are never filled and stay 0
+        B_SIP_GPU = cp.zeros((B_ORDER+1, B_ORDER+1), dtype=cp.float64)
+
+        for p in range(A_ORDER + 1):
+            for q in range(0, A_ORDER - p + 1):  # only terms with p + q <= A_ORDER
+                keyword = f"A_{p}_{q}"
+                if keyword in hdr_wcs:  # missing coefficients are left at 0
+                    A_SIP_GPU[p, q] = hdr_wcs[keyword]
+        
+        for p in range(B_ORDER + 1):
+            for q in range(0, B_ORDER - p + 1):  # only terms with p + q <= B_ORDER
+                keyword = f"B_{p}_{q}"
+                if keyword in hdr_wcs:
+                    B_SIP_GPU[p, q] = hdr_wcs[keyword]
+        
+        return KEYDICT, CD_GPU, A_SIP_GPU, B_SIP_GPU
+
+    def cd_transform(self, IMAGE_X_GPU, IMAGE_Y_GPU, CD_GPU):
+        """CD matrix transformation from image to world: returns CD_GPU @ [IMAGE_X_GPU, IMAGE_Y_GPU] as (WORLD_X_GPU, WORLD_Y_GPU)."""
+        WORLD_X_GPU, WORLD_Y_GPU = cp.matmul(CD_GPU, cp.array([IMAGE_X_GPU, IMAGE_Y_GPU]))  # the two inputs are stacked as the rows of one array
+        return WORLD_X_GPU, WORLD_Y_GPU
+    
+    def cd_transform_inv(self, WORLD_X_GPU, WORLD_Y_GPU, CD_GPU):
+        """CD matrix transformation from world to image: applies the inverse of CD_GPU, returns (IMAGE_X_GPU, IMAGE_Y_GPU)."""
+        # closed-form 2x2 inverse (adjugate divided by the determinant), used in place of cp.linalg.inv(CD_GPU)
+        CD_inv_GPU = 1. / (CD_GPU[0, 0] * CD_GPU[1, 1] - CD_GPU[0, 1] * CD_GPU[1, 0]) * \
+            cp.array([[CD_GPU[1, 1], -CD_GPU[0, 1]], [-CD_GPU[1, 0], CD_GPU[0, 0]]])
+        IMAGE_X_GPU, IMAGE_Y_GPU = cp.matmul(CD_inv_GPU, cp.array([WORLD_X_GPU, WORLD_Y_GPU]))
+        return IMAGE_X_GPU, IMAGE_Y_GPU
+    
+    def sip_forward_transform(self, u_GPU, v_GPU, A_SIP_GPU, B_SIP_GPU):
+        """Forward SIP transformation from (u, v) to (U, V): U = u + sum A[p, q] u^p v^q, V = v + sum B[p, q] u^p v^q."""
+        A_ORDER = A_SIP_GPU.shape[0] - 1  # polynomial order recovered from the coefficient matrix size
+        U_GPU = u_GPU + cp.zeros_like(u_GPU)  # new array, so the in-place += below does not modify the caller's u_GPU
+        for p in range(A_ORDER + 1):
+            for q in range(A_ORDER - p + 1):  # terms with p + q <= A_ORDER
+                U_GPU += A_SIP_GPU[p, q] * u_GPU**p * v_GPU**q
+            
+        B_ORDER = B_SIP_GPU.shape[0] - 1
+        V_GPU = v_GPU + cp.zeros_like(v_GPU)  # new array, same reason as for U_GPU
+        for p in range(B_ORDER + 1):
+            for q in range(B_ORDER - p + 1):  # terms with p + q <= B_ORDER
+                V_GPU += B_SIP_GPU[p, q] * u_GPU**p * v_GPU**q
+        return U_GPU, V_GPU
+
+    def sip_backward_matrix(self, U_GPU, V_GPU, ORDER):
+        """Compute the backward matrix P_UV (FP_UV or GP_UV): one column U^p * V^q per (p, q) with p + q <= ORDER."""
+        Nsamp = len(U_GPU)
+        P_UV_GPU = cp.zeros((Nsamp, (ORDER+1)*(ORDER+2)//2), dtype=cp.float64)  # (ORDER+1)*(ORDER+2)/2 = number of (p, q) terms
+        idx = 0  # running column index; columns are ordered by p, then by q
+        for p in range(ORDER + 1):
+            for q in range(ORDER - p + 1):
+                P_UV_GPU[:, idx] = U_GPU**p * V_GPU**q
+                idx += 1
+        return P_UV_GPU
+
+    def lstsq_sip_backward_transform(self, u_GPU, v_GPU, U_GPU, V_GPU, A_ORDER, B_ORDER):
+        """Fit the backward transformation from (U, V) to (u, v); returns (FP_UV_GPU, GP_UV_GPU, AP_lstsq_GPU, BP_lstsq_GPU)."""
+        FP_UV_GPU = self.sip_backward_matrix(U_GPU=U_GPU, V_GPU=V_GPU, ORDER=A_ORDER)
+        GP_UV_GPU = self.sip_backward_matrix(U_GPU=U_GPU, V_GPU=V_GPU, ORDER=B_ORDER)
+        # least-squares solve of FP_UV @ AP = (u - U) and GP_UV @ BP = (v - V); [0] keeps only the solution vector
+        AP_lstsq_GPU = cp.linalg.lstsq(FP_UV_GPU, (u_GPU - U_GPU).reshape(-1, 1), rcond=None)[0]
+        BP_lstsq_GPU = cp.linalg.lstsq(GP_UV_GPU, (v_GPU - V_GPU).reshape(-1, 1), rcond=None)[0]
+        return FP_UV_GPU, GP_UV_GPU, AP_lstsq_GPU, BP_lstsq_GPU
+
+    def sip_backward_transform(self, U_GPU, V_GPU, FP_UV_GPU, GP_UV_GPU, AP_GPU, BP_GPU):
+        """Backward transformation from (U, V) to (u, v): u = U + FP_UV @ AP, v = V + GP_UV @ BP."""
+        u_GPU = U_GPU + cp.matmul(FP_UV_GPU, AP_GPU)[:, 0]  # [:, 0] takes the first (presumably only) column of the product
+        v_GPU = V_GPU + cp.matmul(GP_UV_GPU, BP_GPU)[:, 0]
+        return u_GPU, v_GPU
+
+    def random_coord_sampling(self, N0, N1, CRPIX1, CRPIX2, Nsamp=1024, RANDOM_SEED=10086):
+        """Random sampling of coordinates: Nsamp uniform draws over the frame, returned as offsets (u, v) from (CRPIX1, CRPIX2)."""
+        if RANDOM_SEED is not None:
+            cp.random.seed(RANDOM_SEED)  # seeds cupy's random generator; pass RANDOM_SEED=None to skip seeding
+        u_GPU = cp.random.uniform(0.5, N0+0.5, Nsamp, dtype=cp.float64) - CRPIX1  # range [0.5, N0+0.5): presumably pixel edges in 1-based pixel coordinates
+        v_GPU = cp.random.uniform(0.5, N1+0.5, Nsamp, dtype=cp.float64) - CRPIX2
+        return u_GPU, v_GPU