diff --git a/README.md b/README.md
index 908c347..bec8d06 100644
--- a/README.md
+++ b/README.md
@@ -200,6 +200,9 @@ Spatial regularization (with distributed multi-directional calibration) enables
 
 After each solution, images showing the spatial model (amplitude) will be created as ```.PPM``` files.
 
+#### 5a) Diffuse sky models in calibration
+When a spatial model is enabled, the spatial model can also be applied to a model of the diffuse sky background. The diffuse sky model can have any number of shapelet components, but they should all belong to a single cluster (say cluster *45*). The ```-D``` option enables the use of the diffuse sky model together with the spatial model, e.g. ```-D 45,0.1```, where *0.1* is the regularization factor used when applying the spatial model. The larger this regularization factor is, the more strongly the spatial model is enforced onto the diffuse sky model.
+
 ### 6) Solution format
 All SAGECal solutions are stored as text files. Lines starting with '#' are comments. The first non-comment line includes some general information, i.e.
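A minimal illustrative invocation for the new ```-D``` option (hypothetical command line; every other calibration option is elided, and the diffuse shapelet components are assumed to be grouped as cluster *45* in the cluster file):

```
sagecal-mpi ... -D 45,0.1
```

Raising the second value (e.g. ```-D 45,1.0```) enforces the spatial model more strongly on the diffuse sky model.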
diff --git a/src/MPI/main.cpp b/src/MPI/main.cpp
index 218bfd2..eb153b5 100644
--- a/src/MPI/main.cpp
+++ b/src/MPI/main.cpp
@@ -34,7 +34,7 @@ using namespace Data;
 void
 print_copyright(void) {
-  cout<<"SAGECal-MPI 0.8.2 (C) 2011-2024 Sarod Yatawatta"<<endl;
[...]
+      vector<...> debug_vec(mymscount);
+      if (Data::spatialreg && sp_diffuse_id>=0) {
+        for(int cm=0; cm<[...]
[...]
       vector<...> p_vec(mymscount);
       vector<...> pm_vec(mymscount);
@@ -657,7 +671,28 @@ cout<[...]
 [...]>=Nadmm-Data::admm_cadence) {
+        for (int nb=0; nb<[...]
[...]
diff --git [...]
@@ [...] @@
 #include "Dirac.h"
+#define FISTA_L_MIN 1e2
+#define FISTA_L_MAX 1e7
 
 /*
  * Z = arg min \| Z_k - Z Phi_k \|^2 + \lambda \|Z\|^2 + \mu \|Z\|_1
@@ -42,6 +44,11 @@ update_spatialreg_fista(complex double *Z, complex double *Zbar, complex double
   complex double *Zold,*Y;
   /* Lipschitz constant of gradient, use ||Phikk||^2 as estimate */
   double L=my_cdot(2*G*2*G,Phikk,Phikk);
+  /* if 1/L too large, might diverge, so catch it */
+  if (L<FISTA_L_MIN) { L=FISTA_L_MIN; }
+  if (L>FISTA_L_MAX) { L=FISTA_L_MAX; }
+
+  /* initial t */
   double t=1.0;
   if ((gradf=(complex double*)calloc((size_t)2*Npoly*N*2*G,sizeof(complex double)))==0) {
@@ -75,7 +82,7 @@ update_spatialreg_fista(complex double *Z, complex double *Zbar, complex double
     /* take gradient descent step Y - 1/L gradf */
     my_caxpy(2*Npoly*N*2*G, gradf, -1.0/L, Y);
     /* soft threshold and update Z */
-    double thresh=t*mu;
+    double thresh=mu/L;
     for (int ci=0; ci<2*Npoly*N*2*G; ci++) {
       double r=creal(Y[ci]);
       double r1=fabs(r)-thresh;
@@ -90,12 +97,13 @@ update_spatialreg_fista(complex double *Z, complex double *Zbar, complex double
     }
     double t0=t;
     t=(1.0+sqrt(1.0+4.0*t*t))*0.5;
-    /* update Y = Z + (t-1)/told (Z-Zold) = (1+(t-1)/told) Z - (t-1)/told Zold */
+    /* Zold <= Zold-Z */
+    my_caxpy(2*Npoly*N*2*G, Z, -1.0, Zold);
+    printf("FISTA %d ||grad||=%lf ||Z-Zold||=%lf\n",it,my_dnrm2(2*2*Npoly*N*2*G,(double*)gradf),my_dnrm2(2*2*Npoly*N*2*G,(double*)Zold)/my_dnrm2(2*2*Npoly*N*2*G,(double*)Z));
+    /* update Y = Z + (t0-1)/t (Z-Zold) */
     memcpy(Y,Z,2*Npoly*N*2*G*sizeof(complex double));
-    double scalefac=(t-1.0)/t0;
-    my_cscal(2*Npoly*N*2*G,1.0+scalefac,Y);
+    double scalefac=(t0-1.0)/t;
     my_caxpy(2*Npoly*N*2*G, Zold, -scalefac, Y);
-    //printf("%lf %lf %lf %lf %lf\n",t,creal(Y[10]),cimag(Y[10]),creal(Z[10]),cimag(Z[10]));
   }
   free(gradf);
@@ -130,6 +138,11 @@ update_spatialreg_fista_with_diffconstraint(complex double *Z, complex double *Z
   complex double *Zold,*Y;
   /* Lipschitz constant of gradient, use ||Phikk||^2 as estimate */
   double L=my_cdot(2*G*2*G,Phikk,Phikk);
+  /* if 1/L too large, might diverge, so catch it */
+  if (L<FISTA_L_MIN) { L=FISTA_L_MIN; }
+  if (L>FISTA_L_MAX) { L=FISTA_L_MAX; }
+
+  /* initial t */
   double t=1.0;
   if ((gradf=(complex double*)calloc((size_t)2*Npoly*N*2*G,sizeof(complex double)))==0) {
@@ -173,7 +186,7 @@ update_spatialreg_fista_with_diffconstraint(complex double *Z, complex double *Z
     /* take gradient descent step Y - 1/L gradf */
     my_caxpy(2*Npoly*N*2*G, gradf, -1.0/L, Y);
     /* soft threshold and update Z */
-    double thresh=t*mu;
+    double thresh=mu/L;
     for (int ci=0; ci<2*Npoly*N*2*G; ci++) {
       double r=creal(Y[ci]);
       double r1=fabs(r)-thresh;
@@ -188,12 +201,13 @@ update_spatialreg_fista_with_diffconstraint(complex double *Z, complex double *Z
     }
     double t0=t;
     t=(1.0+sqrt(1.0+4.0*t*t))*0.5;
-    /* update Y = Z + (t-1)/told (Z-Zold) = (1+(t-1)/told) Z - (t-1)/told Zold */
+    /* Zold <= Zold-Z */
+    my_caxpy(2*Npoly*N*2*G, Z, -1.0, Zold);
+    printf("FISTA %d ||grad||=%lf ||Z-Zold||=%lf\n",it,my_dnrm2(2*2*Npoly*N*2*G,(double*)gradf),my_dnrm2(2*2*Npoly*N*2*G,(double*)Zold)/my_dnrm2(2*2*Npoly*N*2*G,(double*)Z));
+    /* update Y = Z + (t0-1)/t (Z-Zold) */
     memcpy(Y,Z,2*Npoly*N*2*G*sizeof(complex double));
-    double scalefac=(t-1.0)/t0;
-    my_cscal(2*Npoly*N*2*G,1.0+scalefac,Y);
+    double scalefac=(t0-1.0)/t;
     my_caxpy(2*Npoly*N*2*G, Zold, -scalefac, Y);
-    //printf("%lf %lf %lf %lf %lf\n",t,creal(Y[10]),cimag(Y[10]),creal(Z[10]),cimag(Z[10]));
   }
   free(gradf);
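The ```thresh=mu/L``` change above matches the standard FISTA proximal step: for min f(Z) + mu*||Z||_1 with a gradient step of length 1/L, the proximal operator is soft-thresholding at level mu/L, whereas the old ```t*mu``` tied the threshold to the momentum parameter t, which grows with the iteration count. A minimal scalar sketch of one such step (illustrative only; the patch applies it to the real and imaginary parts of Z separately):

```
#include <math.h>

/* One proximal-gradient step for min f(z) + mu*|z|, scalar case.
 * L is the (clamped) Lipschitz constant of grad f, as in the patch. */
static double
fista_prox_step(double y, double gradf, double mu, double L) {
  double z=y-gradf/L;        /* gradient descent step */
  double thresh=mu/L;        /* soft-threshold level */
  double r=fabs(z)-thresh;
  return (r>0.0 ? r : 0.0)*(z>=0.0 ? 1.0 : -1.0);
}
```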
diff --git a/src/lib/Dirac/myblas.c b/src/lib/Dirac/myblas.c
index 38603cd..ec81fe2 100644
--- a/src/lib/Dirac/myblas.c
+++ b/src/lib/Dirac/myblas.c
@@ -61,6 +61,11 @@ __attribute__ ((target(MIC)))
   dscal_(&N,&a,x,&i);
 }
 void
+my_dscal_inc(int N, double a, double *x, int inc) {
+  extern void dscal_(int *N, double *alpha, double *x, int *incx);
+  dscal_(&N,&a,x,&inc);
+}
+void
 my_sscal(int N, float a, float *x) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
@@ -91,6 +96,14 @@ __attribute__ ((target(MIC)))
   int i=1;
   return(dnrm2_(&N,x,&i));
 }
+double
+my_dnrm2_inc(int N, double *x, int inc) {
+#ifdef USE_MIC
+__attribute__ ((target(MIC)))
+#endif
+  extern double dnrm2_(int *N, double *x, int *incx);
+  return(dnrm2_(&N,x,&inc));
+}
 float
 my_fnrm2(int N, float *x) {
 #ifdef USE_MIC
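The new ```_inc``` wrappers expose the BLAS stride argument that ```my_dscal```/```my_dnrm2``` hard-code to 1. One natural use (an assumption, not shown in this patch) is addressing only the real or only the imaginary parts of an interleaved complex array:

```
#include <complex.h>

/* Declarations as added by this patch (normally pulled in via the
 * library header). */
extern void my_dscal_inc(int N, double a, double *x, int inc);
extern double my_dnrm2_inc(int N, double *x, int inc);

/* Stride 2 over the interleaved (re,im) layout selects real parts;
 * starting at x+1 selects imaginary parts. */
void
scale_imag_by_real_norm(complex double *z, int n) {
  double nrm_re=my_dnrm2_inc(n,(double *)z,2);
  if (nrm_re>0.0) {
    my_dscal_inc(n,1.0/nrm_re,(double *)z+1,2);
  }
}
```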
diff --git a/src/lib/Radio/Dirac_radio.h b/src/lib/Radio/Dirac_radio.h
index 36433cf..92002a2 100644
--- a/src/lib/Radio/Dirac_radio.h
+++ b/src/lib/Radio/Dirac_radio.h
@@ -223,9 +223,10 @@ precalculate_coherencies_multifreq(double *u, double *v, double *w, complex doub
 
 /****************************** diffuse_predict.c ****************************/
+/* use_cuda: if 1, use the GPU version, else only the CPU version */
 extern int
 recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double *x, int N,
-   int Nbase, baseline_t *barr, clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, int diffuse_cluster, int sh_n0, double sh_beta, complex double *Z, int Nt);
+   int Nbase, baseline_t *barr, clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, int diffuse_cluster, int sh_n0, double sh_beta, complex double *Z, int Nt, int use_cuda);
 
 /****************************** transforms.c ****************************/
 #ifndef ASEC2RAD
 #define ASEC2RAD 4.848136811095359935899141e-6
@@ -477,6 +478,11 @@ precess_source_locations_deprecated(double jd_tdb, clus_source_t *carr, int M, d
 
 /****************************** predict_withbeam_cuda.c ****************************/
 #ifdef HAVE_CUDA
+/* copy Nx1 double array x to device as float,
+   first allocate device memory (needs to be freed later) */
+extern void
+dtofcopy(int N, float **x_d, double *x);
+
 /* if dobeam==0, beam calculation is off
    else, flag to determine if full (element+array), array only,
    or element only beam is calculated */
@@ -544,6 +550,9 @@ cudakernel_correct_residuals(int B, int N, int Nb, int boff, int F, int nchunk,
 
 extern void
 cudakernel_convert_time(int T, double *time_utc);
+
+extern void
+cudakernel_calculate_shapelet_coherencies(float u, float v, float *modes, float *fact, int n0, float beta, double *coh);
 #endif /* !HAVE_CUDA */
diff --git a/src/lib/Radio/diffuse_predict.c b/src/lib/Radio/diffuse_predict.c
index ec11ad2..95ad9a4 100644
--- a/src/lib/Radio/diffuse_predict.c
+++ b/src/lib/Radio/diffuse_predict.c
@@ -24,6 +24,9 @@
 
 #include "Dirac.h"
 #include "Dirac_radio.h"
+#ifdef HAVE_CUDA
+#include "Dirac_GPUtune.h"
+#endif
 #include <[...]>
 
 typedef struct thread_data_shap_ {
@@ -45,6 +48,11 @@ typedef struct thread_data_shap_ {
   complex double *modes; /* all modes, 4*n0*n0*stations^2 (only half is calculated) */
   int modes_n0; /* basis n0xn0 */
   double modes_beta; /* scale */
+
+#ifdef HAVE_CUDA
+  int tid; /* this thread id */
+  taskhist *hst; /* for load balancing GPUs */
+#endif /* HAVE_CUDA */
 } thread_data_shap_t;
 
@@ -117,6 +125,137 @@ shapelet_pred_threadfn(void *data) {
 
   return NULL;
 }
 
+/*****************************************************************************/
+#ifdef HAVE_CUDA
+#define CUDA_DEBUG
+static void
+checkCudaError(cudaError_t err, char *file, int line)
+{
+#ifdef CUDA_DEBUG
+  if(!err)
+    return;
+  fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
+  exit(EXIT_FAILURE);
+#endif
+}
+
+static void *
+shapelet_pred_threadfn_cuda(void *data) {
+  thread_data_shap_t *t=(thread_data_shap_t*)data;
+
+  /* kernel will spawn threads to cover baselines t->Nb,
+   * which can be lower than the total baselines */
+  int card;
+  card=select_work_gpu(MAX_GPU_ID,t->hst);
+
+  cudaError_t err;
+  err=cudaSetDevice(card);
+  checkCudaError(err,__FILE__,__LINE__);
+
+  /* allocate device mem */
+  float *modesd; /* shapelet modes 4*n0*n0 complex (float) values */
+  double *cohd; /* coherencies 8x1 */
+  float *factd;
+
+  err=cudaMalloc((void**) &cohd, 8*sizeof(double));
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaMalloc((void**) &factd, t->modes_n0*sizeof(float));
+  checkCudaError(err,__FILE__,__LINE__);
+
+  complex double coh[4];
+  int cm=t->cid;
+  int sid=t->sid;
+  double freq0=t->freq0;
+
+  double fdelta2=t->fdelta*0.5;
+
+  /* set up factorial array */
+  float *fact;
+  if ((fact=(float *)calloc((size_t)(t->modes_n0),sizeof(float)))==0) {
+    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+    exit(1);
+  }
+  fact[0]=1.0f;
+  for (int ci=1; ci<(t->modes_n0); ci++) {
+    fact[ci]=((float)ci)*fact[ci-1];
+  }
+  err=cudaMemcpy(factd, fact, t->modes_n0*sizeof(float), cudaMemcpyHostToDevice);
+  checkCudaError(err,__FILE__,__LINE__);
+  free(fact);
+
+  /* we only predict for cluster id cm, and for only one source sid,
+   * so source flux and l,m,n coords are scalars */
+  /* we copy u,v,w,l,m,n values to GPU and perform calculation per-baseline,
+   * CUDA threads parallelize over the modes : n0xn0 ~ large value */
+  for (int ci=0; ci<t->Nb; ci++) {
+    int stat1=t->barr[ci+t->boff].sta1;
+    int stat2=t->barr[ci+t->boff].sta2;
+
+    double Gn=2.0*M_PI*(t->u[ci]*t->carr[cm].ll[sid]+t->v[ci]*t->carr[cm].mm[sid]+t->w[ci]*t->carr[cm].nn[sid]);
+    double sin_n,cos_n;
+    sincos(freq0*Gn,&sin_n,&cos_n);
+    /* freq smearing */
+    if (Gn!=0.0) {
+      double smfac=Gn*fdelta2;
+      Gn =fabs(sin(smfac)/smfac);
+    } else {
+      Gn =1.0;
+    }
+    /* multiply (re, im) phase term with smear factor */
+    sin_n*=Gn;
+    cos_n*=Gn;
+
+    /* shapelet contribution */
+    /* modes: n0*n0*4 values &Jp_C_Jq[4*n0*n0*(stat1*t->N+stat2)] */
+    complex double *modes=&(t->modes[4*t->modes_n0*t->modes_n0*(stat1*t->N+stat2)]);
+    //shapelet_contrib_vector(modes,t->modes_n0,t->modes_beta,t->u[ci]*freq0,t->v[ci]*freq0,t->w[ci]*freq0,coh);
+
+    dtofcopy(8*t->modes_n0*t->modes_n0,&modesd,(double*)modes);
+    cudakernel_calculate_shapelet_coherencies((float)t->u[ci]*freq0,(float)t->v[ci]*freq0,modesd,factd,t->modes_n0,(float)t->modes_beta,cohd);
+    err=cudaFree(modesd);
+    checkCudaError(err,__FILE__,__LINE__);
+
+    err=cudaMemcpy((double*)coh, cohd, sizeof(double)*8, cudaMemcpyDeviceToHost);
+    checkCudaError(err,__FILE__,__LINE__);
+
+    complex double phterm=cos_n+_Complex_I*sin_n;
+
+    coh[0]*=phterm;
+    coh[1]*=phterm;
+    coh[2]*=phterm;
+    coh[3]*=phterm;
+
+    /* add or replace coherencies for this cluster */
+    if (t->sid==0) {
+      /* first source will replace, resetting the accumulation to start from 0 */
+      t->coh[4*t->M*ci+4*t->cid]=coh[0];
+      t->coh[4*t->M*ci+4*t->cid+1]=coh[1];
+      t->coh[4*t->M*ci+4*t->cid+2]=coh[2];
+      t->coh[4*t->M*ci+4*t->cid+3]=coh[3];
+    } else {
+      t->coh[4*t->M*ci+4*t->cid]+=coh[0];
+      t->coh[4*t->M*ci+4*t->cid+1]+=coh[1];
+      t->coh[4*t->M*ci+4*t->cid+2]+=coh[2];
+      t->coh[4*t->M*ci+4*t->cid+3]+=coh[3];
+    }
+  }
+
+  cudaDeviceSynchronize();
+
+  err=cudaFree(cohd);
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaFree(factd);
+  checkCudaError(err,__FILE__,__LINE__);
+
+  /* reset error state */
+  err=cudaGetLastError();
+
+  return NULL;
+}
+#endif /* HAVE_CUDA */
+/*****************************************************************************/
 static void *
 shapelet_prod_one_threadfn(void *data) {
   thread_data_stat_t *t=(thread_data_stat_t*)data;
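For reference, the per-baseline factor applied in ```shapelet_pred_threadfn_cuda``` above is the usual bandwidth-smearing attenuation: averaging the fringe over a channel of width ```fdelta``` gives |sin(Gn*fdelta/2)/(Gn*fdelta/2)|, which is exactly what the ```smfac``` code computes. The same factor as a standalone sketch:

```
#include <math.h>

/* Channel-averaging attenuation for fringe rate Gn=2*pi*(u*l+v*m+w*n)
 * and channel width fdelta (same formula as the loop above). */
static double
bandwidth_smear(double Gn, double fdelta) {
  if (Gn==0.0) { return 1.0; }
  double smfac=Gn*fdelta*0.5;
  return fabs(sin(smfac)/smfac);
}
```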
@@ -124,6 +263,8 @@ shapelet_prod_one_threadfn(void *data) {
 
   for (int stat=0; stat<t->Nstat; stat++) {
     //shapelet_product_jones(sp->n0,sp->n0,sh_n0,sp->beta,sp->beta,sh_beta,&C_Jq[4*sp->n0*sp->n0*stat],s_coh,&Zt[4*G*stat],Cf,1);
     shapelet_product_jones(t->sL,t->sM,t->sN,t->alpha,t->beta,t->gamma,&(t->h_arr[4*t->sL*t->sL*(stat+t->off)]),t->f_arr,&(t->g_arr[4*t->sN*t->sN*(stat+t->off)]),t->Cf,t->hermitian);
+    // For debugging, C_Jq <= s_coh
+    //memcpy(&(t->h_arr[4*t->sL*t->sL*(stat+t->off)]),t->f_arr,t->sL*t->sL*4*sizeof(complex double));
   }
 
   return NULL;
@@ -139,6 +280,8 @@ shapelet_prod_two_threadfn(void *data) {
     for (int stat2=stat1+t->off; stat2<t->N; stat2++) {
       //shapelet_product_jones(sp->n0,sh_n0,sp->n0,sp->beta,sh_beta,sp->beta,&Jp_C_Jq[4*sp->n0*sp->n0*(stat1*N+stat2)],&Zt[4*G*stat1],&C_Jq[4*sp->n0*sp->n0*stat2],Cf,0);
       shapelet_product_jones(t->sL,t->sM,t->sN,t->alpha,t->beta,t->gamma,&t->h_arr[4*t->sL*t->sL*((stat1+t->off)*t->N+stat2)],&t->f_arr[4*t->sM*t->sM*(stat1+t->off)],&t->g_arr[4*t->sN*t->sN*stat2],t->Cf,t->hermitian);
+      // For debugging, Jp_C_Jq <= C_Jq
+      //memcpy(&t->h_arr[4*t->sL*t->sL*((stat1+t->off)*t->N+stat2)],&t->g_arr[4*t->sN*t->sN*stat2],t->sL*t->sL*4*sizeof(complex double));
     }
   }
   return NULL;
@@ -150,8 +293,7 @@ shapelet_prod_two_threadfn(void *data) {
  */
 int
 recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double *x, int N,
-   int Nbase, baseline_t *barr, clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, int cid, int sh_n0, double sh_beta, complex double *Z, int Nt) {
-
+   int Nbase, baseline_t *barr, clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, int cid, int sh_n0, double sh_beta, complex double *Z, int Nt, int use_cuda) {
   /* thread setup - divide baselines */
   int Nthb0,Nthb;
@@ -161,6 +303,13 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
   Nthb0=(Nbase+Nt-1)/Nt;
   pthread_attr_init(&attr);
   pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
+
+
+#ifdef HAVE_CUDA
+  taskhist thst;
+  init_task_hist(&thst);
+#endif /* HAVE_CUDA */
+
   if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
     fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
     exit(1);
   }
@@ -249,6 +398,9 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
 
     /* get shapelet info */
     exinfo_shapelet *sp=(exinfo_shapelet*) carr[cid].ex[ci];
+    /* since beta -> beta*2*pi in the model FT, divide back
+     * to get the scale in image space */
+    double beta_img=sp->beta/(2.0*M_PI);
 
     /* create tensor : product out (sp->n0,sp->beta),
      * product in (sp->n0,sp->beta) (sh_n0,sh_beta) */
@@ -256,7 +408,7 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
       fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
       exit(1);
     }
-    shapelet_product_tensor(sp->n0,sp->n0,sh_n0,sp->beta,sp->beta,sh_beta,Cf);
+    shapelet_product_tensor(sp->n0,sp->n0,sh_n0,beta_img,beta_img,sh_beta,Cf);
 
     /* allocate memory to store the product C J_q^H,
      * n0*n0*2x2 for each station */
@@ -305,8 +457,8 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
     threaddata_stat[nth1].sL=sp->n0;
     threaddata_stat[nth1].sM=sp->n0;
     threaddata_stat[nth1].sN=sh_n0;
-    threaddata_stat[nth1].alpha=sp->beta;
-    threaddata_stat[nth1].beta=sp->beta;
+    threaddata_stat[nth1].alpha=beta_img;
+    threaddata_stat[nth1].beta=beta_img;
     threaddata_stat[nth1].gamma=sh_beta;
 
     threaddata_stat[nth1].h_arr=C_Jq;
@@ -332,7 +484,7 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
       fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
       exit(1);
     }
-    shapelet_product_tensor(sp->n0,sh_n0,sp->n0,sp->beta,sh_beta,sp->beta,Cf);
+    shapelet_product_tensor(sp->n0,sh_n0,sp->n0,beta_img,sh_beta,beta_img,Cf);
 
     complex double *Jp_C_Jq=0;
     if ((Jp_C_Jq=(complex double*)calloc((size_t)(2*N*N*2*sp->n0*sp->n0),sizeof(complex double)))==0) {
@@ -355,9 +507,9 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
     threaddata_stat[nth1].sL=sp->n0;
     threaddata_stat[nth1].sM=sh_n0;
     threaddata_stat[nth1].sN=sp->n0;
-    threaddata_stat[nth1].alpha=sp->beta;
+    threaddata_stat[nth1].alpha=beta_img;
     threaddata_stat[nth1].beta=sh_beta;
-    threaddata_stat[nth1].gamma=sp->beta;
+    threaddata_stat[nth1].gamma=beta_img;
 
     threaddata_stat[nth1].h_arr=Jp_C_Jq;
     threaddata_stat[nth1].f_arr=Zt;
@@ -376,7 +528,9 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
     free(Cf);
     free(C_Jq);
 
-    /* predict visibilities */
+    /* predict visibilities - use threads only for the CPU prediction;
+     * otherwise, do sequential (GPU) prediction */
+#ifndef HAVE_CUDA
     for (int nth1=0; nth1<[...]
[...]
+      threaddata[nth1].modes_n0=sp->n0;
+      threaddata[nth1].modes_beta=sp->beta;
+      pthread_create(&th_array[nth1],&attr,shapelet_pred_threadfn,(void*)(&threaddata[nth1]));
+    }
+    for (int nth1=0; nth1<[...]
[...]
+      threaddata[nth1].modes_n0=sp->n0;
+      threaddata[nth1].modes_beta=sp->beta;
+      threaddata[nth1].hst=&thst;
+      threaddata[nth1].tid=nth1;
+      shapelet_pred_threadfn_cuda((void*)&threaddata[nth1]);
+      }
+    }
+#endif /* HAVE_CUDA */
 
     free(Jp_C_Jq);
   }
+#ifdef HAVE_CUDA
+  destroy_task_hist(&thst);
+#endif
+
   pthread_attr_destroy(&attr);
 
   free(th_array);
   free(threaddata);
   free(threaddata_stat);
   free(Zt);
-  return 0;
+
+  return 0;
 }
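A hypothetical call site for the extended signature (argument names follow the header declaration; all surrounding setup is elided). Passing 0 as the last argument keeps the threaded CPU path even in a CUDA build:

```
/* Predict diffuse-sky coherencies for cluster `diffuse_cluster`,
 * selecting the GPU path at runtime (illustrative fragment only). */
#ifdef HAVE_CUDA
  int use_cuda=1;
#else
  int use_cuda=0;
#endif
  recalculate_diffuse_coherencies(u,v,w,x,N,Nbase,barr,carr,M,
      freq0,fdelta,tdelta,dec0,uvmin,uvmax,
      diffuse_cluster,sh_n0,sh_beta,Z,Nt,use_cuda);
```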
diff --git a/src/lib/Radio/elementbeam.c b/src/lib/Radio/elementbeam.c
index 11366ed..59aaa69 100644
--- a/src/lib/Radio/elementbeam.c
+++ b/src/lib/Radio/elementbeam.c
@@ -616,7 +616,7 @@ sharmonic_modes(int n0,double *th, double *ph, int Nt, complex double *output) {
   }
   fact[0]=1.0;
   for (l=1; l<=(2*n0-1); l++) {
-    fact[l]=(l)*fact[l-1];
+    fact[l]=((double)l)*fact[l-1];
   }
 
diff --git a/src/lib/Radio/predict_model.cu b/src/lib/Radio/predict_model.cu
index 8b8a79f..40e9e47 100644
--- a/src/lib/Radio/predict_model.cu
+++ b/src/lib/Radio/predict_model.cu
@@ -95,7 +95,7 @@ __device__ float4
 eval_elementcoeff(float r, float theta, int M, float beta, const float2 *pattern_theta, const float2 *pattern_phi, const float *pattern_preamble) {
   float4 eval={0.f,0.f,0.f,0.f};
 
-  float rb=powf(r/beta,2);
+  float rb=powf(r/beta,2.0f);
   float ex=expf(-0.5f*rb);
 
   int idx=0;
@@ -541,77 +541,6 @@ H_e(float x, int n) {
   return Hn;
 }
 
-__device__ void
-calculate_uv_mode_vectors_scalar00(float u, float v, float beta, int n0, float *Av, int *cplx) {
-
-  int xci,zci,Ntot;
-
-  float **shpvl, *fact;
-  int n1,n2,start;
-  float xval;
-  int signval;
-
-  Ntot=2; /* u,v seperately */
-  /* set up factorial array */
-  fact=(float *)malloc((size_t)(n0)*sizeof(float));
-  fact[0]=1.0f;
-  for (xci=1; xci<(n0); xci++) {
-    fact[xci]=((float)xci)*fact[xci-1];
-  }
-
-  /* setup array to store calculated shapelet value */
-  /* need max storage Ntot x n0 */
-  shpvl=(float**)malloc((size_t)(Ntot)*sizeof(float*));
-  for (xci=0; xci<[...]
[...]
@@ [...] @@
+  /* [...]beta) {
+     so check this here and return 0, otherwise spurious nans may result,
+     i.e., predict only for spatial scales l,m > beta * scale_factor ~ beta * 0.01 */
+  if (__fdiv_rz(SMALLEST_SPATIAL_SCALE_FAC,__fsqrt_rz(ut*ut+vt*vt))<beta) { return make_cuDoubleComplex(0.0,0.0); }
 
 /* note: we decompose f(-l,m) so the Fourier transform is F(-u,v)
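The ```SMALLEST_SPATIAL_SCALE_FAC``` guard (its ```#define``` is not visible in this patch) skips the shapelet evaluation when the sampled (u,v) point probes spatial scales smaller than the basis can represent, where the mode recursion would otherwise produce spurious NaNs. A host-side sketch of the same test:

```
#include <math.h>

/* Mirror of the device-side guard: evaluate the shapelet basis only
 * when scale_fac/|(u,v)| exceeds the shapelet scale beta; otherwise
 * the coherency is taken to be 0. */
static int
shapelet_scale_ok(float ut, float vt, float beta, float scale_fac) {
  return (scale_fac/sqrtf(ut*ut+vt*vt) > beta);
}
```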
@@ -1539,6 +1470,139 @@ kernel_convert_time(int T, double *time_utc) {
 }
 
 
+__global__ void
+kernel_fns_shapelet_coh(float u, float v, const float *__restrict__ modes, const float *__restrict__ fact, int n0, float beta, float *J_C_J) {
+  extern __shared__ float Jprod[]; /* 8*threads shared mem per block */
+  /* global thread index : equal to the mode [0,n0*n0-1] */
+  unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
+  int tid = threadIdx.x;
+
+  /* separate mode to n1,n2 */
+  unsigned int n1=n%n0;
+  unsigned int n2=n/n0;
+  float uu=u*beta;
+  float vv=v*beta;
+
+  int val_finite=__fdiv_rz(SMALLEST_SPATIAL_SCALE_FAC,__fsqrt_rz(uu*uu+vv*vv))>beta?1:0;
+  if (val_finite && n<[...]
[...]
+  for (unsigned int s=blockDim.x/2; s>0; s=s/2) {
+    if(tid < s) {
+      Jprod[8*tid]   += Jprod[8*(tid + s)];
+      Jprod[8*tid+1] += Jprod[8*(tid + s)+1];
+      Jprod[8*tid+2] += Jprod[8*(tid + s)+2];
+      Jprod[8*tid+3] += Jprod[8*(tid + s)+3];
+      Jprod[8*tid+4] += Jprod[8*(tid + s)+4];
+      Jprod[8*tid+5] += Jprod[8*(tid + s)+5];
+      Jprod[8*tid+6] += Jprod[8*(tid + s)+6];
+      Jprod[8*tid+7] += Jprod[8*(tid + s)+7];
+    }
+    __syncthreads();
+  }
+
+  /* copy back the sum to the proper location in J_C_J */
+  if(tid==0) {
+    J_C_J[8*blockIdx.x]  =Jprod[0];
+    J_C_J[8*blockIdx.x+1]=Jprod[1];
+    J_C_J[8*blockIdx.x+2]=Jprod[2];
+    J_C_J[8*blockIdx.x+3]=Jprod[3];
+    J_C_J[8*blockIdx.x+4]=Jprod[4];
+    J_C_J[8*blockIdx.x+5]=Jprod[5];
+    J_C_J[8*blockIdx.x+6]=Jprod[6];
+    J_C_J[8*blockIdx.x+7]=Jprod[7];
+  }
+}
+
+
+__global__ void
+plus_reduce(const float *__restrict__ input, int N, int blockDim_2, double *coh) {
+  // Each block loads its 8 elements into shared memory
+  extern __shared__ float x[];
+  int tid = threadIdx.x;
+  int i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i<[...]
[...]
+  int nTotalThreads = blockDim_2; /* next power of two >= blockDim.x */
+  while(nTotalThreads > 1) {
+    int halfPoint = (nTotalThreads >> 1); // divide by two
+    if (tid < halfPoint) {
+      int thread2 = tid + halfPoint;
+      if (thread2 < blockDim.x) { // Skipping the fictitious threads blockDim.x ... blockDim_2-1
+        x[8*tid]   = x[8*tid]+x[8*thread2];
+        x[8*tid+1] = x[8*tid+1]+x[8*thread2+1];
+        x[8*tid+2] = x[8*tid+2]+x[8*thread2+2];
+        x[8*tid+3] = x[8*tid+3]+x[8*thread2+3];
+        x[8*tid+4] = x[8*tid+4]+x[8*thread2+4];
+        x[8*tid+5] = x[8*tid+5]+x[8*thread2+5];
+        x[8*tid+6] = x[8*tid+6]+x[8*thread2+6];
+        x[8*tid+7] = x[8*tid+7]+x[8*thread2+7];
+      }
+    }
+    __syncthreads();
+    nTotalThreads = halfPoint; // Reducing the binary tree size by two
+  }
+
+  /* add back to total */
+  if( tid == 0 ) {
+    coh[0]=(double)x[tid];
+    coh[1]=(double)x[tid+1];
+    coh[2]=(double)x[tid+2];
+    coh[3]=(double)x[tid+3];
+    coh[4]=(double)x[tid+4];
+    coh[5]=(double)x[tid+5];
+    coh[6]=(double)x[tid+6];
+    coh[7]=(double)x[tid+7];
+  }
+}
+
 /* only use extern if calling code is C */
 extern "C"
 {
@@ -1822,4 +1886,57 @@ cudakernel_convert_time(int T, double *time_utc) {
 }
 
+/* need power of 2 for tree reduction to work */
+static int
+NearestPowerOf2 (int n){
+  if (!n) return n; //(0 == 2^0)
+
+  int x = 1;
+  while(x < n) {
+    x <<= 1;
+  }
+  return x;
+}
+
+#define CUDA_DBG
+/* calculate visibilities for the shapelet model at u,v
+   modes: (device memory) n0*n0*(2x2)x2 double, i.e. n0*n0*(2x2) complex double
+   fact: (device memory) n0 factorial array
+   coh: (device memory) 8x1 double, i.e. 4x1 complex double
+*/
+void
+cudakernel_calculate_shapelet_coherencies(float u, float v, float *modes, float *fact, int n0, float beta, double *coh) {
+
+#ifdef CUDA_DBG
+  cudaError_t error;
+  error = cudaGetLastError();
+#endif
+
+  /* split n0*n0 modes into threads */
+  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
+  int BlocksPerGrid=(n0*n0+ThreadsPerBlock-1)/ThreadsPerBlock;
+  /* each thread computes the basis for its mode (n1,n2) and the product
+     of the basis with the modes (2x2 complex); thereafter a summation over
+     each block, with the result written back to global mem */
+
+  /* global mem to store the summation per block */
+  float *J_C_J;
+  cudaMalloc((void**)&J_C_J, 8*sizeof(float)*BlocksPerGrid);
+  cudaMemset(J_C_J, 0, 8*sizeof(float)*BlocksPerGrid);
+  /* shared mem: 8*ThreadsPerBlock */
+  kernel_fns_shapelet_coh<<< BlocksPerGrid, ThreadsPerBlock, 8*sizeof(float)*ThreadsPerBlock >>>(-u, v, modes, fact, n0, beta, J_C_J);
+
+  /* launch 1 block, threads=BlocksPerGrid */
+  plus_reduce<<< 1, BlocksPerGrid, 8*sizeof(float)*BlocksPerGrid >>>(J_C_J, BlocksPerGrid, NearestPowerOf2(BlocksPerGrid), coh);
+
+  cudaFree(J_C_J);
+#ifdef CUDA_DBG
+  error = cudaGetLastError();
+  if(error != cudaSuccess) {
+    // print the CUDA error message and exit
+    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
+    exit(-1);
+  }
+#endif
+}
 } /* extern "C" */
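```plus_reduce``` is launched with ```blockDim_2 = NearestPowerOf2(BlocksPerGrid)``` so the halving loop stays correct when the thread count is not a power of two; lanes in [blockDim.x, blockDim_2) are fictitious and simply skipped. A serial host-side model of that reduction (illustrative, not from the patch):

```
/* Serial model of the padded tree reduction: n values, n_pow2 the next
 * power of two >= n. Reduces x[0..n-1] into x[0]. */
static float
padded_tree_reduce(float *x, int n, int n_pow2) {
  for (int total=n_pow2; total>1; total>>=1) {
    int half=total>>1;
    for (int tid=0; tid<half; tid++) {
      int other=tid+half;
      if (other<n) { x[tid]+=x[other]; } /* skip fictitious lanes */
    }
  }
  return x[0];
}
```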
diff --git a/src/lib/Radio/predict_withbeam_cuda.c b/src/lib/Radio/predict_withbeam_cuda.c
index e56a445..c5d6151 100644
--- a/src/lib/Radio/predict_withbeam_cuda.c
+++ b/src/lib/Radio/predict_withbeam_cuda.c
@@ -135,7 +135,7 @@ typedef struct thread_data_corr_t_ {
 
 /* copy Nx1 double array x to device as float
    first allocate device memory */
-static void
+void
 dtofcopy(int N, float **x_d, double *x) {
   float *xhost=0;
   cudaError_t err;
@@ -166,7 +166,6 @@ dtofcopy(int N, float **x_d, double *x) {
   checkCudaError(err,__FILE__,__LINE__);
 
   *x_d=xc;
-
 }
 
 static void *
diff --git a/src/lib/Radio/shapelet.c b/src/lib/Radio/shapelet.c
index 1d79f8d..dc6a858 100644
--- a/src/lib/Radio/shapelet.c
+++ b/src/lib/Radio/shapelet.c
@@ -63,7 +63,7 @@ calculate_uv_mode_vectors_scalar(double u, double v, double beta, int n0, double
   }
   fact[0]=1.0;
   for (xci=1; xci<(n0); xci++) {
-    fact[xci]=(xci)*fact[xci-1];
+    fact[xci]=((double)xci)*fact[xci-1];
   }
 
   /* setup array to store calculated shapelet value */
@@ -85,13 +85,13 @@ calculate_uv_mode_vectors_scalar(double u, double v, double beta, int n0, double
   xval=u*beta;
   double expval=exp(-0.5*xval*xval);
   for (xci=0; xci
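All three factorial tables touched by this patch use the same iterative recurrence, now with an explicit cast of the integer index. Such tables grow quickly: a float table overflows above 34! and a double table above 170!, so the shapelet mode count n0 must stay modest. Purely as an illustration (not part of the patch), a log-domain table avoids that limit:

```
#include <math.h>

/* Illustrative alternative: log-domain factorial via lgamma(), finite
 * even where n! itself would overflow. */
static double
log_factorial(int n) {
  return lgamma((double)n+1.0); /* log(n!) */
}
```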