diff --git a/README.md b/README.md
index 908c347..bec8d06 100644
--- a/README.md
+++ b/README.md
@@ -200,6 +200,9 @@ Spatial regularization (with distributed multi-directional calibration) enables
 
 After each solution, images showing the spatial model (amplitude) will be created as ```.PPM``` files.
 
+#### 5a) Diffuse sky models in calibration
+When a spatial model is enabled, the spatial model can also be applied to a model of the diffuse sky background. The diffuse sky model can have any number of shapelet components, but they should all belong to a single cluster (say cluster *45*). The ```-D``` option enables the use of the diffuse sky model together with the spatial model, e.g. ```-D 45,0.1```, where *0.1* is the regularization factor used when applying the spatial model. The larger this regularization factor is, the more strongly the spatial model is enforced onto the diffuse sky model.
+
 ### 6) Solution format
 All SAGECal solutions are stored as text files. Lines starting with '#' are comments. The first non-comment line includes some general information, i.e.
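A minimal illustrative invocation for the new ```-D``` option (hypothetical command line; every other calibration option is elided, and the diffuse shapelet components are assumed to be grouped as cluster *45* in the cluster file):

```
sagecal-mpi ... -D 45,0.1
```

Raising the second value (e.g. ```-D 45,1.0```) enforces the spatial model more strongly on the diffuse sky model.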
diff --git a/src/MPI/main.cpp b/src/MPI/main.cpp
index 218bfd2..eb153b5 100644
--- a/src/MPI/main.cpp
+++ b/src/MPI/main.cpp
@@ -34,7 +34,7 @@ using namespace Data;
 void
 print_copyright(void) {
-  cout<<"SAGECal-MPI 0.8.2 (C) 2011-2024 Sarod Yatawatta"<<endl;
[...]
+      vector<...> debug_vec(mymscount);
+      if (Data::spatialreg && sp_diffuse_id>=0) {
+        for(int cm=0; cm<[...]
[...]
       vector<...> p_vec(mymscount);
       vector<...> pm_vec(mymscount);
@@ -657,7 +671,28 @@ cout<[...]
 [...]>=Nadmm-Data::admm_cadence) {
+        for (int nb=0; nb<[...]
[...]
diff --git [...]
@@ [...] @@
 #include "Dirac.h"
+#define FISTA_L_MIN 1e2
+#define FISTA_L_MAX 1e7
 
 /*
  * Z = arg min \| Z_k - Z Phi_k \|^2 + \lambda \|Z\|^2 + \mu \|Z\|_1
@@ -42,6 +44,11 @@ update_spatialreg_fista(complex double *Z, complex double *Zbar, complex double
   complex double *Zold,*Y;
   /* Lipschitz constant of gradient, use ||Phikk||^2 as estimate */
   double L=my_cdot(2*G*2*G,Phikk,Phikk);
+  /* if 1/L too large, might diverge, so catch it */
+  if (L<FISTA_L_MIN) { L=FISTA_L_MIN; }
+  if (L>FISTA_L_MAX) { L=FISTA_L_MAX; }
+
+  /* initial t */
   double t=1.0;
   if ((gradf=(complex double*)calloc((size_t)2*Npoly*N*2*G,sizeof(complex double)))==0) {
@@ -75,7 +82,7 @@ update_spatialreg_fista(complex double *Z, complex double *Zbar, complex double
     /* take gradient descent step Y - 1/L gradf */
     my_caxpy(2*Npoly*N*2*G, gradf, -1.0/L, Y);
     /* soft threshold and update Z */
-    double thresh=t*mu;
+    double thresh=mu/L;
     for (int ci=0; ci<2*Npoly*N*2*G; ci++) {
       double r=creal(Y[ci]);
       double r1=fabs(r)-thresh;
@@ -90,12 +97,13 @@ update_spatialreg_fista(complex double *Z, complex double *Zbar, complex double
     }
     double t0=t;
     t=(1.0+sqrt(1.0+4.0*t*t))*0.5;
-    /* update Y = Z + (t-1)/told (Z-Zold) = (1+(t-1)/told) Z - (t-1)/told Zold */
+    /* Zold <= Zold-Z */
+    my_caxpy(2*Npoly*N*2*G, Z, -1.0, Zold);
+    printf("FISTA %d ||grad||=%lf ||Z-Zold||=%lf\n",it,my_dnrm2(2*2*Npoly*N*2*G,(double*)gradf),my_dnrm2(2*2*Npoly*N*2*G,(double*)Zold)/my_dnrm2(2*2*Npoly*N*2*G,(double*)Z));
+    /* update Y = Z + (t0-1)/t (Z-Zold) */
     memcpy(Y,Z,2*Npoly*N*2*G*sizeof(complex double));
-    double scalefac=(t-1.0)/t0;
-    my_cscal(2*Npoly*N*2*G,1.0+scalefac,Y);
+    double scalefac=(t0-1.0)/t;
     my_caxpy(2*Npoly*N*2*G, Zold, -scalefac, Y);
-    //printf("%lf %lf %lf %lf %lf\n",t,creal(Y[10]),cimag(Y[10]),creal(Z[10]),cimag(Z[10]));
   }
   free(gradf);
@@ -130,6 +138,11 @@ update_spatialreg_fista_with_diffconstraint(complex double *Z, complex double *Z
   complex double *Zold,*Y;
   /* Lipschitz constant of gradient, use ||Phikk||^2 as estimate */
   double L=my_cdot(2*G*2*G,Phikk,Phikk);
+  /* if 1/L too large, might diverge, so catch it */
+  if (L<FISTA_L_MIN) { L=FISTA_L_MIN; }
+  if (L>FISTA_L_MAX) { L=FISTA_L_MAX; }
+
+  /* initial t */
   double t=1.0;
   if ((gradf=(complex double*)calloc((size_t)2*Npoly*N*2*G,sizeof(complex double)))==0) {
@@ -173,7 +186,7 @@ update_spatialreg_fista_with_diffconstraint(complex double *Z, complex double *Z
     /* take gradient descent step Y - 1/L gradf */
     my_caxpy(2*Npoly*N*2*G, gradf, -1.0/L, Y);
     /* soft threshold and update Z */
-    double thresh=t*mu;
+    double thresh=mu/L;
     for (int ci=0; ci<2*Npoly*N*2*G; ci++) {
       double r=creal(Y[ci]);
       double r1=fabs(r)-thresh;
@@ -188,12 +201,13 @@ update_spatialreg_fista_with_diffconstraint(complex double *Z, complex double *Z
     }
     double t0=t;
     t=(1.0+sqrt(1.0+4.0*t*t))*0.5;
-    /* update Y = Z + (t-1)/told (Z-Zold) = (1+(t-1)/told) Z - (t-1)/told Zold */
+    /* Zold <= Zold-Z */
+    my_caxpy(2*Npoly*N*2*G, Z, -1.0, Zold);
+    printf("FISTA %d ||grad||=%lf ||Z-Zold||=%lf\n",it,my_dnrm2(2*2*Npoly*N*2*G,(double*)gradf),my_dnrm2(2*2*Npoly*N*2*G,(double*)Zold)/my_dnrm2(2*2*Npoly*N*2*G,(double*)Z));
+    /* update Y = Z + (t0-1)/t (Z-Zold) */
     memcpy(Y,Z,2*Npoly*N*2*G*sizeof(complex double));
-    double scalefac=(t-1.0)/t0;
-    my_cscal(2*Npoly*N*2*G,1.0+scalefac,Y);
+    double scalefac=(t0-1.0)/t;
     my_caxpy(2*Npoly*N*2*G, Zold, -scalefac, Y);
-    //printf("%lf %lf %lf %lf %lf\n",t,creal(Y[10]),cimag(Y[10]),creal(Z[10]),cimag(Z[10]));
   }
   free(gradf);
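The ```thresh=mu/L``` change above matches the standard FISTA proximal step: for min f(Z) + mu*||Z||_1 with a gradient step of length 1/L, the proximal operator is soft-thresholding at level mu/L, whereas the old ```t*mu``` tied the threshold to the momentum parameter t, which grows with the iteration count. A minimal scalar sketch of one such step (illustrative only; the patch applies it to the real and imaginary parts of Z separately):

```
#include <math.h>

/* One proximal-gradient step for min f(z) + mu*|z|, scalar case.
 * L is the (clamped) Lipschitz constant of grad f, as in the patch. */
static double
fista_prox_step(double y, double gradf, double mu, double L) {
  double z=y-gradf/L;        /* gradient descent step */
  double thresh=mu/L;        /* soft-threshold level */
  double r=fabs(z)-thresh;
  return (r>0.0 ? r : 0.0)*(z>=0.0 ? 1.0 : -1.0);
}
```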
diff --git a/src/lib/Dirac/myblas.c b/src/lib/Dirac/myblas.c
index 38603cd..ec81fe2 100644
--- a/src/lib/Dirac/myblas.c
+++ b/src/lib/Dirac/myblas.c
@@ -61,6 +61,11 @@ __attribute__ ((target(MIC)))
   dscal_(&N,&a,x,&i);
 }
 void
+my_dscal_inc(int N, double a, double *x, int inc) {
+  extern void dscal_(int *N, double *alpha, double *x, int *incx);
+  dscal_(&N,&a,x,&inc);
+}
+void
 my_sscal(int N, float a, float *x) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
@@ -91,6 +96,14 @@ __attribute__ ((target(MIC)))
   int i=1;
   return(dnrm2_(&N,x,&i));
 }
+double
+my_dnrm2_inc(int N, double *x, int inc) {
+#ifdef USE_MIC
+__attribute__ ((target(MIC)))
+#endif
+  extern double dnrm2_(int *N, double *x, int *incx);
+  return(dnrm2_(&N,x,&inc));
+}
 float
 my_fnrm2(int N, float *x) {
 #ifdef USE_MIC
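The new ```_inc``` wrappers expose the BLAS stride argument that ```my_dscal```/```my_dnrm2``` hard-code to 1. One natural use (an assumption, not shown in this patch) is addressing only the real or only the imaginary parts of an interleaved complex array:

```
#include <complex.h>

/* Declarations as added by this patch (normally pulled in via the
 * library header). */
extern void my_dscal_inc(int N, double a, double *x, int inc);
extern double my_dnrm2_inc(int N, double *x, int inc);

/* Stride 2 over the interleaved (re,im) layout selects real parts;
 * starting at x+1 selects imaginary parts. */
void
scale_imag_by_real_norm(complex double *z, int n) {
  double nrm_re=my_dnrm2_inc(n,(double *)z,2);
  if (nrm_re>0.0) {
    my_dscal_inc(n,1.0/nrm_re,(double *)z+1,2);
  }
}
```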
diff --git a/src/lib/Radio/Dirac_radio.h b/src/lib/Radio/Dirac_radio.h
index 36433cf..92002a2 100644
--- a/src/lib/Radio/Dirac_radio.h
+++ b/src/lib/Radio/Dirac_radio.h
@@ -223,9 +223,10 @@ precalculate_coherencies_multifreq(double *u, double *v, double *w, complex doub
 
 /****************************** diffuse_predict.c ****************************/
+/* use_cuda: if 1, use the GPU version, else only the CPU version */
 extern int
 recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double *x, int N,
-   int Nbase, baseline_t *barr, clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, int diffuse_cluster, int sh_n0, double sh_beta, complex double *Z, int Nt);
+   int Nbase, baseline_t *barr, clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, int diffuse_cluster, int sh_n0, double sh_beta, complex double *Z, int Nt, int use_cuda);
 
 /****************************** transforms.c ****************************/
 #ifndef ASEC2RAD
 #define ASEC2RAD 4.848136811095359935899141e-6
@@ -477,6 +478,11 @@ precess_source_locations_deprecated(double jd_tdb, clus_source_t *carr, int M, d
 
 /****************************** predict_withbeam_cuda.c ****************************/
 #ifdef HAVE_CUDA
+/* copy Nx1 double array x to device as float,
+   first allocate device memory (needs to be freed later) */
+extern void
+dtofcopy(int N, float **x_d, double *x);
+
 /* if dobeam==0, beam calculation is off
    else, flag to determine if full (element+array), array only,
    or element only beam is calculated */
@@ -544,6 +550,9 @@ cudakernel_correct_residuals(int B, int N, int Nb, int boff, int F, int nchunk,
 
 extern void
 cudakernel_convert_time(int T, double *time_utc);
+
+extern void
+cudakernel_calculate_shapelet_coherencies(float u, float v, float *modes, float *fact, int n0, float beta, double *coh);
 #endif /* !HAVE_CUDA */
diff --git a/src/lib/Radio/diffuse_predict.c b/src/lib/Radio/diffuse_predict.c
index ec11ad2..95ad9a4 100644
--- a/src/lib/Radio/diffuse_predict.c
+++ b/src/lib/Radio/diffuse_predict.c
@@ -24,6 +24,9 @@
 
 #include "Dirac.h"
 #include "Dirac_radio.h"
+#ifdef HAVE_CUDA
+#include "Dirac_GPUtune.h"
+#endif
 #include <[...]>
 
 typedef struct thread_data_shap_ {
@@ -45,6 +48,11 @@ typedef struct thread_data_shap_ {
   complex double *modes; /* all modes, 4*n0*n0*stations^2 (only half is calculated) */
   int modes_n0; /* basis n0xn0 */
   double modes_beta; /* scale */
+
+#ifdef HAVE_CUDA
+  int tid; /* this thread id */
+  taskhist *hst; /* for load balancing GPUs */
+#endif /* HAVE_CUDA */
 } thread_data_shap_t;
 
@@ -117,6 +125,137 @@ shapelet_pred_threadfn(void *data) {
 
   return NULL;
 }
 
+/*****************************************************************************/
+#ifdef HAVE_CUDA
+#define CUDA_DEBUG
+static void
+checkCudaError(cudaError_t err, char *file, int line)
+{
+#ifdef CUDA_DEBUG
+  if(!err)
+    return;
+  fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
+  exit(EXIT_FAILURE);
+#endif
+}
+
+static void *
+shapelet_pred_threadfn_cuda(void *data) {
+  thread_data_shap_t *t=(thread_data_shap_t*)data;
+
+  /* kernel will spawn threads to cover baselines t->Nb,
+   * which can be lower than the total baselines */
+  int card;
+  card=select_work_gpu(MAX_GPU_ID,t->hst);
+
+  cudaError_t err;
+  err=cudaSetDevice(card);
+  checkCudaError(err,__FILE__,__LINE__);
+
+  /* allocate device mem */
+  float *modesd; /* shapelet modes 4*n0*n0 complex (float) values */
+  double *cohd; /* coherencies 8x1 */
+  float *factd;
+
+  err=cudaMalloc((void**) &cohd, 8*sizeof(double));
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaMalloc((void**) &factd, t->modes_n0*sizeof(float));
+  checkCudaError(err,__FILE__,__LINE__);
+
+  complex double coh[4];
+  int cm=t->cid;
+  int sid=t->sid;
+  double freq0=t->freq0;
+
+  double fdelta2=t->fdelta*0.5;
+
+  /* set up factorial array */
+  float *fact;
+  if ((fact=(float *)calloc((size_t)(t->modes_n0),sizeof(float)))==0) {
+    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+    exit(1);
+  }
+  fact[0]=1.0f;
+  for (int ci=1; ci<(t->modes_n0); ci++) {
+    fact[ci]=((float)ci)*fact[ci-1];
+  }
+  err=cudaMemcpy(factd, fact, t->modes_n0*sizeof(float), cudaMemcpyHostToDevice);
+  checkCudaError(err,__FILE__,__LINE__);
+  free(fact);
+
+  /* we only predict for cluster id cm, and for only one source sid,
+   * so source flux and l,m,n coords are scalars */
+  /* we copy u,v,w,l,m,n values to GPU and perform calculation per-baseline,
+   * CUDA threads parallelize over the modes : n0xn0 ~ large value */
+  for (int ci=0; ci<t->Nb; ci++) {
+    int stat1=t->barr[ci+t->boff].sta1;
+    int stat2=t->barr[ci+t->boff].sta2;
+
+    double Gn=2.0*M_PI*(t->u[ci]*t->carr[cm].ll[sid]+t->v[ci]*t->carr[cm].mm[sid]+t->w[ci]*t->carr[cm].nn[sid]);
+    double sin_n,cos_n;
+    sincos(freq0*Gn,&sin_n,&cos_n);
+    /* freq smearing */
+    if (Gn!=0.0) {
+      double smfac=Gn*fdelta2;
+      Gn =fabs(sin(smfac)/smfac);
+    } else {
+      Gn =1.0;
+    }
+    /* multiply (re, im) phase term with smear factor */
+    sin_n*=Gn;
+    cos_n*=Gn;
+
+    /* shapelet contribution */
+    /* modes: n0*n0*4 values &Jp_C_Jq[4*n0*n0*(stat1*t->N+stat2)] */
+    complex double *modes=&(t->modes[4*t->modes_n0*t->modes_n0*(stat1*t->N+stat2)]);
+    //shapelet_contrib_vector(modes,t->modes_n0,t->modes_beta,t->u[ci]*freq0,t->v[ci]*freq0,t->w[ci]*freq0,coh);
+
+    dtofcopy(8*t->modes_n0*t->modes_n0,&modesd,(double*)modes);
+    cudakernel_calculate_shapelet_coherencies((float)t->u[ci]*freq0,(float)t->v[ci]*freq0,modesd,factd,t->modes_n0,(float)t->modes_beta,cohd);
+    err=cudaFree(modesd);
+    checkCudaError(err,__FILE__,__LINE__);
+
+    err=cudaMemcpy((double*)coh, cohd, sizeof(double)*8, cudaMemcpyDeviceToHost);
+    checkCudaError(err,__FILE__,__LINE__);
+
+    complex double phterm=cos_n+_Complex_I*sin_n;
+
+    coh[0]*=phterm;
+    coh[1]*=phterm;
+    coh[2]*=phterm;
+    coh[3]*=phterm;
+
+    /* add or replace coherencies for this cluster */
+    if (t->sid==0) {
+      /* first source will replace, resetting the accumulation to start from 0 */
+      t->coh[4*t->M*ci+4*t->cid]=coh[0];
+      t->coh[4*t->M*ci+4*t->cid+1]=coh[1];
+      t->coh[4*t->M*ci+4*t->cid+2]=coh[2];
+      t->coh[4*t->M*ci+4*t->cid+3]=coh[3];
+    } else {
+      t->coh[4*t->M*ci+4*t->cid]+=coh[0];
+      t->coh[4*t->M*ci+4*t->cid+1]+=coh[1];
+      t->coh[4*t->M*ci+4*t->cid+2]+=coh[2];
+      t->coh[4*t->M*ci+4*t->cid+3]+=coh[3];
+    }
+  }
+
+  cudaDeviceSynchronize();
+
+  err=cudaFree(cohd);
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaFree(factd);
+  checkCudaError(err,__FILE__,__LINE__);
+
+  /* reset error state */
+  err=cudaGetLastError();
+
+  return NULL;
+}
+#endif /* HAVE_CUDA */
+/*****************************************************************************/
 static void *
 shapelet_prod_one_threadfn(void *data) {
   thread_data_stat_t *t=(thread_data_stat_t*)data;
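For reference, the per-baseline factor applied in ```shapelet_pred_threadfn_cuda``` above is the usual bandwidth-smearing attenuation: averaging the fringe over a channel of width ```fdelta``` gives |sin(Gn*fdelta/2)/(Gn*fdelta/2)|, which is exactly what the ```smfac``` code computes. The same factor as a standalone sketch:

```
#include <math.h>

/* Channel-averaging attenuation for fringe rate Gn=2*pi*(u*l+v*m+w*n)
 * and channel width fdelta (same formula as the loop above). */
static double
bandwidth_smear(double Gn, double fdelta) {
  if (Gn==0.0) { return 1.0; }
  double smfac=Gn*fdelta*0.5;
  return fabs(sin(smfac)/smfac);
}
```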
@@ -124,6 +263,8 @@ shapelet_prod_one_threadfn(void *data) {
 
   for (int stat=0; stat<t->Nstat; stat++) {
     //shapelet_product_jones(sp->n0,sp->n0,sh_n0,sp->beta,sp->beta,sh_beta,&C_Jq[4*sp->n0*sp->n0*stat],s_coh,&Zt[4*G*stat],Cf,1);
     shapelet_product_jones(t->sL,t->sM,t->sN,t->alpha,t->beta,t->gamma,&(t->h_arr[4*t->sL*t->sL*(stat+t->off)]),t->f_arr,&(t->g_arr[4*t->sN*t->sN*(stat+t->off)]),t->Cf,t->hermitian);
+    // For debugging, C_Jq <= s_coh
+    //memcpy(&(t->h_arr[4*t->sL*t->sL*(stat+t->off)]),t->f_arr,t->sL*t->sL*4*sizeof(complex double));
   }
 
   return NULL;
@@ -139,6 +280,8 @@ shapelet_prod_two_threadfn(void *data) {
     for (int stat2=stat1+t->off; stat2<t->N; stat2++) {
       //shapelet_product_jones(sp->n0,sh_n0,sp->n0,sp->beta,sh_beta,sp->beta,&Jp_C_Jq[4*sp->n0*sp->n0*(stat1*N+stat2)],&Zt[4*G*stat1],&C_Jq[4*sp->n0*sp->n0*stat2],Cf,0);
       shapelet_product_jones(t->sL,t->sM,t->sN,t->alpha,t->beta,t->gamma,&t->h_arr[4*t->sL*t->sL*((stat1+t->off)*t->N+stat2)],&t->f_arr[4*t->sM*t->sM*(stat1+t->off)],&t->g_arr[4*t->sN*t->sN*stat2],t->Cf,t->hermitian);
+      // For debugging, Jp_C_Jq <= C_Jq
+      //memcpy(&t->h_arr[4*t->sL*t->sL*((stat1+t->off)*t->N+stat2)],&t->g_arr[4*t->sN*t->sN*stat2],t->sL*t->sL*4*sizeof(complex double));
     }
   }
   return NULL;
@@ -150,8 +293,7 @@ shapelet_prod_two_threadfn(void *data) {
  */
 int
 recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double *x, int N,
-   int Nbase, baseline_t *barr, clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, int cid, int sh_n0, double sh_beta, complex double *Z, int Nt) {
-
+   int Nbase, baseline_t *barr, clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, int cid, int sh_n0, double sh_beta, complex double *Z, int Nt, int use_cuda) {
   /* thread setup - divide baselines */
   int Nthb0,Nthb;
@@ -161,6 +303,13 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
   Nthb0=(Nbase+Nt-1)/Nt;
   pthread_attr_init(&attr);
   pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
+
+
+#ifdef HAVE_CUDA
+  taskhist thst;
+  init_task_hist(&thst);
+#endif /* HAVE_CUDA */
+
   if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
     fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
     exit(1);
   }
@@ -249,6 +398,9 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
 
     /* get shapelet info */
     exinfo_shapelet *sp=(exinfo_shapelet*) carr[cid].ex[ci];
+    /* since beta -> beta*2*pi in the model FT, divide back
+     * to get the scale in image space */
+    double beta_img=sp->beta/(2.0*M_PI);
 
     /* create tensor : product out (sp->n0,sp->beta),
      * product in (sp->n0,sp->beta) (sh_n0,sh_beta) */
@@ -256,7 +408,7 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
       fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
       exit(1);
     }
-    shapelet_product_tensor(sp->n0,sp->n0,sh_n0,sp->beta,sp->beta,sh_beta,Cf);
+    shapelet_product_tensor(sp->n0,sp->n0,sh_n0,beta_img,beta_img,sh_beta,Cf);
 
     /* allocate memory to store the product C J_q^H,
      * n0*n0*2x2 for each station */
@@ -305,8 +457,8 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
     threaddata_stat[nth1].sL=sp->n0;
     threaddata_stat[nth1].sM=sp->n0;
     threaddata_stat[nth1].sN=sh_n0;
-    threaddata_stat[nth1].alpha=sp->beta;
-    threaddata_stat[nth1].beta=sp->beta;
+    threaddata_stat[nth1].alpha=beta_img;
+    threaddata_stat[nth1].beta=beta_img;
     threaddata_stat[nth1].gamma=sh_beta;
 
     threaddata_stat[nth1].h_arr=C_Jq;
@@ -332,7 +484,7 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
       fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
       exit(1);
     }
-    shapelet_product_tensor(sp->n0,sh_n0,sp->n0,sp->beta,sh_beta,sp->beta,Cf);
+    shapelet_product_tensor(sp->n0,sh_n0,sp->n0,beta_img,sh_beta,beta_img,Cf);
 
     complex double *Jp_C_Jq=0;
     if ((Jp_C_Jq=(complex double*)calloc((size_t)(2*N*N*2*sp->n0*sp->n0),sizeof(complex double)))==0) {
@@ -355,9 +507,9 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
     threaddata_stat[nth1].sL=sp->n0;
     threaddata_stat[nth1].sM=sh_n0;
     threaddata_stat[nth1].sN=sp->n0;
-    threaddata_stat[nth1].alpha=sp->beta;
+    threaddata_stat[nth1].alpha=beta_img;
     threaddata_stat[nth1].beta=sh_beta;
-    threaddata_stat[nth1].gamma=sp->beta;
+    threaddata_stat[nth1].gamma=beta_img;
 
     threaddata_stat[nth1].h_arr=Jp_C_Jq;
     threaddata_stat[nth1].f_arr=Zt;
@@ -376,7 +528,9 @@ recalculate_diffuse_coherencies(double *u, double *v, double *w, complex double
     free(Cf);
     free(C_Jq);
 
-    /* predict visibilities */
+    /* predict visibilities - use threads only for the CPU prediction;
+     * otherwise, do sequential (GPU) prediction */
+#ifndef HAVE_CUDA
     for (int nth1=0; nth1<[...]
[...]
+      threaddata[nth1].modes_n0=sp->n0;
+      threaddata[nth1].modes_beta=sp->beta;
+      pthread_create(&th_array[nth1],&attr,shapelet_pred_threadfn,(void*)(&threaddata[nth1]));
+    }
+    for (int nth1=0; nth1<[...]
[...]
+      threaddata[nth1].modes_n0=sp->n0;
+      threaddata[nth1].modes_beta=sp->beta;
+      threaddata[nth1].hst=&thst;
+      threaddata[nth1].tid=nth1;
+      shapelet_pred_threadfn_cuda((void*)&threaddata[nth1]);
+      }
+    }
+#endif /* HAVE_CUDA */
 
     free(Jp_C_Jq);
   }
+#ifdef HAVE_CUDA
+  destroy_task_hist(&thst);
+#endif
+
   pthread_attr_destroy(&attr);
 
   free(th_array);
   free(threaddata);
   free(threaddata_stat);
   free(Zt);
-  return 0;
+
+  return 0;
 }
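A hypothetical call site for the extended signature (argument names follow the header declaration; all surrounding setup is elided). Passing 0 as the last argument keeps the threaded CPU path even in a CUDA build:

```
/* Predict diffuse-sky coherencies for cluster `diffuse_cluster`,
 * selecting the GPU path at runtime (illustrative fragment only). */
#ifdef HAVE_CUDA
  int use_cuda=1;
#else
  int use_cuda=0;
#endif
  recalculate_diffuse_coherencies(u,v,w,x,N,Nbase,barr,carr,M,
      freq0,fdelta,tdelta,dec0,uvmin,uvmax,
      diffuse_cluster,sh_n0,sh_beta,Z,Nt,use_cuda);
```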
diff --git a/src/lib/Radio/elementbeam.c b/src/lib/Radio/elementbeam.c
index 11366ed..59aaa69 100644
--- a/src/lib/Radio/elementbeam.c
+++ b/src/lib/Radio/elementbeam.c
@@ -616,7 +616,7 @@ sharmonic_modes(int n0,double *th, double *ph, int Nt, complex double *output) {
   }
   fact[0]=1.0;
   for (l=1; l<=(2*n0-1); l++) {
-    fact[l]=(l)*fact[l-1];
+    fact[l]=((double)l)*fact[l-1];
   }
 
diff --git a/src/lib/Radio/predict_model.cu b/src/lib/Radio/predict_model.cu
index 8b8a79f..40e9e47 100644
--- a/src/lib/Radio/predict_model.cu
+++ b/src/lib/Radio/predict_model.cu
@@ -95,7 +95,7 @@ __device__ float4
 eval_elementcoeff(float r, float theta, int M, float beta, const float2 *pattern_theta, const float2 *pattern_phi, const float *pattern_preamble) {
   float4 eval={0.f,0.f,0.f,0.f};
 
-  float rb=powf(r/beta,2);
+  float rb=powf(r/beta,2.0f);
   float ex=expf(-0.5f*rb);
 
   int idx=0;
@@ -541,77 +541,6 @@ H_e(float x, int n) {
   return Hn;
 }
 
-__device__ void
-calculate_uv_mode_vectors_scalar00(float u, float v, float beta, int n0, float *Av, int *cplx) {
-
-  int xci,zci,Ntot;
-
-  float **shpvl, *fact;
-  int n1,n2,start;
-  float xval;
-  int signval;
-
-  Ntot=2; /* u,v seperately */
-  /* set up factorial array */
-  fact=(float *)malloc((size_t)(n0)*sizeof(float));
-  fact[0]=1.0f;
-  for (xci=1; xci<(n0); xci++) {
-    fact[xci]=((float)xci)*fact[xci-1];
-  }
-
-  /* setup array to store calculated shapelet value */
-  /* need max storage Ntot x n0 */
-  shpvl=(float**)malloc((size_t)(Ntot)*sizeof(float*));
-  for (xci=0; xci<[...]
[...]
@@ [...] @@
+  /* [...]beta) {
+     so check this here and return 0, otherwise spurious nans may result,
+     i.e., predict only for spatial scales l,m > beta * scale_factor ~ beta * 0.01 */
+  if (__fdiv_rz(SMALLEST_SPATIAL_SCALE_FAC,__fsqrt_rz(ut*ut+vt*vt))<beta) { return make_cuDoubleComplex(0.0,0.0); }
 
 /* note: we decompose f(-l,m) so the Fourier transform is F(-u,v)
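The ```SMALLEST_SPATIAL_SCALE_FAC``` guard (its ```#define``` is not visible in this patch) skips the shapelet evaluation when the sampled (u,v) point probes spatial scales smaller than the basis can represent, where the mode recursion would otherwise produce spurious NaNs. A host-side sketch of the same test:

```
#include <math.h>

/* Mirror of the device-side guard: evaluate the shapelet basis only
 * when scale_fac/|(u,v)| exceeds the shapelet scale beta; otherwise
 * the coherency is taken to be 0. */
static int
shapelet_scale_ok(float ut, float vt, float beta, float scale_fac) {
  return (scale_fac/sqrtf(ut*ut+vt*vt) > beta);
}
```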
@@ -1539,6 +1470,139 @@ kernel_convert_time(int T, double *time_utc) {
 }
 
 
+__global__ void
+kernel_fns_shapelet_coh(float u, float v, const float *__restrict__ modes, const float *__restrict__ fact, int n0, float beta, float *J_C_J) {
+  extern __shared__ float Jprod[]; /* 8*threads shared mem per block */
+  /* global thread index : equal to the mode [0,n0*n0-1] */
+  unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
+  int tid = threadIdx.x;
+
+  /* separate mode to n1,n2 */
+  unsigned int n1=n%n0;
+  unsigned int n2=n/n0;
+  float uu=u*beta;
+  float vv=v*beta;
+
+  int val_finite=__fdiv_rz(SMALLEST_SPATIAL_SCALE_FAC,__fsqrt_rz(uu*uu+vv*vv))>beta?1:0;
+  if (val_finite && n<[...]
[...]
+  for (unsigned int s=blockDim.x/2; s>0; s=s/2) {
+    if(tid < s) {
+      Jprod[8*tid]   += Jprod[8*(tid + s)];
+      Jprod[8*tid+1] += Jprod[8*(tid + s)+1];
+      Jprod[8*tid+2] += Jprod[8*(tid + s)+2];
+      Jprod[8*tid+3] += Jprod[8*(tid + s)+3];
+      Jprod[8*tid+4] += Jprod[8*(tid + s)+4];
+      Jprod[8*tid+5] += Jprod[8*(tid + s)+5];
+      Jprod[8*tid+6] += Jprod[8*(tid + s)+6];
+      Jprod[8*tid+7] += Jprod[8*(tid + s)+7];
+    }
+    __syncthreads();
+  }
+
+  /* copy back the sum to the proper location in J_C_J */
+  if(tid==0) {
+    J_C_J[8*blockIdx.x]  =Jprod[0];
+    J_C_J[8*blockIdx.x+1]=Jprod[1];
+    J_C_J[8*blockIdx.x+2]=Jprod[2];
+    J_C_J[8*blockIdx.x+3]=Jprod[3];
+    J_C_J[8*blockIdx.x+4]=Jprod[4];
+    J_C_J[8*blockIdx.x+5]=Jprod[5];
+    J_C_J[8*blockIdx.x+6]=Jprod[6];
+    J_C_J[8*blockIdx.x+7]=Jprod[7];
+  }
+}
+
+
+__global__ void
+plus_reduce(const float *__restrict__ input, int N, int blockDim_2, double *coh) {
+  // Each block loads its 8 elements into shared memory
+  extern __shared__ float x[];
+  int tid = threadIdx.x;
+  int i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i<[...]
[...]
+  int nTotalThreads = blockDim_2; /* next power of two >= blockDim.x */
+  while(nTotalThreads > 1) {
+    int halfPoint = (nTotalThreads >> 1); // divide by two
+    if (tid < halfPoint) {
+      int thread2 = tid + halfPoint;
+      if (thread2 < blockDim.x) { // Skipping the fictitious threads blockDim.x ... blockDim_2-1
+        x[8*tid]   = x[8*tid]+x[8*thread2];
+        x[8*tid+1] = x[8*tid+1]+x[8*thread2+1];
+        x[8*tid+2] = x[8*tid+2]+x[8*thread2+2];
+        x[8*tid+3] = x[8*tid+3]+x[8*thread2+3];
+        x[8*tid+4] = x[8*tid+4]+x[8*thread2+4];
+        x[8*tid+5] = x[8*tid+5]+x[8*thread2+5];
+        x[8*tid+6] = x[8*tid+6]+x[8*thread2+6];
+        x[8*tid+7] = x[8*tid+7]+x[8*thread2+7];
+      }
+    }
+    __syncthreads();
+    nTotalThreads = halfPoint; // Reducing the binary tree size by two
+  }
+
+  /* add back to total */
+  if( tid == 0 ) {
+    coh[0]=(double)x[tid];
+    coh[1]=(double)x[tid+1];
+    coh[2]=(double)x[tid+2];
+    coh[3]=(double)x[tid+3];
+    coh[4]=(double)x[tid+4];
+    coh[5]=(double)x[tid+5];
+    coh[6]=(double)x[tid+6];
+    coh[7]=(double)x[tid+7];
+  }
+}
+
 /* only use extern if calling code is C */
 extern "C"
 {
@@ -1822,4 +1886,57 @@ cudakernel_convert_time(int T, double *time_utc) {
 }
 
+/* need power of 2 for tree reduction to work */
+static int
+NearestPowerOf2 (int n){
+  if (!n) return n; //(0 == 2^0)
+
+  int x = 1;
+  while(x < n) {
+    x <<= 1;
+  }
+  return x;
+}
+
+#define CUDA_DBG
+/* calculate visibilities for the shapelet model at u,v
+   modes: (device memory) n0*n0*(2x2)x2 double, i.e. n0*n0*(2x2) complex double
+   fact: (device memory) n0 factorial array
+   coh: (device memory) 8x1 double, i.e. 4x1 complex double
+*/
+void
+cudakernel_calculate_shapelet_coherencies(float u, float v, float *modes, float *fact, int n0, float beta, double *coh) {
+
+#ifdef CUDA_DBG
+  cudaError_t error;
+  error = cudaGetLastError();
+#endif
+
+  /* split n0*n0 modes into threads */
+  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
+  int BlocksPerGrid=(n0*n0+ThreadsPerBlock-1)/ThreadsPerBlock;
+  /* each thread computes the basis for its mode (n1,n2) and the product
+     of the basis with the modes (2x2 complex); thereafter a summation over
+     each block, with the result written back to global mem */
+
+  /* global mem to store the summation per block */
+  float *J_C_J;
+  cudaMalloc((void**)&J_C_J, 8*sizeof(float)*BlocksPerGrid);
+  cudaMemset(J_C_J, 0, 8*sizeof(float)*BlocksPerGrid);
+  /* shared mem: 8*ThreadsPerBlock */
+  kernel_fns_shapelet_coh<<< BlocksPerGrid, ThreadsPerBlock, 8*sizeof(float)*ThreadsPerBlock >>>(-u, v, modes, fact, n0, beta, J_C_J);
+
+  /* launch 1 block, threads=BlocksPerGrid */
+  plus_reduce<<< 1, BlocksPerGrid, 8*sizeof(float)*BlocksPerGrid >>>(J_C_J, BlocksPerGrid, NearestPowerOf2(BlocksPerGrid), coh);
+
+  cudaFree(J_C_J);
+#ifdef CUDA_DBG
+  error = cudaGetLastError();
+  if(error != cudaSuccess) {
+    // print the CUDA error message and exit
+    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
+    exit(-1);
+  }
+#endif
+}
 } /* extern "C" */
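```plus_reduce``` is launched with ```blockDim_2 = NearestPowerOf2(BlocksPerGrid)``` so the halving loop stays correct when the thread count is not a power of two; lanes in [blockDim.x, blockDim_2) are fictitious and simply skipped. A serial host-side model of that reduction (illustrative, not from the patch):

```
/* Serial model of the padded tree reduction: n values, n_pow2 the next
 * power of two >= n. Reduces x[0..n-1] into x[0]. */
static float
padded_tree_reduce(float *x, int n, int n_pow2) {
  for (int total=n_pow2; total>1; total>>=1) {
    int half=total>>1;
    for (int tid=0; tid<half; tid++) {
      int other=tid+half;
      if (other<n) { x[tid]+=x[other]; } /* skip fictitious lanes */
    }
  }
  return x[0];
}
```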
diff --git a/src/lib/Radio/predict_withbeam_cuda.c b/src/lib/Radio/predict_withbeam_cuda.c
index e56a445..c5d6151 100644
--- a/src/lib/Radio/predict_withbeam_cuda.c
+++ b/src/lib/Radio/predict_withbeam_cuda.c
@@ -135,7 +135,7 @@ typedef struct thread_data_corr_t_ {
 
 /* copy Nx1 double array x to device as float
    first allocate device memory */
-static void
+void
 dtofcopy(int N, float **x_d, double *x) {
   float *xhost=0;
   cudaError_t err;
@@ -166,7 +166,6 @@ dtofcopy(int N, float **x_d, double *x) {
   checkCudaError(err,__FILE__,__LINE__);
 
   *x_d=xc;
-
 }
 
 static void *
diff --git a/src/lib/Radio/shapelet.c b/src/lib/Radio/shapelet.c
index 1d79f8d..dc6a858 100644
--- a/src/lib/Radio/shapelet.c
+++ b/src/lib/Radio/shapelet.c
@@ -63,7 +63,7 @@ calculate_uv_mode_vectors_scalar(double u, double v, double beta, int n0, double
   }
   fact[0]=1.0;
   for (xci=1; xci<(n0); xci++) {
-    fact[xci]=(xci)*fact[xci-1];
+    fact[xci]=((double)xci)*fact[xci-1];
   }
 
   /* setup array to store calculated shapelet value */
@@ -85,13 +85,13 @@ calculate_uv_mode_vectors_scalar(double u, double v, double beta, int n0, double
   xval=u*beta;
   double expval=exp(-0.5*xval*xval);
   for (xci=0; xci
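All three factorial tables touched by this patch use the same iterative recurrence, now with an explicit cast of the integer index. Such tables grow quickly: a float table overflows above 34! and a double table above 170!, so the shapelet mode count n0 must stay modest. Purely as an illustration (not part of the patch), a log-domain table avoids that limit:

```
#include <math.h>

/* Illustrative alternative: log-domain factorial via lgamma(), finite
 * even where n! itself would overflow. */
static double
log_factorial(int n) {
  return lgamma((double)n+1.0); /* log(n!) */
}
```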