Merge pull request #423 from etmc/DDalphaAMG_nd_merge_etmc_master

merge DDalphaAMG_nd branch into etmc/tmLQCD/master
etmc · May 13, 2019 · 4fbdef8 · 4fbdef8
2 parents 5955fdb + 42356fc
commit 4fbdef8
Show file tree

Hide file tree

Showing 58 changed files with 2,986 additions and 540 deletions.
diff --git a/DDalphaAMG_interface.c b/DDalphaAMG_interface.c
diff --git a/DDalphaAMG_interface.h b/DDalphaAMG_interface.h
@@ -26,16 +26,20 @@
 #include "global.h"
 #include "su3.h"
 #include"solver/matrix_mult_typedef.h"
+#include"solver/matrix_mult_typedef_nd.h"
 
 extern int mg_setup_iter;
 extern int mg_coarse_setup_iter;
 extern int mg_update_setup_iter;
+extern int mg_update_gauge;
 extern int mg_omp_num_threads;
 extern int mg_Nvec;
 extern int mg_lvl;
 extern int mg_blk[4];
 extern int mg_mixed_prec;
 extern int mg_setup_mu_set;
+extern int mg_no_shifts;
+extern double mg_mms_mass;
 extern double mg_setup_mu;
 extern double mg_cmu_factor;
 extern double mg_dtau_update;
@@ -44,6 +48,7 @@ extern double mg_rho_update;
 void MG_init(void);
 void MG_update_gauge(double step);
 void MG_update_mu(double mu_tmLQCD, double odd_tmLQCD);
+void MG_update_mubar_epsbar(double mubar_tmLQCD, double epsbar_tmLQCD, double shift_tmLQCD);
 void MG_reset(void);
 void MG_finalize(void);
 
@@ -56,4 +61,22 @@ int MG_solver_eo(spinor * const Even_new, spinor * const Odd_new,
 		 const double precision, const int max_iter, const int rel_prec,
 		 const int N, su3 **gf, matrix_mult_full f_full);
 
+int MG_solver_nd(spinor * const up_new, spinor * const dn_new,
+		 spinor * const up_old, spinor * const dn_old,
+		 const double precision, const int max_iter, const int rel_prec,
+		 const int N, su3 **gf, matrix_mult_nd f);
+
+int MG_solver_nd_eo(spinor * const Even_new_up, spinor * const Odd_new_up, 
+                    spinor * const Even_new_dn, spinor * const Odd_new_dn,
+                    spinor * const Even_up, spinor * const Odd_up,
+                    spinor * const Even_dn, spinor * const Odd_dn,
+                    const double precision, const int max_iter, const int rel_prec,
+                    const int N, su3 **gf, matrix_mult_full_nd f_full);
+
+int MG_mms_solver_nd(spinor **const up_new, spinor **const dn_new,
+                     spinor * const up_old, spinor * const dn_old,
+                     const double * shifts, const int no_shifts,
+                     const double * precision, const int max_iter, const int rel_prec,
+                     const int N, su3 **gf, matrix_mult_nd f);
+
 #endif /* DDalphaAMG_INTERFACE_H_ */
diff --git a/default_input_values.h b/default_input_values.h
@@ -50,6 +50,7 @@
 #define _default_g_mu1 0.0
 #define _default_g_mu2 0.0
 #define _default_g_mu3 0.0
+#define _default_g_shift 0.0
 #define _default_c_sw -1.0
 #define _default_g_beta 6.0
 #define _default_g_N_s 20

diff --git a/doc/DDalphaAMG.tex b/doc/DDalphaAMG.tex
@@ -88,6 +88,8 @@ \subsubsection{More advanced settings}
 	\item[\texttt{MGdtauUpdate:}] for HMC, $d\tau$ interval after that the setup is updated. If 0, it will be updated every time the configuration is changed.
 	\item[\texttt{MGrhoUpdate:}] for HMC, rho value of the monomial at which the setup have to be updated. It can be combined with \texttt{MGdtauUpdate} or used standalone.
 	\item[\texttt{MGUpdateSetupIter:}] for HMC, number of setup iterations to do on the fine level when the setup has to be updated.
+	\item[\texttt{MGNumberOfShifts:}] for MG in multi-shift systems, number of shifted linear systems, N, to be solved by DDalphaAMG. MG will solve the N smaller shifts.
+	\item[\texttt{MGMMSMass:}] for MG in multi-shift systems, alternative to the previous. MG will solve all the mass-shifts smaller than the given value.
 \end{description}
 \subsubsection{Output analysis\label{sec:DDalphaAMG_output}}
 Running tmLQCD programs with the option \texttt{-v}, the full output of DDalphaAMG is shown. Here some hints on the informations given. Just before the setup, the full set of parameters is printed, with an output similar to the following:

diff --git a/expo.c b/expo.c
@@ -52,48 +52,132 @@
 #include "su3.h"
 #include "su3adj.h"
 #include "expo.h"
+#include "float.h"
+#include "global.h"
 
-void exposu3(su3* const vr, const su3adj* const p) {
-  int i;
-  su3 ALIGN v,v2;
-  double ALIGN fac,r;
-  double ALIGN a,b;
-  _Complex double ALIGN a0,a1,a2,a1p;
+static double imag_det(const su3adj* p) {
+  double d,tos3,o3,os3;
+  tos3=2.0/sqrt(3.0);
+  o3=1.0/3.0;
+  os3=1.0/sqrt(3.0);
+
+  d=tos3*(*p).d8*(o3*(*p).d8*(*p).d8-(*p).d3*(*p).d3)+2*((*p).d2*(*p).d4*(*p).d7-(*p).d1*(*p).d4*(*p).d6-(*p).d2*(*p).d5*(*p).d6-(*p).d1*(*p).d5*(*p).d7);
+  d+=(os3*(*p).d8-(*p).d3)*((*p).d4*(*p).d4+(*p).d5*(*p).d5)+(os3*(*p).d8+(*p).d3)*((*p).d6*(*p).d6+(*p).d7*(*p).d7)-tos3*(*p).d8*((*p).d1*(*p).d1+(*p).d2*(*p).d2);	
+  return d;
+}
+
+static void mul_su3alg(su3adj* p,double d) {
+  (*p).d1*=d;
+  (*p).d2*=d;
+  (*p).d3*=d;
+  (*p).d4*=d;
+  (*p).d5*=d;
+  (*p).d6*=d;
+  (*p).d7*=d;
+  (*p).d8*=d;
+}
 
-  /* it writes 'p=vec(h_{j,mu})' in matrix form 'v' */  
+void init_exposu3() {
+  int k;
+  double fctr = 1.0;
+  g_exposu3_no_c = 0;
+
+  while (fctr>DBL_EPSILON) {
+    g_exposu3_no_c++;
+    fctr/=(double)(g_exposu3_no_c);
+  }
+  g_exposu3_no_c += 7;
+  g_exposu3_no_c += (g_exposu3_no_c%2);
+
+  g_exposu3_c=malloc((g_exposu3_no_c+1)*sizeof(*g_exposu3_c));
+
+  g_exposu3_c[0]=1.0;
+  for (k=0; k < g_exposu3_no_c; k++)
+    g_exposu3_c[k+1]=g_exposu3_c[k]/(double)(k+1);
+}
+
+void exposu3(su3* const vr, const su3adj* const p) {
+  int n,m,mm;
+  su3 ALIGN v,v2,vt;
+  su3adj pa;
+  double ALIGN d,tc;
+  _Complex double t;
+  _Complex double ALIGN p0,p1,p2;
+  _Complex double ALIGN q0,q1,q2;
+
   _make_su3(v,*p);
+  _su3_times_su3(v2,v,v);
+  tc = -2.0*(v2.c00 +v2.c11+v2.c22);
+
+  pa.d1=(*p).d1;
+  pa.d2=(*p).d2;
+  pa.d3=(*p).d3;
+  pa.d4=(*p).d4;
+  pa.d5=(*p).d5;
+  pa.d6=(*p).d6;
+  pa.d7=(*p).d7;
+  pa.d8=(*p).d8;
+
+  mm=0;
+  while (tc>1.0) {
+    mul_su3alg(&pa,0.5);
+    tc*=0.5;
+    mm+=1;
+  }
+
+  /* it writes 'p=vec(h_{j,mu})' in matrix form 'v'  */
+  _make_su3(v,pa);
   /* calculates v^2 */
   _su3_times_su3(v2,v,v);
-  /* */
-  a = 0.5 * (creal(v2.c00) + creal(v2.c11) + creal(v2.c22));
-  /* 1/3 imaginary part of tr v*v2 */
-  b = 0.33333333333333333 * cimag(v.c00 * v2.c00 + v.c01 * v2.c10 + v.c02 * v2.c20 +
-                                  v.c10 * v2.c01 + v.c11 * v2.c11 + v.c12 * v2.c21 +
-                                  v.c20 * v2.c02 + v.c21 * v2.c12 + v.c22 * v2.c22  );
-  a0  = 0.16059043836821615e-9;
-  a1  = 0.11470745597729725e-10;
-  a2  = 0.76471637318198165e-12;
-  fac = 0.20876756987868099e-8;      /*  1/12! */
-  r   = 12.0;
-  for(i = 3; i <= 15; ++i)
-  {
-    a1p = a0 + a * a2;
-    a0 = fac + b * I * a2;
-    a2 = a1;
-    a1 = a1p;
-    fac *= r;
-    r -= 1.0;
+  /* t= -tr(X^2)/2*/
+  t = -0.5*(v2.c00 +v2.c11+v2.c22);
+  /* d= -1i * det(X)*/
+  d=-imag_det(&pa);
+ /*  printf(" d= %.16f and t=%.16f + 1i %.16f \n",d,creal(t),cimag(t));*/
+
+  if(fabs(d)>(1.000001*(1.000002-fabs(t))))
+    printf("The norm of X is larger than 1 and N = %d \n", g_exposu3_no_c);
+
+
+  p0=g_exposu3_c[g_exposu3_no_c];
+  p1=0.0;
+  p2=0.0;
+
+  for (n=(g_exposu3_no_c-1);n>=0;n--) {
+    q0=p0;
+    q1=p1;
+    q2=p2;
+
+    p0=g_exposu3_c[n]-I*d*q2;
+    p1=q0-t*q2;
+    p2=q1;
   }
+
   /* vr = a0 + a1*v + a2*v2 */
-  vr->c00 = a0 + a1 * v.c00 + a2 * v2.c00;
-  vr->c01 =      a1 * v.c01 + a2 * v2.c01;
-  vr->c02 =      a1 * v.c02 + a2 * v2.c02;
-  vr->c10 =      a1 * v.c10 + a2 * v2.c10;
-  vr->c11 = a0 + a1 * v.c11 + a2 * v2.c11;
-  vr->c12 =      a1 * v.c12 + a2 * v2.c12;
-  vr->c20 =      a1 * v.c20 + a2 * v2.c20;
-  vr->c21 =      a1 * v.c21 + a2 * v2.c21;
-  vr->c22 = a0 + a1 * v.c22 + a2 * v2.c22;
+  vt.c00 = p0 + p1 * v.c00 + p2 * v2.c00;
+  vt.c01 =      p1 * v.c01 + p2 * v2.c01;
+  vt.c02 =      p1 * v.c02 + p2 * v2.c02;
+  vt.c10 =      p1 * v.c10 + p2 * v2.c10;
+  vt.c11 = p0 + p1 * v.c11 + p2 * v2.c11;
+  vt.c12 =      p1 * v.c12 + p2 * v2.c12;
+  vt.c20 =      p1 * v.c20 + p2 * v2.c20;
+  vt.c21 =      p1 * v.c21 + p2 * v2.c21;
+  vt.c22 = p0 + p1 * v.c22 + p2 * v2.c22;
+
+  for(m=0;m<mm;m++) {
+    _su3_times_su3(v2,vt,vt);
+    vt=v2;
+  }
+
+  vr->c00=vt.c00;
+  vr->c01=vt.c01; 
+  vr->c02=vt.c02; 
+  vr->c10=vt.c10;
+  vr->c11=vt.c11;
+  vr->c12=vt.c12;
+  vr->c20=vt.c20;
+  vr->c21=vt.c21;
+  vr->c22=vt.c22;
 }
 
 void exposu3_check(su3* const vr, const su3adj* const p, int im) {
@@ -135,6 +219,12 @@ void restoresu3(su3* const vr, const su3* const u) {
   vr->c20 = conj(vr->c01 * vr->c12 - vr->c02 * vr->c11);
   vr->c21 = conj(vr->c02 * vr->c10 - vr->c00 * vr->c12);
   vr->c22 = conj(vr->c00 * vr->c11 - vr->c01 * vr->c10);
+
+  /* compute  row 2 as the conjugate of the cross-product of 3 and 1 */
+  vr->c10 = conj(vr->c21 * vr->c02 - vr->c22 * vr->c01);
+  vr->c11 = conj(vr->c22 * vr->c00 - vr->c20 * vr->c02);
+  vr->c12 = conj(vr->c20 * vr->c01 - vr->c21 * vr->c00);
+
 }
 
 void restoresu3_in_place(su3* const u) {
@@ -156,6 +246,12 @@ void restoresu3_in_place(su3* const u) {
   u->c20 = conj(u->c01 * u->c12 - u->c02 * u->c11);
   u->c21 = conj(u->c02 * u->c10 - u->c00 * u->c12);
   u->c22 = conj(u->c00 * u->c11 - u->c01 * u->c10);
+
+  /* compute  row 2 as the conjugate of the cross-product of 3 and 1 */
+  u->c10 = conj(u->c21 * u->c02 - u->c22 * u->c01);
+  u->c11 = conj(u->c22 * u->c00 - u->c20 * u->c02);
+  u->c12 = conj(u->c20 * u->c01 - u->c21 * u->c00);
+
 }
 
 /* Exponentiates a hermitian 3x3 matrix Q */

diff --git a/expo.h b/expo.h
@@ -19,10 +19,11 @@
 #ifndef _EXPO_H
 #define _EXPO_H
 
-extern void exposu3(su3* const vr, const su3adj* const p);
-extern void exposu3_check(su3* const vr, const su3adj* const p, int im);
-extern void restoresu3(su3* const vr, const su3* const u);
-extern void restoresu3_in_place(su3* const u);
-extern void exposu3_in_place(su3* const u);
+void init_exposu3();
+void exposu3(su3* const vr, const su3adj* const p);
+void exposu3_check(su3* const vr, const su3adj* const p, int im);
+void restoresu3(su3* const vr, const su3* const u);
+void restoresu3_in_place(su3* const u);
+void exposu3_in_place(su3* const u);
 
 #endif
diff --git a/global.h b/global.h
@@ -195,7 +195,7 @@ EXTERN su3adj ** ddummy;
 
 EXTERN int count00,count01,count10,count11,count20,count21;
 EXTERN double g_kappa, g_c_sw, g_beta;
-EXTERN double g_mu, g_mu1, g_mu2, g_mu3;
+EXTERN double g_mu, g_mu1, g_mu2, g_mu3, g_shift;
 EXTERN double g_rgi_C0, g_rgi_C1;
 
 /* Parameters for non-degenrate case */
@@ -212,6 +212,10 @@ EXTERN int g_mpi_z_rank;
 EXTERN int g_mpi_ST_rank;
 EXTERN int g_nb_list[8];
 
+/* Variables for exposu3 */
+EXTERN int g_exposu3_no_c;
+EXTERN double * g_exposu3_c;
+
 /* OpenMP Kahan accumulation arrays */
 EXTERN _Complex double *g_omp_acc_cp;
 EXTERN double* g_omp_acc_re;
@@ -282,3 +286,14 @@ void fatal_error(char const *error, char const *function);
 
 #endif
 
+/*
+ * Comments: generic macro for swapping values or pointers.
+ * We use memcpy because is optimal when the amount to copy is known at compilation time. 
+ * "sizeof(x) == sizeof(y) ? (signed)sizeof(x) : -1" is a compile time check that the types are compatible.
+ */
+#define SWAP(x,y) do \
+{ unsigned char swap_temp[sizeof(x) == sizeof(y) ? (signed)sizeof(x) : -1]; \
+  memcpy(swap_temp,&y,sizeof(x)); \
+  memcpy(&y,&x,       sizeof(x)); \
+  memcpy(&x,swap_temp,sizeof(x)); \
+} while(0)
diff --git a/init/init_gauge_field.c b/init/init_gauge_field.c
@@ -27,6 +27,7 @@
 #include "su3.h"
 #include "sse.h"
 #include "init_gauge_field.h"
+#include "expo.h"
 
 su3 * gauge_field = NULL;
 su3_32 * gauge_field_32 = NULL;
@@ -48,6 +49,8 @@ int init_gauge_field(const int V, const int back) {
   g_gauge_field_copy = NULL;
 #endif
 
+  if (g_exposu3_no_c == 0) init_exposu3();
+
   if((void*)(g_gauge_field = (su3**)calloc(V, sizeof(su3*))) == NULL) {
     printf ("malloc errno : %d\n",errno); 
     errno = 0;

diff --git a/init/init_stout_smear_vars.c b/init/init_stout_smear_vars.c
@@ -28,6 +28,7 @@
 #include "global.h"
 #include "su3.h"
 #include "sse.h"
+#include "expo.h"
 #include "init_stout_smear_vars.h"
 
 su3 * gauge_field_saved;
@@ -91,6 +92,8 @@ int init_stout_smear_vars(const int V, const int stout_no_iter)
   k = 0;
   mu = 0;
 
+  if (g_exposu3_no_c == 0) init_exposu3();
+
   /*
    *  this is the field where we store the smeared force matrices \Sigma^{(k)}_\mu(x)
    *  eqtn (44) hep-lat/0311018

diff --git a/invert.c b/invert.c
@@ -94,6 +94,7 @@
 #endif
 #include "meas/measurements.h"
 #include "source_generation.h"
+#include "expo.h"
 
 #define CONF_FILENAME_LENGTH 500
 
@@ -179,7 +180,6 @@ int main(int argc, char *argv[])
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
   j += init_gauge_field_32(VOLUMEPLUSRAND, 0);  
 #endif
-
   if (j != 0) {
     fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
     exit(-1);
@@ -296,7 +296,6 @@ int main(int argc, char *argv[])
       exit(-2);
     }
 
-
     if (g_cart_id == 0) {
       printf("# Finished reading gauge field.\n");
       fflush(stdout);