diff --git a/DDalphaAMG_interface.c b/DDalphaAMG_interface.c
index 4b24928fc..bc655806e 100644
--- a/DDalphaAMG_interface.c
+++ b/DDalphaAMG_interface.c
@@ -28,12 +28,15 @@
 int mg_setup_iter;
 int mg_coarse_setup_iter;
 int mg_update_setup_iter;
+int mg_update_gauge;
 int mg_omp_num_threads;
 int mg_Nvec;
 int mg_lvl;
 int mg_blk[4];
 int mg_mixed_prec;
 int mg_setup_mu_set;
+int mg_no_shifts = 0;
+double mg_mms_mass = 0;
 double mg_setup_mu;
 double mg_cmu_factor;
 double mg_dtau_update;
@@ -65,20 +68,28 @@ void MG_finalize(void) {
 }
 
 int MG_solver(spinor * const phi_new, spinor * const phi_old,
-	      const double precision, const int max_iter,const int rel_prec,
-	      const int N, su3 **gf, matrix_mult f) {
+              const double precision, const int max_iter,const int rel_prec,
+              const int N, su3 **gf, matrix_mult f) { // stub for builds without DDalphaAMG: prints the error below and exits with status 1
     printf("ERROR: MG_solver called but DDalphaAMG library not included.\n");
     exit(1);
 }
 
 int MG_solver_eo(spinor * const Even_new, spinor * const Odd_new,
-		 spinor * const Even, spinor * const Odd,
-		 const double precision, const int max_iter, const int rel_prec,
-		 const int N, su3 **gf, matrix_mult_full f_full) {
+                 spinor * const Even, spinor * const Odd,
+                 const double precision, const int max_iter, const int rel_prec,
+                 const int N, su3 **gf, matrix_mult_full f_full) { // stub for builds without DDalphaAMG: prints the error below and exits with status 1
     printf("ERROR: MG_solver_eo called but DDalphaAMG library not included.\n");
     exit(1);
 }
 
+int MG_solver_nd(spinor * const up_new, spinor * const dn_new,
+		 spinor * const up_old, spinor * const dn_old,
+		 const double precision, const int max_iter, const int rel_prec,
+		 const int N, su3 **gf, matrix_mult_nd f) { // stub for builds without DDalphaAMG; none of the arguments are used
+    printf("ERROR: MG_solver_nd called but DDalphaAMG library not included.\n");
+    exit(1); // never returns: the process exits with status 1
+}
+
 #else
 #include <stdio.h>
 #include <stdlib.h>
@@ -92,13 +103,17 @@ int MG_solver_eo(spinor * const Even_new, spinor * const Odd_new,
 #include "read_input.h"
 #include "DDalphaAMG.h"
 #include "linalg_eo.h"
+#include "phmc.h"
 #include "operator/D_psi.h"
 #include "operator/tm_operators.h"
+#include "operator/tm_operators_nd.h"
 #include "operator/clovertm_operators.h"
+#include "operator/Hopping_Matrix.h"
 
-//Enable to test the solution. It cost an application more of the operator. 
-//TODO: test all the operators interfaced and then undefine this flag.
-#define MGTEST
+// Enable the variant for the shifted operator in the ND sector.
+// The variant is used when an initial guess is given for the squared operator.
+// It is faster, and tests show it is also safe (see Appendix A of arXiv:1801.##### by S. Bacchio et al.).
+#define VARIANT_FOR_SHIFTED
 
 DDalphaAMG_init mg_init;
 DDalphaAMG_parameters mg_params;
@@ -116,6 +131,8 @@ int mg_lvl=3;
 int mg_blk[4] = {0, 0, 0, 0};
 int mg_mixed_prec=0;
 int mg_setup_mu_set = 0; //flag that enable the use of mg_setup_mu in the setup phase
+int mg_no_shifts = 0; // number of shifts to invert with MG. MMS-CG is used for the others at larger mass.
+double mg_mms_mass = 0.1; // mass shift value for switching from MMS-CG to MG. MMS-CG is used for larger masses than the value.
 double mg_setup_mu = 0.; 
 double mg_cmu_factor = 1.0;
 double mg_dtau_update = 0.0;
@@ -166,7 +183,7 @@ static int vector_index_fct(int t, int z, int y, int x )
    return id;
 }
 
-static int MG_check(spinor * const phi_new, spinor * const phi_old, const int N, const double precision, matrix_mult f) 
+static inline int MG_check(spinor * const phi_new, spinor * const phi_old, const int N, const double precision, matrix_mult f) 
 {
   double differ[2], residual;
   spinor ** check_vect = NULL;
@@ -199,6 +216,94 @@ static int MG_check(spinor * const phi_new, spinor * const phi_old, const int N,
   
 }
 
+static inline int MG_check_nd( spinor * const up_new, spinor * const dn_new, spinor * const up_old, spinor * const dn_old,
+                               const int N, const double precision, matrix_mult_nd f) 
+{ // Checks a two-flavour solution: returns 1 if || old - f(new) || / || old || is acceptable, 0 otherwise
+  double differ[2], residual; // differ[0]: norm of f(new) - old, differ[1]: norm of old
+  spinor ** check_vect = NULL;
+  double acc_factor = 4; // residuals up to acc_factor*precision are still accepted (with a warning)
+#ifdef VARIANT_FOR_SHIFTED
+  if((  f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+        f == Qsw_pm_ndpsi_shift )   // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+     && g_shift != 0 )
+    acc_factor = 1/sqrt(phmc_cheb_evmin/phmc_cheb_evmax + g_shift); // replaces the default factor for shifted squared operators
+#endif
+  // Apply f to the candidate solution and subtract the source, using two temporary fields
+  init_solver_field(&check_vect, VOLUMEPLUSRAND,2);
+  f( check_vect[0], check_vect[1], up_new, dn_new);
+  diff( check_vect[0], check_vect[0], up_old, N);
+  diff( check_vect[1], check_vect[1], dn_old, N);
+  differ[0] = sqrt(square_norm(check_vect[0], N, 1)+square_norm(check_vect[1], N, 1));
+  differ[1] = sqrt(square_norm(up_old, N, 1)+square_norm(dn_old, N, 1));
+  finalize_solver(check_vect, 2);
+  // NOTE(review): differ[1] == 0 (zero source) is not guarded before the division below - confirm callers never pass one
+  residual = differ[0]/differ[1];
+  // precision < residual < acc_factor*precision: accept with a warning; residual > acc_factor*precision: reject
+  if( residual > precision && residual < acc_factor*precision ) {
+    if(g_proc_id == 0)
+      printf("WARNING: solution accepted even if the residual wasn't complitely acceptable (%e > %e). Max acc. factor %f.\n", residual, precision, acc_factor);
+  } else if( residual > acc_factor*precision ) {
+    if(g_proc_id == 0) {
+      printf("ERROR: something bad happened... MG converged giving the wrong solution!! Trying to restart... \n");
+      printf("ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e \n", differ[0],differ[1],differ[0]/differ[1],precision);
+    }
+    return 0; // solution rejected
+  } 
+
+  if (g_debug_level > 0 && g_proc_id == 0)
+    printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n", differ[0],differ[1],differ[0]/differ[1]);
+  
+  return 1; // solution accepted
+  
+}
+
+static inline int MG_mms_check_nd( spinor **const up_new, spinor **const dn_new, 
+                                   spinor * const up_old, spinor * const dn_old,
+                                   const double * shifts, const int no_shifts, 
+                                   const int N, double * precision, matrix_mult_nd f) 
+{ // Checks no_shifts shifted solutions against the same source: returns 1 if all are accepted, 0 at the first rejection
+  double differ[2], residual; // differ[0]: norm of f(new[i]) - old, differ[1]: norm of old
+  spinor ** check_vect = NULL;
+  double acc_factor = 2; // residuals up to acc_factor*precision[i] are still accepted (with a warning)
+  
+  init_solver_field(&check_vect, VOLUMEPLUSRAND,2);
+
+  for( int i = 0; i < no_shifts; i++ ) {
+    // The global g_shift is overwritten with the squared i-th shift; presumably f reads it - verify against the operator
+    g_shift = shifts[i]*shifts[i]; 
+
+    f( check_vect[0], check_vect[1], up_new[i], dn_new[i]);
+    diff( check_vect[0], check_vect[0], up_old, N);
+    diff( check_vect[1], check_vect[1], dn_old, N);
+    differ[0] = sqrt(square_norm(check_vect[0], N, 1)+square_norm(check_vect[1], N, 1));
+    differ[1] = sqrt(square_norm(up_old, N, 1)+square_norm(dn_old, N, 1));
+  
+    residual = differ[0]/differ[1];
+    // precision[i] < residual < acc_factor*precision[i]: accept with a warning; residual > acc_factor*precision[i]: reject
+    if( residual > precision[i] && residual < acc_factor*precision[i] ) {
+      if(g_proc_id == 0)
+        printf("WARNING: solution accepted even if the residual wasn't complitely acceptable (%e > %e) \n", residual, precision[i]);
+    } else if( residual > acc_factor*precision[i] ) {
+      if(g_proc_id == 0) {
+        printf("ERROR: something bad happened... MG converged giving the wrong solution!! Trying to restart... \n");
+        printf("ERROR contd: || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e > %e \n", differ[0],differ[1],differ[0]/differ[1],precision[i]);
+      }
+      finalize_solver(check_vect, 2); // finalize the temporary fields before the early return
+      return 0; // solution i rejected; later shifts are not checked
+    } 
+    
+    if (g_debug_level > 0 && g_proc_id == 0)
+      printf("MGTEST:  || s - f_{tmLQC} * f_{DDalphaAMG}^{-1} * s || / ||s|| = %e / %e = %e \n", differ[0],differ[1],differ[0]/differ[1]);
+    
+  }
+  // NOTE(review): g_shift is not restored; it keeps the last value assigned in the loop - confirm callers expect this
+  finalize_solver(check_vect, 2);
+
+  return 1; // every shifted solution accepted
+  
+}
+
+
 static int MG_pre_solve( su3 **gf )
 {
   
@@ -239,7 +344,7 @@ static int MG_pre_solve( su3 **gf )
   if (mg_do_setup==1) {
     if( mg_setup_mu_set ) {
       if (g_proc_id == 0)
-	printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
+        printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
       MG_update_mu(mg_setup_mu, 0); 
     } else
       MG_update_mu(g_mu, 0);
@@ -248,7 +353,7 @@ static int MG_pre_solve( su3 **gf )
     DDalphaAMG_setup(&mg_status);
     mg_do_setup = 0;
     mg_tau = gauge_tau;
-    if (mg_status.success && g_proc_id == 0)	
+    if (mg_status.success && g_proc_id == 0)
       printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n",
              mg_status.time, 100.*(mg_status.coarse_time/mg_status.time));
     else if ( g_proc_id == 0)
@@ -258,7 +363,7 @@ static int MG_pre_solve( su3 **gf )
   if (mg_update_setup>0) {
     if( mg_setup_mu_set ) {
       if (g_proc_id == 0)
-	printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
+        printf("DDalphaAMG using mu=%f during setup\n", mg_setup_mu);
       MG_update_mu(mg_setup_mu, 0); 
     } else
       MG_update_mu(g_mu, 0);
@@ -267,9 +372,9 @@ static int MG_pre_solve( su3 **gf )
     DDalphaAMG_update_setup(mg_update_setup, &mg_status);
     mg_update_setup = 0;
     mg_tau = gauge_tau;
-    if (mg_status.success && g_proc_id == 0)	
+    if (mg_status.success && g_proc_id == 0)
       printf("DDalphaAMG setup ran, time %.2f sec (%.2f %% on coarse grid)\n",
-	     mg_status.time, 100.*(mg_status.coarse_time/mg_status.time));
+             mg_status.time, 100.*(mg_status.coarse_time/mg_status.time));
     else if ( g_proc_id == 0)
       printf("ERROR: setup updating did not run correctly");
   }
@@ -278,7 +383,7 @@ static int MG_pre_solve( su3 **gf )
 }
 
 static int MG_solve(spinor * const phi_new, spinor * const phi_old, const double precision,
-						  const int N, matrix_mult f)
+                    const int N, matrix_mult f)
 {
   
   // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} -> rescale by 1/4+m
@@ -302,68 +407,80 @@ static int MG_solve(spinor * const phi_new, spinor * const phi_old, const double
   
   // Checking if the operator is in the list and compatible with N
   if (      f == Msw_psi ||       //          Schur complement with mu=0 on odd sites
-	    f == Qsw_psi ||       // Gamma5 - Schur complement with mu=0 on odd sites
-	    f == Mtm_plus_psi ||  //          Schur complement with plus mu 
-	    f == Msw_plus_psi ||  //          Schur complement with plus mu
-	    f == Qtm_plus_psi ||  // Gamma5 - Schur complement with plus mu 
-	    f == Qsw_plus_psi ||  // Gamma5 - Schur complement with plus mu
-	    f == Mtm_minus_psi || //          Schur complement with minus mu 
-	    f == Msw_minus_psi || //          Schur complement with minus mu
-	    f == Qtm_minus_psi || // Gamma5 - Schur complement with minus mu 
-	    f == Qsw_minus_psi || // Gamma5 - Schur complement with minus mu
-	    f == Qtm_pm_psi ||    //          Schur complement squared
-	    f == Qsw_pm_psi ) {   //          Schur complement squared
+            f == Qsw_psi ||       // Gamma5 - Schur complement with mu=0 on odd sites
+            f == Mtm_plus_psi ||  //          Schur complement with plus mu 
+            f == Msw_plus_psi ||  //          Schur complement with plus mu
+            f == Qtm_plus_psi ||  // Gamma5 - Schur complement with plus mu 
+            f == Qsw_plus_psi ||  // Gamma5 - Schur complement with plus mu
+            f == Mtm_minus_psi || //          Schur complement with minus mu 
+            f == Msw_minus_psi || //          Schur complement with minus mu
+            f == Qtm_minus_psi || // Gamma5 - Schur complement with minus mu 
+            f == Qsw_minus_psi || // Gamma5 - Schur complement with minus mu
+            f == Qtm_pm_psi ||    //          Schur complement squared
+            f == Qsw_pm_psi ) {   //          Schur complement squared
     if( N != VOLUME/2 && g_proc_id == 0 )
       printf("WARNING: expected N == VOLUME/2 for the required operator in MG_solve. Continuing with N == VOLUME\n");
   }
   else if ( f == D_psi ||         //          Full operator    with plus mu
-	    f == Q_plus_psi ||    // Gamma5 - Full operator    with plus mu 
-	    f == Q_minus_psi ||   // Gamma5 - Full operator    with minus mu
-	    f == Q_pm_psi ) {     //          Full operator    squared
+            f == Q_plus_psi ||    // Gamma5 - Full operator    with plus mu 
+            f == Q_minus_psi ||   // Gamma5 - Full operator    with minus mu
+            f == Q_pm_psi ||      //          Full operator    squared
+            f == Qsw_full_plus_psi || // Gamma5 - Full operator    with plus mu
+            f == Qsw_full_minus_psi|| //Gamma5 - Full operator    with minus mu
+            f == Qsw_full_pm_psi   || //          Full operator    squared
+            f == Msw_full_minus_psi) {//         Full operator    with minus mu
     if( N != VOLUME && g_proc_id == 0 )
       printf("WARNING: expected N == VOLUME for the required operator in MG_solve. Continuing with N == VOLUME/2\n");
   }
   else if( g_proc_id == 0 )
     printf("WARNING: required operator unknown for MG_solve. Using standard operator: %s.\n",
-	   N==VOLUME?"D_psi":"Msw_plus_psi");
+           N==VOLUME?"D_psi":"Msw_plus_psi");
 
   // Setting mu
   if (      f == Msw_psi ||       //          Schur complement with mu=0 on odd sites
-	    f == Qsw_psi )        // Gamma5 - Schur complement with mu=0 on odd sites
+            f == Qsw_psi )        // Gamma5 - Schur complement with mu=0 on odd sites
     MG_update_mu(g_mu, -g_mu);
   else if ( f == Mtm_minus_psi || //          Schur complement with minus mu 
-	    f == Msw_minus_psi || //          Schur complement with minus mu
-	    f == Qtm_minus_psi || // Gamma5 - Schur complement with minus mu 
-	    f == Qsw_minus_psi || // Gamma5 - Schur complement with minus mu
-	    f == Q_minus_psi )    // Gamma5 - Full operator    with minus mu
+            f == Msw_minus_psi || //          Schur complement with minus mu
+            f == Qtm_minus_psi || // Gamma5 - Schur complement with minus mu 
+            f == Qsw_minus_psi || // Gamma5 - Schur complement with minus mu
+            f == Qsw_full_minus_psi|| //Gamma5 - Full operator    with minus mu
+            f == Msw_full_minus_psi|| //         Full operator    with minus mu
+            f == Q_minus_psi )    // Gamma5 - Full operator    with minus mu
     MG_update_mu(-g_mu, -g_mu3);
   else if ( f == Mtm_plus_psi ||  //          Schur complement with plus mu 
-	    f == Msw_plus_psi ||  //          Schur complement with plus mu
-	    f == Qtm_plus_psi ||  // Gamma5 - Schur complement with plus mu 
-	    f == Qsw_plus_psi ||  // Gamma5 - Schur complement with plus mu
-	    f == D_psi ||         //          Full operator    with plus mu
-	    f == Q_plus_psi ||    // Gamma5 - Full operator    with plus mu 
-	    f == Qtm_pm_psi ||    //          Schur complement squared
-	    f == Qsw_pm_psi ||    //          Schur complement squared
-	    f == Q_pm_psi )       //          Full operator    squared
+            f == Msw_plus_psi ||  //          Schur complement with plus mu
+            f == Qtm_plus_psi ||  // Gamma5 - Schur complement with plus mu 
+            f == Qsw_plus_psi ||  // Gamma5 - Schur complement with plus mu
+            f == D_psi ||         //          Full operator    with plus mu
+            f == Q_plus_psi ||    // Gamma5 - Full operator    with plus mu 
+            f == Qtm_pm_psi ||    //          Schur complement squared
+            f == Qsw_pm_psi ||    //          Schur complement squared
+            f == Qsw_full_plus_psi || // Gamma5 - Full operator    with plus mu
+            f == Qsw_full_pm_psi   || //          Full operator    squared
+            f == Q_pm_psi )       //          Full operator    squared
     MG_update_mu(g_mu, g_mu3); 
   else
     MG_update_mu(g_mu, g_mu3); 
 
   //Solving
   if (      f == Qtm_plus_psi ||  // Gamma5 - Schur complement with plus mu 
-	    f == Qsw_plus_psi ||  // Gamma5 - Schur complement with plus mu
-	    f == Qtm_minus_psi || // Gamma5 - Schur complement with minus mu 
-	    f == Qsw_minus_psi || // Gamma5 - Schur complement with minus mu 
-	    f == Qsw_psi ||       // Gamma5 - Schur complement with mu=0 on odd sites
-	    f == Q_plus_psi ||    // Gamma5 - Full operator    with plus mu 
-	    f == Q_minus_psi ) {  // Gamma5 - Full operator    with minus mu
-    mul_gamma5(old, VOLUME);
+            f == Qsw_plus_psi ||  // Gamma5 - Schur complement with plus mu
+            f == Qtm_minus_psi || // Gamma5 - Schur complement with minus mu 
+            f == Qsw_minus_psi || // Gamma5 - Schur complement with minus mu 
+            f == Qsw_psi ||       // Gamma5 - Schur complement with mu=0 on odd sites
+            f == Q_plus_psi ||    // Gamma5 - Full operator    with plus mu 
+            f == Q_minus_psi ||   // Gamma5 - Full operator    with minus mu
+            f == Qsw_full_plus_psi || // Gamma5 - Full operator    with plus mu
+            f == Qsw_full_minus_psi|| //Gamma5 - Full operator    with minus mu
+            f == Qsw_full_pm_psi ) {  //          Full operator    squared
+    mul_gamma5((spinor *const) old, VOLUME);
     DDalphaAMG_solve( new, old, precision, &mg_status );
-    mul_gamma5(old, VOLUME);
+    if( N == VOLUME ) // in case of VOLUME/2, old is just a local vector
+      mul_gamma5((spinor *const) old, VOLUME);
   }
   else if ( f == Qtm_pm_psi ||    //          Schur complement squared
-	    f == Qsw_pm_psi ) {   //          Schur complement squared
+            f == Qsw_pm_psi ) {   //          Schur complement squared
     mg_scale *= mg_scale;
     DDalphaAMG_solve_squared_odd( new, old, precision, &mg_status );
   }
@@ -372,12 +489,14 @@ static int MG_solve(spinor * const phi_new, spinor * const phi_old, const double
     DDalphaAMG_solve_squared( new, old, precision, &mg_status );
   }
   else if ( f == Mtm_plus_psi ||  //          Schur complement with plus mu 
-	    f == Msw_plus_psi ||  //          Schur complement with plus mu
-	    f == Mtm_minus_psi || //          Schur complement with minus mu 
-	    f == Msw_minus_psi || //          Schur complement with minus mu
-	    f == Msw_psi ||       //          Schur complement with mu=0 on odd sites
-	    f == D_psi )          //          Full operator    with plus mu
+            f == Msw_plus_psi ||  //          Schur complement with plus mu
+            f == Mtm_minus_psi || //          Schur complement with minus mu 
+            f == Msw_minus_psi || //          Schur complement with minus mu
+            f == Msw_psi ||       //          Schur complement with mu=0 on odd sites
+            f == D_psi ||         //          Full operator    with plus mu
+            f == Msw_full_minus_psi) {//         Full operator    with minus mu
     DDalphaAMG_solve( new, old, precision, &mg_status );
+  }
   else
     DDalphaAMG_solve( new, old, precision, &mg_status );
   
@@ -386,15 +505,506 @@ static int MG_solve(spinor * const phi_new, spinor * const phi_old, const double
     finalize_solver(solver_field, 2);
   }
   
+  mul_r(phi_new ,mg_scale, phi_new, N);
+
   if (g_proc_id == 0) {
     printf("Solving time %.2f sec (%.1f %% on coarse grid)\n", mg_status.time,
-	   100.*(mg_status.coarse_time/mg_status.time));
+           100.*(mg_status.coarse_time/mg_status.time));
     printf("Total iterations on fine grid %d\n", mg_status.iter_count);
     printf("Total iterations on coarse grids %d\n", mg_status.coarse_iter_count);
     if (!mg_status.success) 
       printf("ERROR: the solver did not converge!\n");
   }
-  mul_r(phi_new ,mg_scale, phi_new, N);
+  
+  return mg_status.success;
+}
+
+static int MG_solve_nd( spinor * up_new, spinor * dn_new, spinor * const up_old, spinor * const dn_old,
+                        const double precision, const int N, matrix_mult_nd f)
+{
+  
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} -> rescale by 1/4+m
+  // moreover in the nd case, the tmLQCD is multiplied by phmc_invmaxev
+  double mg_scale=0.5/g_kappa/phmc_invmaxev;
+  double sqnorm;
+  int init_guess = 0;
+  spinor *old1 = up_old; 
+  spinor *old2 = dn_old; 
+  spinor *new1 = up_new, *new1tmp;
+  spinor *new2 = dn_new, *new2tmp;
+  spinor ** solver_field = NULL, ** oe_solver_field = NULL;
+  int no_solver_field = 0;
+
+  if( N != VOLUME && N != VOLUME/2 ) {
+    if( g_proc_id == 0 )
+      printf("ERROR: N = %d in MG_solve. Expettected N == VOLUME (%d) or VOLUME/2 (%d)\n", N, VOLUME, VOLUME/2);
+    return 0;
+  }
+
+  if (N==VOLUME/2) no_solver_field += 4;
+
+  // Checking if initial guess is given
+  sqnorm = square_norm(up_new, N, 1);
+  sqnorm += square_norm(dn_new, N, 1);
+  if ( sqnorm > 1e-14 ) init_guess = 1;
+
+  // In case of initial guess and squared operator, we do the inversion in two steps and we need two more vectors
+  if ( init_guess && (
+            f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qsw_pm_ndpsi_shift ))  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    no_solver_field += 2;
+
+  // Allocating and assigning fields
+  if(no_solver_field>0)
+    init_solver_field(&solver_field, VOLUMEPLUSRAND,no_solver_field);
+
+  int assign_solver_field = 0;
+  if (N==VOLUME/2) {
+    old1 = solver_field[assign_solver_field++];
+    old2 = solver_field[assign_solver_field++];
+    new1 = solver_field[assign_solver_field++];
+    new2 = solver_field[assign_solver_field++];
+    convert_odd_to_lexic(old1, up_old);
+    convert_odd_to_lexic(old2, dn_old);
+    set_even_to_zero(old1);
+    set_even_to_zero(old2);
+  }
+
+  if ( init_guess && (
+            f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qsw_pm_ndpsi_shift )) {// (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    new1tmp = solver_field[assign_solver_field++];
+    new2tmp = solver_field[assign_solver_field++];
+  }
+
+  // Reconstructing initial guess in case of oe
+  if ( init_guess && N==VOLUME/2 ) {
+    init_solver_field(&oe_solver_field, VOLUMEPLUSRAND, 4);
+    spinor* tmp11 = oe_solver_field[0];
+    spinor* tmp21 = oe_solver_field[1];
+    spinor* tmp12 = oe_solver_field[2];
+    spinor* tmp22 = oe_solver_field[3];
+
+    if (g_debug_level > 2) {
+      double differ[2];
+      f( tmp11, tmp12, up_new, dn_new);
+      diff( tmp11, tmp11, up_old, N);
+      diff( tmp12, tmp12, dn_old, N);
+      differ[0] = sqrt(square_norm(tmp11, N, 1)+square_norm(tmp12, N, 1));
+      differ[1] = sqrt(square_norm(up_old, N, 1)+square_norm(dn_old, N, 1));
+  
+      if(g_proc_id == 0)
+        printf("MG TEST: using initial guess. Relative residual = %e  \n", differ[0]/differ[1]);
+    }
+
+    /* Reconstruct the even sites                */
+    if (    f == Qtm_pm_ndpsi       ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qsw_pm_ndpsi       ||  // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+            f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift ) {// Gamma5 Dh tau1 - Schur complement with minus shift
+#ifdef VARIANT_FOR_SHIFTED
+      if((  f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift )   // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+         && g_shift != 0 ) {
+        if( f == Qtm_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+          Qtm_tau1_ndpsi_add_Ishift(tmp12, tmp11, up_new, dn_new); // tau1 exchange tmp11 <-> tmp12  
+        } else {                        // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+          Qsw_tau1_ndpsi_add_Ishift(tmp12, tmp11, up_new, dn_new); // tau1 exchange tmp11 <-> tmp12
+        }
+        // tau1 exchange new1tmp <-> new2tmp
+        convert_odd_to_lexic( new2, tmp11);
+        convert_odd_to_lexic( new1, tmp12);
+        Hopping_Matrix(EO, tmp21, tmp11);
+        Hopping_Matrix(EO, tmp22, tmp12);
+        Msw_ee_inv_ndpsi(tmp11, tmp12, tmp21, tmp22);
+        convert_even_to_lexic(new2, tmp11);
+        convert_even_to_lexic(new1, tmp12);
+      } else
+#endif
+      {
+        // tau1 exchange tmp11 <-> tmp12
+        Hopping_Matrix(EO, tmp12, up_new);
+        Hopping_Matrix(EO, tmp11, dn_new);
+
+        Msw_ee_inv_ndpsi(tmp21, tmp22, tmp11, tmp12);
+
+        /* Assigning with plus sign for the even
+         * since in Hopping_Matrix the minus is missing
+         */
+        // tau1 exchange tmp22 <-> tmp21
+        convert_eo_to_lexic(new1, tmp22, up_new);
+        convert_eo_to_lexic(new2, tmp21, dn_new);
+      }
+    } else {
+      Hopping_Matrix(EO, tmp11, up_new);
+      Hopping_Matrix(EO, tmp12, dn_new);
+
+      Msw_ee_inv_ndpsi(tmp21, tmp22, tmp11, tmp12);
+
+      /* Assigning with plus sign for the even
+       * since in Hopping_Matrix the minus is missing
+       */
+      convert_eo_to_lexic(new1, tmp21, up_new);
+      convert_eo_to_lexic(new2, tmp22, dn_new);
+    }
+  
+    // if squared obtaining initial guess for Gamma5 Dh
+    if (    f == Qtm_pm_ndpsi       ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+      Qtm_dagger_ndpsi(tmp11, tmp12, up_new, dn_new); // tau1 Gamma5 Dh tau1
+    }
+    else if(f == Qsw_pm_ndpsi       ) { // (Gamma5 Dh tau1)^2 - Schur complement squared
+      Qsw_dagger_ndpsi(tmp11, tmp12, up_new, dn_new); // tau1 Gamma5 Dh tau1
+    }
+    else if(f == Qtm_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+      Qtm_tau1_ndpsi_sub_Ishift(tmp12, tmp11, up_new, dn_new); // tau1 exchange tmp11 <-> tmp12  
+    }
+    else if(f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+      Qsw_tau1_ndpsi_sub_Ishift(tmp12, tmp11, up_new, dn_new); // tau1 exchange tmp11 <-> tmp12
+    }
+
+    if (    f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qsw_pm_ndpsi_shift ){  // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+
+      // tau1 exchange new1tmp <-> new2tmp
+      convert_odd_to_lexic( new2tmp, tmp11);
+      convert_odd_to_lexic( new1tmp, tmp12);
+      Hopping_Matrix(EO, tmp21, tmp11);
+      Hopping_Matrix(EO, tmp22, tmp12);
+      Msw_ee_inv_ndpsi(tmp11, tmp12, tmp21, tmp22);
+      convert_even_to_lexic(new2tmp, tmp11);
+      convert_even_to_lexic(new1tmp, tmp12);
+    } 
+    finalize_solver(oe_solver_field, 4);
+  } 
+
+  // Checking if the operator is in the list and compatible with N
+  if (      f == Qtm_ndpsi ||           //  Gamma5 Dh    - Schur complement with csw = 0
+            f == Qsw_ndpsi ||           //  Gamma5 Dh    - Schur complement
+            f == Qtm_dagger_ndpsi ||    //  Gamma5 Dh    - Schur complement with mu = -mubar and csw = 0
+            f == Qsw_dagger_ndpsi ||    //  Gamma5 Dh    - Schur complement with mu = -mubar
+            f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with minus shift
+            f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    if( N != VOLUME/2 && g_proc_id == 0 )
+      printf("WARNING: expected N == VOLUME/2 for the required operator in MG_solve. Continuing with N == VOLUME\n");
+  }
+  else if ( f == D_ndpsi ) {            //  Dh
+    if( N != VOLUME && g_proc_id == 0 )
+      printf("WARNING: expected N == VOLUME for the required operator in MG_solve. Continuing with N == VOLUME/2\n");
+  }
+  else if( g_proc_id == 0 )
+    printf("WARNING: required operator unknown for MG_solve. Using standard operator: %s.\n",
+           N==VOLUME?"":"Qsw_ndpsi");
+
+  // Setting mu and eps
+  if (      f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift )   // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, sqrt(g_shift) );
+  else if ( f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qsw_tau1_ndpsi_add_Ishift )  // Gamma5 Dh tau1 - Schur complement with plus shift
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, sqrt(g_shift) );
+  else if ( f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift )  // Gamma5 Dh tau1 - Schur complement with minus shift
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, -sqrt(g_shift) );
+  else if ( f == Qtm_dagger_ndpsi ||    //  Gamma5 Dh    - Schur complement with mu = -mubar csw = 0
+            f == Qsw_dagger_ndpsi )     //  Gamma5 Dh    - Schur complement with mu = -mubar
+    MG_update_mubar_epsbar( -g_mubar, g_epsbar, 0 );
+  else if ( f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == D_ndpsi )              //  Dh
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, 0 );
+  else
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, 0 );
+  
+  //Solving
+  if (      f == Qtm_ndpsi ||           //  Gamma5 Dh    - Schur complement with csw = 0
+            f == Qsw_ndpsi ||           //  Gamma5 Dh    - Schur complement
+            f == Qtm_dagger_ndpsi ||    //  Gamma5 Dh    - Schur complement with mu = -mubar csw = 0
+            f == Qsw_dagger_ndpsi ) {   //  Gamma5 Dh    - Schur complement with mu = -mubar
+    mul_gamma5(old1, VOLUME);
+    mul_gamma5(old2, VOLUME);
+    if (init_guess) {
+      // Removing normalization from initial guess
+      mul_r(new1, 1/mg_scale, new1, VOLUME);
+      mul_r(new2, 1/mg_scale, new2, VOLUME);
+      DDalphaAMG_solve_doublet_with_guess( (double*) new1, (double*) old1, (double*) new2, (double*) old2,
+                                           precision, &mg_status );
+    } else {
+      DDalphaAMG_solve_doublet( (double*) new1, (double*) old1, (double*) new2, (double*) old2, 
+                                precision, &mg_status );
+    }
+    if( N == VOLUME ) { // in case of VOLUME/2, old is just a local vector
+      mul_gamma5(old1, VOLUME);
+      mul_gamma5(old2, VOLUME);
+    }
+  }
+  else if ( f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift ) {// Gamma5 Dh tau1 - Schur complement with minus shift
+    mul_gamma5(old1, VOLUME);
+    mul_gamma5(old2, VOLUME);
+    // tau1 exchange new1 <-> new2
+    if (init_guess) {
+      // Removing normalization from initial guess
+      mul_r(new1, 1/mg_scale, new1, VOLUME);
+      mul_r(new2, 1/mg_scale, new2, VOLUME);
+      DDalphaAMG_solve_doublet_with_guess( (double*) new2, (double*) old1, (double*) new1, (double*) old2, 
+                                           precision, &mg_status );
+    } else {
+      DDalphaAMG_solve_doublet( (double*) new2, (double*) old1, (double*) new1, (double*) old2, 
+                                precision, &mg_status );
+    }
+    if( N == VOLUME ) { // in case of VOLUME/2 old is a just local vector
+      mul_gamma5(old1, VOLUME);
+      mul_gamma5(old2, VOLUME);
+    }
+  }            
+  else if ( f == Qtm_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi ||        // (Gamma5 Dh tau1)^2 - Schur complement squared
+            f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
+    // tmLQCD:          gamma5 Dh tau1 gamma5 Dh tau1
+    if (init_guess) {
+      mul_gamma5(old1, VOLUME);
+      mul_gamma5(old2, VOLUME);
+      // Removing normalization from initial guess
+      mul_r(new1tmp, 1/mg_scale, new1tmp, VOLUME);
+      mul_r(new2tmp, 1/mg_scale, new2tmp, VOLUME);
+      DDalphaAMG_solve_doublet_with_guess( (double*) new2tmp, (double*) old1, (double*) new1tmp, (double*) old2,
+                                           precision/2, &mg_status );
+#ifdef VARIANT_FOR_SHIFTED
+      if((  f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift )   // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+         && g_shift != 0 ) {
+        // Removing normalization from initial guess
+        mul_r(new1, 1/mg_scale, new1, VOLUME);
+        mul_r(new2, 1/mg_scale, new2, VOLUME);
+        MG_update_mubar_epsbar( g_mubar, g_epsbar, -sqrt(g_shift) );
+        DDalphaAMG_solve_doublet_with_guess( (double*) new2, (double*) old1, (double*) new1, (double*) old2,
+                                             precision/2, &mg_status );
+        assign_mul_add_mul(new1, -_Complex_I/2./sqrt(g_shift), new1tmp, _Complex_I/2./sqrt(g_shift), VOLUME);
+        assign_mul_add_mul(new2, -_Complex_I/2./sqrt(g_shift), new2tmp, _Complex_I/2./sqrt(g_shift), VOLUME);
+      } else 
+#endif
+      {
+        mul_gamma5(new1tmp, VOLUME);
+        mul_gamma5(new2tmp, VOLUME);
+        set_even_to_zero(new1tmp);
+        set_even_to_zero(new2tmp);
+        // Removing normalization from initial guess
+        mg_scale *= mg_scale;
+        mul_r(new1, 1/mg_scale, new1, VOLUME);
+        mul_r(new2, 1/mg_scale, new2, VOLUME);
+        if (      f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+                  f == Qsw_pm_ndpsi_shift )   // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+          MG_update_mubar_epsbar( g_mubar, g_epsbar, -sqrt(g_shift) );
+        DDalphaAMG_solve_doublet_with_guess( (double*) new2, (double*) new1tmp, (double*) new1, (double*) new2tmp,
+                                             precision/2, &mg_status );      
+      }
+      if( N == VOLUME ) { // in case of VOLUME/2 old is a just local vector
+        mul_gamma5(old1, VOLUME);
+        mul_gamma5(old2, VOLUME);
+      }
+    } else {
+      mg_scale *= mg_scale;
+      DDalphaAMG_solve_doublet_squared_odd( (double*) new2, (double*) old2, (double*) new1, (double*) old1,
+                                            precision, &mg_status );
+    }
+  }
+  else if ( f == D_ndpsi ) {            //  Dh
+    if (init_guess) {
+      // Removing normalization from initial guess
+      mul_r(new1, 1/mg_scale, new1, VOLUME);
+      mul_r(new2, 1/mg_scale, new2, VOLUME);
+      DDalphaAMG_solve_doublet_with_guess( (double*) new1, (double*) old1, (double*) new2, (double*) old2,
+                                           precision, &mg_status );
+    } else {
+      DDalphaAMG_solve_doublet( (double*) new1, (double*) old1, (double*) new2, (double*) old2,
+                                precision, &mg_status );
+    }
+  } else {
+    if (init_guess) {
+      // Removing normalization from initial guess
+      mul_r(new1, 1/mg_scale, new1, VOLUME);
+      mul_r(new2, 1/mg_scale, new2, VOLUME);
+      DDalphaAMG_solve_doublet_with_guess( (double*) new1, (double*) old1, (double*) new2, (double*) old2,
+                                           precision, &mg_status );
+    } else {
+      DDalphaAMG_solve_doublet( (double*) new1, (double*) old1, (double*) new2, (double*) old2,
+                                precision, &mg_status );
+    }
+  }
+  if (N==VOLUME/2) {
+    convert_lexic_to_odd(up_new, new1);
+    convert_lexic_to_odd(dn_new, new2);
+  }
+  if (no_solver_field>0)
+    finalize_solver(solver_field, no_solver_field);
+  mul_r(up_new ,mg_scale, up_new, N);
+  mul_r(dn_new ,mg_scale, dn_new, N);
+  
+  if (g_proc_id == 0) {
+    printf("Solving time %.2f sec (%.1f %% on coarse grid)\n", mg_status.time,
+           100.*(mg_status.coarse_time/mg_status.time));
+    printf("Total iterations on fine grid %d\n", mg_status.iter_count);
+    printf("Total iterations on coarse grids %d\n", mg_status.coarse_iter_count);
+    if (!mg_status.success) 
+      printf("ERROR: the solver did not converge!\n");
+  }
+  
+  return mg_status.success;
+}
+
+static int MG_mms_solve_nd( spinor **const up_new, spinor **const dn_new, 
+                            spinor * const up_old, spinor * const dn_old,
+                            const double * shifts, const int no_shifts,
+                            double * precision, const int N, matrix_mult_nd f)
+{
+  // Multi-shift solve for the (up,dn) doublet: fills up_new[i]/dn_new[i] for each of the no_shifts shifts; returns mg_status.success, or 0 if N != VOLUME/2
+  // for rescaling  convention in DDalphaAMG: (4+m)*\delta_{x,y} in tmLQCD: 1*\delta_{x,y} -> rescale by 1/4+m
+  // moreover in the nd case, the tmLQCD is multiplied by phmc_invmaxev
+  double mg_scale=0.5/g_kappa/phmc_invmaxev;
+  double *old1 = (double*) up_old; 
+  double *old2 = (double*) dn_old; 
+  double **new1, **new2, *mg_odd_shifts, *mg_even_shifts;
+  spinor ** solver_field = NULL;
+
+  // Only N == VOLUME/2 is supported; anything else is rejected here, before any allocation
+  if( N != VOLUME/2 ) { // no full VOLUME functions implemented at the moment 
+    if( g_proc_id == 0 )
+      printf("ERROR: N = %d in MG_mms_solve_nd. Expected N == VOLUME/2 (%d)\n", N, VOLUME/2);
+    return 0;
+  }
+  // Shift arrays are zero-initialised: for an operator outside the lists below they are never filled but still passed to the solver
+  new1 = (double**) malloc(no_shifts*sizeof(double*));
+  new2 = (double**) malloc(no_shifts*sizeof(double*));
+  mg_odd_shifts  = (double*) calloc(no_shifts, sizeof(double));
+  mg_even_shifts = (double*) calloc(no_shifts, sizeof(double));
+
+  if( N==VOLUME/2 ) {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND,2+2*no_shifts);
+    old1 = (double*) solver_field[0];
+    old2 = (double*) solver_field[1];
+    convert_odd_to_lexic( (spinor*) old1, up_old);
+    convert_odd_to_lexic( (spinor*) old2, dn_old);
+
+    for( int i = 0; i < no_shifts; i++ ) {
+      new1[i] = (double*) solver_field[2+2*i];
+      new2[i] = (double*) solver_field[3+2*i];
+    }
+  } else {
+    for( int i = 0; i < no_shifts; i++ ) {
+      new1[i] = (double*) up_new[i];
+      new2[i] = (double*) dn_new[i];
+    }
+  }
+
+  // Checking if the operator is in the list and compatible with N
+  if (      f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with minus shift
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    if( N != VOLUME/2 ) { // cannot trigger at present: N != VOLUME/2 has already returned above
+      if( g_proc_id == 0 )
+        printf("ERROR: expected N == VOLUME/2 for the required operator in MG_mms_solve_nd.\n");
+      return 0;
+    }
+  }  else if( g_proc_id == 0 )
+    printf("WARNING: required operator unknown for MG_mms_solve_nd. Using standard operator: %s.\n",
+           N==VOLUME?"":"Qsw_pm_ndpsi_shift");
+
+  // Setting mubar, epsbar and shifts
+  if (      f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, shifts[0] );
+    for( int i = 0; i < no_shifts; i++ ) {
+      mg_odd_shifts[i]  = shifts[i]*mg_scale;
+      mg_even_shifts[i] = 0;
+    }
+  }
+  else if ( f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift ) {// Gamma5 Dh tau1 - Schur complement with minus shift
+    MG_update_mubar_epsbar( g_mubar, g_epsbar, -shifts[0] );
+    for( int i = 0; i < no_shifts; i++ ) {
+      mg_odd_shifts[i]  = -shifts[i]*mg_scale;
+      mg_even_shifts[i] = 0;
+    }
+  }
+
+  //Solving
+  if (      f == Qtm_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and plus shift
+            f == Qtm_tau1_ndpsi_sub_Ishift || // Gamma5 Dh tau1 - Schur complement with csw = 0 and minus shift
+            f == Qsw_tau1_ndpsi_add_Ishift || // Gamma5 Dh tau1 - Schur complement with plus shift
+            f == Qsw_tau1_ndpsi_sub_Ishift ) {// Gamma5 Dh tau1 - Schur complement with minus shift
+    mul_gamma5((spinor *const) old1, VOLUME);
+    mul_gamma5((spinor *const) old2, VOLUME);
+    // tau1 exchange new1 <-> new2
+    DDalphaAMG_solve_ms_doublet( new2, old1, new1, old2, mg_even_shifts, mg_odd_shifts, no_shifts, 
+                                 precision, &mg_status );
+    if( N == VOLUME ) { // in case of VOLUME/2 old is a just local vector
+      mul_gamma5((spinor *const) old1, VOLUME);
+      mul_gamma5((spinor *const) old2, VOLUME);
+    }
+  }            
+  else if ( f == Qtm_pm_ndpsi_shift ||  // (Gamma5 Dh tau1)^2 - Schur complement squared with csw = 0 and shift
+            f == Qsw_pm_ndpsi_shift ) { // (Gamma5 Dh tau1)^2 - Schur complement squared with shift
+    mg_scale *= mg_scale;
+    // DDalphaAMG: tau1 gamma5 Dh tau1 gamma5 Dh
+    // tmLQCD:          gamma5 Dh tau1 gamma5 Dh tau1
+    DDalphaAMG_solve_ms_doublet_squared_odd( new2, old2, new1, old1, mg_even_shifts, mg_odd_shifts, no_shifts,
+                                             precision, &mg_status );
+  }
+  else
+    DDalphaAMG_solve_ms_doublet( new1, old1, new2, old2, mg_even_shifts, mg_odd_shifts, no_shifts, 
+                                 precision, &mg_status );
+
+  if (N==VOLUME/2) {
+    for( int i = 0; i < no_shifts; i++ ) {
+      convert_lexic_to_odd(up_new[i], (spinor*) new1[i]);
+      convert_lexic_to_odd(dn_new[i], (spinor*) new2[i]);
+    }
+    finalize_solver(solver_field, 2+2*no_shifts);
+  }
+
+  for( int i = 0; i < no_shifts; i++ ) {
+    mul_r(up_new[i], mg_scale, up_new[i], N);
+    mul_r(dn_new[i], mg_scale, dn_new[i], N);
+  }
+
+  if (g_proc_id == 0) {
+    printf("Solving time %.2f sec (%.1f %% on coarse grid)\n", mg_status.time,
+           100.*(mg_status.coarse_time/mg_status.time));
+    printf("Total iterations on fine grid %d\n", mg_status.iter_count);
+    printf("Total iterations on coarse grids %d\n", mg_status.coarse_iter_count);
+    if (!mg_status.success) 
+      printf("ERROR: the solver did not converge!\n");
+  }
+
+  free(new1);
+  free(new2);
+  free(mg_odd_shifts);
+  free(mg_even_shifts);
   
   return mg_status.success;
 }
@@ -418,7 +1028,7 @@ void MG_init()
   for(int i = 0; i<4; i++)
     if(mg_blk[i]==0)
       mg_blk[i]=(((L/g_nproc_x)%2==0)?(((L/g_nproc_x)%4==0)?4:2):
-		 (((L/g_nproc_x)%3==0)?3:1));
+                 (((L/g_nproc_x)%3==0)?3:1));
   
   mg_init.block_lattice[0]=mg_blk[0];
   mg_init.block_lattice[1]=mg_blk[1];
@@ -466,8 +1076,8 @@ void MG_init()
   
   if (mg_status.success!=mg_lvl) {
       if (g_proc_id == 0) {
-	  printf("MG WARNING: %d level initialized instead of %d\n",mg_status.success,mg_lvl);
-	  printf("MG WARNING: parameter: mg_lvl is changed to %d\n\n",mg_status.success);
+          printf("MG WARNING: %d level initialized instead of %d\n",mg_status.success,mg_lvl);
+          printf("MG WARNING: parameter: mg_lvl is changed to %d\n\n",mg_status.success);
       }
       mg_lvl=mg_status.success;
   }
@@ -511,15 +1121,15 @@ void MG_update_gauge(double step)
   mg_update_gauge = 1;
 }
 
-void MG_update_mu(double mu_tmLQCD, double odd_tmLQCD)
+void MG_update_mu(double mu_tmLQCD, double shift_tmLQCD)
 {
-  double mu, odd_shift;
-  mu=0.5*mu_tmLQCD/g_kappa;
-  odd_shift=0.5*odd_tmLQCD/g_kappa;
+  double mu, shift;
+  mu    = 0.5 * mu_tmLQCD   /g_kappa;
+  shift = 0.5 * shift_tmLQCD/g_kappa;
   
   DDalphaAMG_get_parameters(&mg_params);
   
-  if (mu != mg_params.mu || odd_shift != mg_params.mu_odd_shift || mg_params.mu_even_shift != 0.0 ) {
+  if (mu != mg_params.mu || shift != mg_params.mu_odd_shift || mg_params.mu_even_shift != 0.0 || mg_params.smoother_iterations != 4 ) {
     //Taking advantage of this function for updating printing in HMC
     if(g_debug_level > 0) 
       mg_params.print=1;
@@ -528,9 +1138,43 @@ void MG_update_mu(double mu_tmLQCD, double odd_tmLQCD)
 
     mg_params.mu = mu;
     mg_params.mu_even_shift = 0.0;
-    mg_params.mu_odd_shift = odd_shift;
+    mg_params.mu_odd_shift = shift;
+    mg_params.mu_factor[mg_lvl-1] = mg_cmu_factor;
+    mg_params.epsbar = 0.0;
+    mg_params.epsbar_ig5_even_shift = 0.0;
+    mg_params.epsbar_ig5_odd_shift = 0.0;
+    mg_params.smoother_iterations = 4;
+    DDalphaAMG_update_parameters(&mg_params, &mg_status);
+  }         
+}
+
+void MG_update_mubar_epsbar(double mubar_tmLQCD, double epsbar_tmLQCD, double shift_tmLQCD)
+{ // Sets the doublet parameters (mu, epsbar and the odd ig5 shift) in DDalphaAMG, calling the update only if something differs
+  double mubar, epsbar, shift;
+  mubar  = 0.5 * mubar_tmLQCD /g_kappa;               // all three inputs are rescaled by 0.5/g_kappa ...
+  epsbar = 0.5 * epsbar_tmLQCD/g_kappa;
+  shift  = 0.5 * shift_tmLQCD/g_kappa/phmc_invmaxev;  // ... and the shift additionally by 1/phmc_invmaxev
+  // Read back the parameter set currently held by DDalphaAMG into mg_params
+  DDalphaAMG_get_parameters(&mg_params);
+  // Skip the update when every value below already matches what this function would set
+  if ( mubar != mg_params.mu || mg_params.mu_odd_shift != 0.0 || mg_params.mu_even_shift != 0.0 ||
+       epsbar != mg_params.epsbar || shift != mg_params.epsbar_ig5_odd_shift || mg_params.epsbar_ig5_even_shift != 0.0 || mg_params.smoother_iterations != 2 ) {
+    //Taking advantage of this function for updating printing in HMC
+    if(g_debug_level > 0) 
+      mg_params.print=1;
+    else
+      mg_params.print=0;
+    // Values written here: mu shifts zeroed, mu_factor on index mg_lvl-1 set to 1, shift applied on the odd ig5 term only
+    mg_params.mu = mubar;
+    mg_params.mu_even_shift = 0.0;
+    mg_params.mu_odd_shift = 0.0;
+    mg_params.mu_factor[mg_lvl-1] = 1.0;
+    mg_params.epsbar = epsbar;
+    mg_params.epsbar_ig5_even_shift = 0.0;
+    mg_params.epsbar_ig5_odd_shift = shift;
+    mg_params.smoother_iterations = 2;
     DDalphaAMG_update_parameters(&mg_params, &mg_status);
-  }	 
+  }         
 }
 
 void MG_reset() {
@@ -552,8 +1196,8 @@ void MG_finalize()
 
 
 int MG_solver(spinor * const phi_new, spinor * const phi_old,
-	      const double precision, const int max_iter,const int rel_prec,
-	      const int N, su3 **gf, matrix_mult f)
+              const double precision, const int max_iter,const int rel_prec,
+              const int N, su3 **gf, matrix_mult f)
 {
   
   int success=0;
@@ -562,21 +1206,17 @@ int MG_solver(spinor * const phi_new, spinor * const phi_old,
   MG_pre_solve(gf);
 
   success = MG_solve( phi_new, phi_old, mg_prec, N, f );
-  
-#ifdef MGTEST
-  if(success) 
+
+  if(success && g_debug_level > 2) 
     success = MG_check( phi_new, phi_old, N, mg_prec, f );
-#endif
   
   if(!success) {
     MG_reset();
     MG_pre_solve(gf);
     success = MG_solve( phi_new, phi_old, mg_prec, N, f);
     
-#ifdef MGTEST
-    if(success) 
+    if(success && g_debug_level > 2) 
       success = MG_check( phi_new, phi_old, N, mg_prec, f );
-#endif
   }
   
   if(!success) {
@@ -593,9 +1233,9 @@ int MG_solver(spinor * const phi_new, spinor * const phi_old,
 }
 
 int MG_solver_eo(spinor * const Even_new, spinor * const Odd_new,
-		 spinor * const Even, spinor * const Odd,
-		 const double precision, const int max_iter, const int rel_prec,
-		 const int N, su3 **gf, matrix_mult_full f_full)
+                 spinor * const Even, spinor * const Odd,
+                 const double precision, const int max_iter, const int rel_prec,
+                 const int N, su3 **gf, matrix_mult_full f_full)
 {
   
   int iter_count;
@@ -611,6 +1251,8 @@ int MG_solver_eo(spinor * const Even_new, spinor * const Odd_new,
     f=&Q_plus_psi;
   else if (f_full == Msw_full)
     f=&D_psi;
+  else if (f_full == Qsw_full)
+    f=&Qsw_full_plus_psi;
   else {
     f=&D_psi;
     if( g_proc_id == 0 )
@@ -625,4 +1267,133 @@ int MG_solver_eo(spinor * const Even_new, spinor * const Odd_new,
   return iter_count;
 }
 
+int MG_solver_nd(spinor * const up_new, spinor * const dn_new,
+                 spinor * const up_old, spinor * const dn_old,
+                 const double precision, const int max_iter, const int rel_prec,
+                 const int N, su3 **gf, matrix_mult_nd f)
+{
+  // Driver for the doublet solve: MG_solve_nd, optional MG_check_nd (g_debug_level > 2), one retry
+  // after MG_reset, program exit if that fails too. Returns mg_status.iter_count; max_iter is unused.
+  int converged;
+  double tol;
+
+  // sqrt of the requested precision, divided first by the squared rhs norm when rel_prec is zero
+  if (rel_prec)
+    tol = sqrt(precision);
+  else
+    tol = sqrt(precision/(square_norm(up_old, N, 1)+square_norm(dn_old, N, 1)));
+
+  MG_pre_solve(gf);
+  converged = MG_solve_nd( up_new, dn_new, up_old, dn_old, tol, N, f );
+  if (converged && g_debug_level > 2) {
+    converged = MG_check_nd( up_new, dn_new, up_old, dn_old, N, tol, f );
+    if (!converged) { // check rejected the solution: solve again with the current setup and re-check
+      converged = MG_solve_nd( up_new, dn_new, up_old, dn_old, tol, N, f );
+      if (converged)
+        converged = MG_check_nd( up_new, dn_new, up_old, dn_old, N, tol, f );
+    }
+  }
+
+  if (!converged) { // last attempt, after MG_reset and a fresh MG_pre_solve
+    MG_reset();
+    MG_pre_solve(gf);
+    converged = MG_solve_nd( up_new, dn_new, up_old, dn_old, tol, N, f );
+    if (converged && g_debug_level > 2)
+      converged = MG_check_nd( up_new, dn_new, up_old, dn_old, N, tol, f );
+  }
+
+  if (converged)
+    return mg_status.iter_count; // mg_status should have been used last time for the inversion.
+
+  if( g_proc_id == 0 )
+    printf("ERROR: solver didn't converge after two trials!! Aborting... \n");
+  DDalphaAMG_finalize();
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Finalize();
+  exit(1);
+}
+
+int MG_solver_nd_eo(spinor * const Even_new_up, spinor * const Odd_new_up, 
+                    spinor * const Even_new_dn, spinor * const Odd_new_dn,
+                    spinor * const Even_up, spinor * const Odd_up,
+                    spinor * const Even_dn, spinor * const Odd_dn,
+                    const double precision, const int max_iter, const int rel_prec,
+                    const int N, su3 **gf, matrix_mult_full_nd f_full)
+{
+  // Even/odd front end of MG_solver_nd: the even/odd halves of the up and dn sources are merged
+  // with convert_eo_to_lexic, solved on the full VOLUME, and the solutions are split back into
+  // Even_new_* / Odd_new_* with convert_lexic_to_eo. Returns the value returned by MG_solver_nd.
+  // The argument N is not used: the inner solve is always called with VOLUME.
+
+  int iter_count;
+  spinor ** solver_field = NULL;
+  // D_ndpsi is used for every f_full; M_full_ndpsi and Msw_full_ndpsi are the recognised ones.
+  matrix_mult_nd f = &D_ndpsi;
+
+  // solver_field[0], [1]: up/dn sources; solver_field[2], [3]: up/dn solutions
+  init_solver_field(&solver_field, VOLUMEPLUSRAND, 4);
+  convert_eo_to_lexic(solver_field[0], Even_up, Odd_up);
+  convert_eo_to_lexic(solver_field[1], Even_dn, Odd_dn);
+
+  // Any other operator still runs, but process 0 reports the fallback under this function's own name.
+  if (f_full != M_full_ndpsi && f_full != Msw_full_ndpsi && g_proc_id == 0)
+    printf("WARNING: required operator unknown for MG_solver_nd_eo. Using standard operator.\n");
+
+  iter_count = MG_solver_nd( solver_field[2], solver_field[3], solver_field[0], solver_field[1], precision, max_iter,
+                             rel_prec, VOLUME, gf, f );
+
+  convert_lexic_to_eo(Even_new_up, Odd_new_up, solver_field[2]);
+  convert_lexic_to_eo(Even_new_dn, Odd_new_dn, solver_field[3]);
+  finalize_solver(solver_field, 4);
+
+  return iter_count;
+}
+
+int MG_mms_solver_nd(spinor **const up_new, spinor **const dn_new,
+                     spinor * const up_old, spinor * const dn_old,
+                     const double * shifts, const int no_shifts,
+                     const double * precision, const int max_iter, const int rel_prec,
+                     const int N, su3 **gf, matrix_mult_nd f)
+{
+  // Driver for the multi-shift doublet solve: runs MG_mms_solve_nd, verifies it with
+  // MG_mms_check_nd when g_debug_level > 2, retries once after MG_reset and terminates the
+  // program if the retry fails as well. Returns mg_status.iter_count; max_iter is not used.
+
+  int converged;
+  double tol[no_shifts];
+  double rhs_norm2 = 0.0;
+
+  // Per-shift tolerance: sqrt of the requested precision, which is first divided by the squared
+  // norm of the (up,dn) right-hand side when rel_prec is zero.
+  if (!rel_prec)
+    rhs_norm2 = square_norm(up_old, N, 1)+square_norm(dn_old, N, 1);
+  for (int s = 0; s < no_shifts; s++)
+    tol[s] = rel_prec ? sqrt(precision[s]) : sqrt(precision[s]/rhs_norm2);
+
+  MG_pre_solve(gf);
+  converged = MG_mms_solve_nd( up_new, dn_new, up_old, dn_old, shifts, no_shifts, tol, N, f );
+  if (converged && g_debug_level > 2)
+    converged = MG_mms_check_nd( up_new, dn_new, up_old, dn_old, shifts, no_shifts, N, tol, f );
+
+  if (!converged) {
+    // Second and last attempt, after MG_reset and a fresh MG_pre_solve.
+    MG_reset();
+    MG_pre_solve(gf);
+    converged = MG_mms_solve_nd( up_new, dn_new, up_old, dn_old, shifts, no_shifts, tol, N, f);
+    if (converged && g_debug_level > 2)
+      converged = MG_mms_check_nd( up_new, dn_new, up_old, dn_old, shifts, no_shifts, N, tol, f );
+  }
+
+  if (converged)
+    return mg_status.iter_count; // mg_status should have been used last time for the inversion.
+
+  if( g_proc_id == 0 )
+    printf("ERROR: solver didn't converge after two trials!! Aborting... \n");
+  //TODO: handle abort
+  DDalphaAMG_finalize();
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Finalize();
+  exit(1);
+}
+
 #endif
diff --git a/DDalphaAMG_interface.h b/DDalphaAMG_interface.h
index 144e96b4c..0fa8e75ca 100644
--- a/DDalphaAMG_interface.h
+++ b/DDalphaAMG_interface.h
@@ -26,16 +26,20 @@
 #include "global.h"
 #include "su3.h"
 #include"solver/matrix_mult_typedef.h"
+#include"solver/matrix_mult_typedef_nd.h"
 
 extern int mg_setup_iter;
 extern int mg_coarse_setup_iter;
 extern int mg_update_setup_iter;
+extern int mg_update_gauge;
 extern int mg_omp_num_threads;
 extern int mg_Nvec;
 extern int mg_lvl;
 extern int mg_blk[4];
 extern int mg_mixed_prec;
 extern int mg_setup_mu_set;
+extern int mg_no_shifts;
+extern double mg_mms_mass;
 extern double mg_setup_mu;
 extern double mg_cmu_factor;
 extern double mg_dtau_update;
@@ -44,6 +48,7 @@ extern double mg_rho_update;
 void MG_init(void);
 void MG_update_gauge(double step);
 void MG_update_mu(double mu_tmLQCD, double odd_tmLQCD);
+void MG_update_mubar_epsbar(double mubar_tmLQCD, double epsbar_tmLQCD, double shift_tmLQCD);
 void MG_reset(void);
 void MG_finalize(void);
 
@@ -56,4 +61,22 @@ int MG_solver_eo(spinor * const Even_new, spinor * const Odd_new,
 		 const double precision, const int max_iter, const int rel_prec,
 		 const int N, su3 **gf, matrix_mult_full f_full);
 
+int MG_solver_nd(spinor * const up_new, spinor * const dn_new,
+		 spinor * const up_old, spinor * const dn_old,
+		 const double precision, const int max_iter, const int rel_prec,
+		 const int N, su3 **gf, matrix_mult_nd f);
+
+int MG_solver_nd_eo(spinor * const Even_new_up, spinor * const Odd_new_up, 
+                    spinor * const Even_new_dn, spinor * const Odd_new_dn,
+                    spinor * const Even_up, spinor * const Odd_up,
+                    spinor * const Even_dn, spinor * const Odd_dn,
+                    const double precision, const int max_iter, const int rel_prec,
+                    const int N, su3 **gf, matrix_mult_full_nd f_full);
+
+int MG_mms_solver_nd(spinor **const up_new, spinor **const dn_new,
+                     spinor * const up_old, spinor * const dn_old,
+                     const double * shifts, const int no_shifts,
+                     const double * precision, const int max_iter, const int rel_prec,
+                     const int N, su3 **gf, matrix_mult_nd f);
+
 #endif /* DDalphaAMG_INTERFACE_H_ */
diff --git a/default_input_values.h b/default_input_values.h
index c11e759cd..f9903da8f 100644
--- a/default_input_values.h
+++ b/default_input_values.h
@@ -50,6 +50,7 @@
 #define _default_g_mu1 0.0
 #define _default_g_mu2 0.0
 #define _default_g_mu3 0.0
+#define _default_g_shift 0.0
 #define _default_c_sw -1.0
 #define _default_g_beta 6.0
 #define _default_g_N_s 20
diff --git a/doc/DDalphaAMG.tex b/doc/DDalphaAMG.tex
index 7fac1eda2..0a5f2cd64 100644
--- a/doc/DDalphaAMG.tex
+++ b/doc/DDalphaAMG.tex
@@ -88,6 +88,8 @@ \subsubsection{More advanced settings}
 	\item[\texttt{MGdtauUpdate:}] for HMC, $d\tau$ interval after that the setup is updated. If 0, it will be updated every time the configuration is changed.
 	\item[\texttt{MGrhoUpdate:}] for HMC, rho value of the monomial at which the setup have to be updated. It can be combined with \texttt{MGdtauUpdate} or used standalone.
 	\item[\texttt{MGUpdateSetupIter:}] for HMC, number of setup iterations to do on the fine level when the setup has to be updated.
+	\item[\texttt{MGNumberOfShifts:}] for MG in multi-shift systems, the number $N$ of shifted linear systems to be solved by DDalphaAMG. MG will solve the $N$ systems with the smallest shifts.
+	\item[\texttt{MGMMSMass:}] for MG in multi-shift systems, an alternative to \texttt{MGNumberOfShifts}. MG will solve all the systems whose mass shift is smaller than the given value.
 \end{description}
 \subsubsection{Output analysis\label{sec:DDalphaAMG_output}}
 Running tmLQCD programs with the option \texttt{-v}, the full output of DDalphaAMG is shown. Here some hints on the informations given. Just before the setup, the full set of parameters is printed, with an output similar to the following:
diff --git a/expo.c b/expo.c
index dcac9a983..93f99cc9d 100644
--- a/expo.c
+++ b/expo.c
@@ -52,48 +52,132 @@
 #include "su3.h"
 #include "su3adj.h"
 #include "expo.h"
+#include "float.h"
+#include "global.h"
 
-void exposu3(su3* const vr, const su3adj* const p) {
-  int i;
-  su3 ALIGN v,v2;
-  double ALIGN fac,r;
-  double ALIGN a,b;
-  _Complex double ALIGN a0,a1,a2,a1p;
+static double imag_det(const su3adj* p) {
+  /* Evaluates a cubic polynomial in the eight components d1..d8 of *p; exposu3 uses the
+     negated result as its d, which it labels -1i * det(X). */
+  const double two_over_sqrt3 = 2.0/sqrt(3.0), one_third = 1.0/3.0, one_over_sqrt3 = 1.0/sqrt(3.0);
+  double det;
+
+  det = two_over_sqrt3*p->d8*(one_third*p->d8*p->d8-p->d3*p->d3)+2*(p->d2*p->d4*p->d7-p->d1*p->d4*p->d6-p->d2*p->d5*p->d6-p->d1*p->d5*p->d7);
+  det+=(one_over_sqrt3*p->d8-p->d3)*(p->d4*p->d4+p->d5*p->d5)+(one_over_sqrt3*p->d8+p->d3)*(p->d6*p->d6+p->d7*p->d7)-two_over_sqrt3*p->d8*(p->d1*p->d1+p->d2*p->d2);
+  return det;
+}
+
+static void mul_su3alg(su3adj* p,double d) {
+  /* Multiplies each of the eight components d1..d8 of *p by d, in place.
+     exposu3 calls this with d = 0.5 to halve its working copy of the input. */
+
+  p->d1 *= d;  p->d2 *= d;
+  p->d3 *= d;  p->d4 *= d;
+  p->d5 *= d;  p->d6 *= d;
+  p->d7 *= d;  p->d8 *= d;
+
+}
 
-  /* it writes 'p=vec(h_{j,mu})' in matrix form 'v' */  
+void init_exposu3() {
+  /* Fills the global table g_exposu3_c[k] = 1/k!, k = 0..g_exposu3_no_c, that exposu3 reads. */
+  int k;
+  double fctr = 1.0;
+  g_exposu3_no_c = 0;
+  while (fctr>DBL_EPSILON) {              /* count terms until 1/n! is no longer above DBL_EPSILON */
+    g_exposu3_no_c++;
+    fctr/=(double)(g_exposu3_no_c);
+  }
+  g_exposu3_no_c += 7;                    /* seven further terms on top of that count */
+  g_exposu3_no_c += (g_exposu3_no_c%2);   /* make the count even */
+  g_exposu3_c=malloc((g_exposu3_no_c+1)*sizeof(*g_exposu3_c));
+  if (g_exposu3_c == NULL)                /* the table is written right below: do not continue without it */
+    fatal_error("Could not allocate the exposu3 coefficient table.", "init_exposu3"); /* presumably does not return - confirm */
+  g_exposu3_c[0]=1.0;
+  for (k=0; k < g_exposu3_no_c; k++)
+    g_exposu3_c[k+1]=g_exposu3_c[k]/(double)(k+1);
+}
+
+void exposu3(su3* const vr, const su3adj* const p) {
+  int n,m,mm;
+  su3 ALIGN v,v2,vt;
+  su3adj pa;
+  double ALIGN d,tc;
+  _Complex double t;
+  _Complex double ALIGN p0,p1,p2;
+  _Complex double ALIGN q0,q1,q2;
+  /* tc = -2*tr(v^2) for the unscaled input; it controls the halving loop further down */
   _make_su3(v,*p);
+  _su3_times_su3(v2,v,v);
+  tc = -2.0*(v2.c00 +v2.c11+v2.c22);
+  /* pa is a local copy of *p, so the rescaling below does not touch the caller's argument */
+  pa.d1=(*p).d1;
+  pa.d2=(*p).d2;
+  pa.d3=(*p).d3;
+  pa.d4=(*p).d4;
+  pa.d5=(*p).d5;
+  pa.d6=(*p).d6;
+  pa.d7=(*p).d7;
+  pa.d8=(*p).d8;
+  /* halve pa (and tc along with it) until tc <= 1; mm counts the halvings */
+  mm=0;
+  while (tc>1.0) {
+    mul_su3alg(&pa,0.5);
+    tc*=0.5;
+    mm+=1;
+  }
+  
+  /* writes the rescaled element pa in matrix form 'v' */
+  _make_su3(v,pa);
   /* calculates v^2 */
   _su3_times_su3(v2,v,v);
-  /* */
-  a = 0.5 * (creal(v2.c00) + creal(v2.c11) + creal(v2.c22));
-  /* 1/3 imaginary part of tr v*v2 */
-  b = 0.33333333333333333 * cimag(v.c00 * v2.c00 + v.c01 * v2.c10 + v.c02 * v2.c20 +
-                                  v.c10 * v2.c01 + v.c11 * v2.c11 + v.c12 * v2.c21 +
-                                  v.c20 * v2.c02 + v.c21 * v2.c12 + v.c22 * v2.c22  );
-  a0  = 0.16059043836821615e-9;
-  a1  = 0.11470745597729725e-10;
-  a2  = 0.76471637318198165e-12;
-  fac = 0.20876756987868099e-8;      /*  1/12! */
-  r   = 12.0;
-  for(i = 3; i <= 15; ++i)
-  {
-    a1p = a0 + a * a2;
-    a0 = fac + b * I * a2;
-    a2 = a1;
-    a1 = a1p;
-    fac *= r;
-    r -= 1.0;
+  /* t = -tr(X^2)/2, with X the rescaled matrix v */
+  t = -0.5*(v2.c00 +v2.c11+v2.c22);
+  /* d = -1i * det(X), obtained as minus imag_det of pa */
+  d=-imag_det(&pa);
+ /*  printf(" d= %.16f and t=%.16f + 1i %.16f \n",d,creal(t),cimag(t));*/
+  /* diagnostic only: a message is printed, the computation continues either way */
+  if(fabs(d)>(1.000001*(1.000002-fabs(t))))
+    printf("The norm of X is larger than 1 and N = %d \n", g_exposu3_no_c);
+  /* backward recursion over the coefficients g_exposu3_c[n], from n = g_exposu3_no_c down to 0, */
+  /* which leaves in p0, p1, p2 the weights of 1, v and v2 used to assemble vt below */
+  p0=g_exposu3_c[g_exposu3_no_c];
+  p1=0.0;
+  p2=0.0;
+  
+  for (n=(g_exposu3_no_c-1);n>=0;n--) {
+    q0=p0;
+    q1=p1;
+    q2=p2;
+    
+    p0=g_exposu3_c[n]-I*d*q2;
+    p1=q0-t*q2;
+    p2=q1;
   }
+  /* in the current names the combination built below is vt = p0 + p1*v + p2*v2 */
   /* vr = a0 + a1*v + a2*v2 */
-  vr->c00 = a0 + a1 * v.c00 + a2 * v2.c00;
-  vr->c01 =      a1 * v.c01 + a2 * v2.c01;
-  vr->c02 =      a1 * v.c02 + a2 * v2.c02;
-  vr->c10 =      a1 * v.c10 + a2 * v2.c10;
-  vr->c11 = a0 + a1 * v.c11 + a2 * v2.c11;
-  vr->c12 =      a1 * v.c12 + a2 * v2.c12;
-  vr->c20 =      a1 * v.c20 + a2 * v2.c20;
-  vr->c21 =      a1 * v.c21 + a2 * v2.c21;
-  vr->c22 = a0 + a1 * v.c22 + a2 * v2.c22;
+  vt.c00 = p0 + p1 * v.c00 + p2 * v2.c00;
+  vt.c01 =      p1 * v.c01 + p2 * v2.c01;
+  vt.c02 =      p1 * v.c02 + p2 * v2.c02;
+  vt.c10 =      p1 * v.c10 + p2 * v2.c10;
+  vt.c11 = p0 + p1 * v.c11 + p2 * v2.c11;
+  vt.c12 =      p1 * v.c12 + p2 * v2.c12;
+  vt.c20 =      p1 * v.c20 + p2 * v2.c20;
+  vt.c21 =      p1 * v.c21 + p2 * v2.c21;
+  vt.c22 = p0 + p1 * v.c22 + p2 * v2.c22;
+  /* undo the mm halvings: square the result mm times (v2 serves as scratch space) */
+  for(m=0;m<mm;m++) {
+    _su3_times_su3(v2,vt,vt);
+    vt=v2;
+  }
+  /* copy the result into the output matrix */
+  vr->c00=vt.c00;
+  vr->c01=vt.c01; 
+  vr->c02=vt.c02; 
+  vr->c10=vt.c10;
+  vr->c11=vt.c11;
+  vr->c12=vt.c12;
+  vr->c20=vt.c20;
+  vr->c21=vt.c21;
+  vr->c22=vt.c22;
 }
 
 void exposu3_check(su3* const vr, const su3adj* const p, int im) {
@@ -135,6 +219,12 @@ void restoresu3(su3* const vr, const su3* const u) {
   vr->c20 = conj(vr->c01 * vr->c12 - vr->c02 * vr->c11);
   vr->c21 = conj(vr->c02 * vr->c10 - vr->c00 * vr->c12);
   vr->c22 = conj(vr->c00 * vr->c11 - vr->c01 * vr->c10);
+
+  /* compute  row 2 as the conjugate of the cross-product of 3 and 1 */
+  vr->c10 = conj(vr->c21 * vr->c02 - vr->c22 * vr->c01);
+  vr->c11 = conj(vr->c22 * vr->c00 - vr->c20 * vr->c02);
+  vr->c12 = conj(vr->c20 * vr->c01 - vr->c21 * vr->c00);
+
 }
 
 void restoresu3_in_place(su3* const u) {
@@ -156,6 +246,12 @@ void restoresu3_in_place(su3* const u) {
   u->c20 = conj(u->c01 * u->c12 - u->c02 * u->c11);
   u->c21 = conj(u->c02 * u->c10 - u->c00 * u->c12);
   u->c22 = conj(u->c00 * u->c11 - u->c01 * u->c10);
+
+  /* compute  row 2 as the conjugate of the cross-product of 3 and 1 */
+  u->c10 = conj(u->c21 * u->c02 - u->c22 * u->c01);
+  u->c11 = conj(u->c22 * u->c00 - u->c20 * u->c02);
+  u->c12 = conj(u->c20 * u->c01 - u->c21 * u->c00);
+
 }
                                 
 /* Exponentiates a hermitian 3x3 matrix Q */
diff --git a/expo.h b/expo.h
index dd0c3657f..8e5c1eef3 100644
--- a/expo.h
+++ b/expo.h
@@ -19,10 +19,11 @@
 #ifndef _EXPO_H
 #define _EXPO_H
 
-extern void exposu3(su3* const vr, const su3adj* const p);
-extern void exposu3_check(su3* const vr, const su3adj* const p, int im);
-extern void restoresu3(su3* const vr, const su3* const u);
-extern void restoresu3_in_place(su3* const u);
-extern void exposu3_in_place(su3* const u);
+void init_exposu3(void); /* prototype form: takes no arguments */
+void exposu3(su3* const vr, const su3adj* const p);
+void exposu3_check(su3* const vr, const su3adj* const p, int im);
+void restoresu3(su3* const vr, const su3* const u);
+void restoresu3_in_place(su3* const u);
+void exposu3_in_place(su3* const u);
 
 #endif
diff --git a/global.h b/global.h
index 666ab08a9..e814818f1 100644
--- a/global.h
+++ b/global.h
@@ -195,7 +195,7 @@ EXTERN su3adj ** ddummy;
 
 EXTERN int count00,count01,count10,count11,count20,count21;
 EXTERN double g_kappa, g_c_sw, g_beta;
-EXTERN double g_mu, g_mu1, g_mu2, g_mu3;
+EXTERN double g_mu, g_mu1, g_mu2, g_mu3, g_shift;
 EXTERN double g_rgi_C0, g_rgi_C1;
 
 /* Parameters for non-degenrate case */
@@ -212,6 +212,10 @@ EXTERN int g_mpi_z_rank;
 EXTERN int g_mpi_ST_rank;
 EXTERN int g_nb_list[8];
 
+/* Variables for exposu3 */
+EXTERN int g_exposu3_no_c;
+EXTERN double * g_exposu3_c;
+
 /* OpenMP Kahan accumulation arrays */
 EXTERN _Complex double *g_omp_acc_cp;
 EXTERN double* g_omp_acc_re;
@@ -282,3 +286,14 @@ void fatal_error(char const *error, char const *function);
 
 #endif
 
+/*
+ * SWAP(x,y): generic macro exchanging the contents of two lvalues (values or pointers) of equal size.
+ * memcpy is used because it is cheap when the size is known at compile time; users need <string.h>.
+ * "sizeof(x) == sizeof(y) ? (signed)sizeof(x) : -1" only checks at compile time that both sizes agree.
+ */
+#define SWAP(x,y) do \
+{ unsigned char swap_temp[sizeof(x) == sizeof(y) ? (signed)sizeof(x) : -1]; \
+  memcpy(swap_temp,&(y),sizeof(x)); /* arguments are expanded more than once: no side effects */ \
+  memcpy(&(y),&(x),    sizeof(x)); \
+  memcpy(&(x),swap_temp,sizeof(x)); \
+} while(0)
diff --git a/init/init_gauge_field.c b/init/init_gauge_field.c
index a4cdade3c..98eae6f2d 100644
--- a/init/init_gauge_field.c
+++ b/init/init_gauge_field.c
@@ -27,6 +27,7 @@
 #include "su3.h"
 #include "sse.h"
 #include "init_gauge_field.h"
+#include "expo.h"
 
 su3 * gauge_field = NULL;
 su3_32 * gauge_field_32 = NULL;
@@ -48,6 +49,8 @@ int init_gauge_field(const int V, const int back) {
   g_gauge_field_copy = NULL;
 #endif
 
+  if (g_exposu3_no_c == 0) init_exposu3();
+
   if((void*)(g_gauge_field = (su3**)calloc(V, sizeof(su3*))) == NULL) {
     printf ("malloc errno : %d\n",errno); 
     errno = 0;
diff --git a/init/init_stout_smear_vars.c b/init/init_stout_smear_vars.c
index 3bbb986a9..23ae299f8 100644
--- a/init/init_stout_smear_vars.c
+++ b/init/init_stout_smear_vars.c
@@ -28,6 +28,7 @@
 #include "global.h"
 #include "su3.h"
 #include "sse.h"
+#include "expo.h"
 #include "init_stout_smear_vars.h"
 
 su3 * gauge_field_saved;
@@ -91,6 +92,8 @@ int init_stout_smear_vars(const int V, const int stout_no_iter)
   k = 0;
   mu = 0;
 
+  if (g_exposu3_no_c == 0) init_exposu3();
+
   /*
    *  this is the field where we store the smeared force matrices \Sigma^{(k)}_\mu(x)
    *  eqtn (44) hep-lat/0311018
diff --git a/invert.c b/invert.c
index 0cfd814e6..9980e290f 100644
--- a/invert.c
+++ b/invert.c
@@ -94,6 +94,7 @@
 #endif
 #include "meas/measurements.h"
 #include "source_generation.h"
+#include "expo.h"
 
 #define CONF_FILENAME_LENGTH 500
 
@@ -179,7 +180,6 @@ int main(int argc, char *argv[])
   j = init_gauge_field(VOLUMEPLUSRAND, 0);
   j += init_gauge_field_32(VOLUMEPLUSRAND, 0);  
 #endif
- 
   if (j != 0) {
     fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
     exit(-1);
@@ -296,7 +296,6 @@ int main(int argc, char *argv[])
       exit(-2);
     }
 
-
     if (g_cart_id == 0) {
       printf("# Finished reading gauge field.\n");
       fflush(stdout);
diff --git a/invert_doublet_eo.c b/invert_doublet_eo.c
index 26ec8f96e..b3b86ab3f 100644
--- a/invert_doublet_eo.c
+++ b/invert_doublet_eo.c
@@ -50,6 +50,9 @@
 #ifdef TM_USE_QUDA
 #  include "quda_interface.h"
 #endif
+#ifdef DDalphaAMG
+#  include "DDalphaAMG_interface.h"
+#endif
 #ifdef TM_USE_QPHIX
 #include "qphix_interface.h"
 #endif
@@ -85,6 +88,15 @@ int invert_doublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s,
                                    sloppy, compression );
   }
 #endif
+
+#ifdef DDalphaAMG
+  if( solver_flag==MG ) {
+    return MG_solver_nd_eo( Even_new_s, Odd_new_s, Even_new_c, Odd_new_c,
+                            Even_s, Odd_s, Even_c, Odd_c,
+                            precision, max_iter, rel_prec,
+                            VOLUME/2, g_gauge_field, M_full_ndpsi );
+  }
+#endif
   
 #ifdef HAVE_GPU
 #  ifdef TEMPORALGAUGE
@@ -210,6 +222,15 @@ int invert_cloverdoublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s,
                                    sloppy, compression );
   }
 #endif
+
+#ifdef DDalphaAMG
+  if( solver_flag==MG ) {
+    return MG_solver_nd_eo( Even_new_s, Odd_new_s, Even_new_c, Odd_new_c,
+                            Even_s, Odd_s, Even_c, Odd_c,
+                            precision, max_iter, rel_prec,
+                            VOLUME/2, g_gauge_field, Msw_full_ndpsi );
+  }
+#endif
   
   /* here comes the inversion using even/odd preconditioning */
   if(g_proc_id == 0) {printf("# Using even/odd preconditioning!\n"); fflush(stdout);}
diff --git a/invert_eo.c b/invert_eo.c
index a0e62806b..ef5d156e7 100644
--- a/invert_eo.c
+++ b/invert_eo.c
@@ -485,7 +485,7 @@ int invert_eo(spinor * const Even_new, spinor * const Odd_new,
       if(g_proc_id == 0) {printf("# Using multi mass CG!\n"); fflush(stdout);}
       
       gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME);
-      iter = cg_mms_tm(P, g_spinor_field[DUM_DERI+1],&solver_params,&cgmms_reached_prec);
+      iter = cg_mms_tm(P, g_spinor_field[DUM_DERI+1],&solver_params);
       g_mu = shifts[0];
       Q_minus_psi(g_spinor_field[DUM_DERI+1], P[0]);
       
diff --git a/linalg/Makefile.in b/linalg/Makefile.in
index cdbb4ac2d..11339b10e 100644
--- a/linalg/Makefile.in
+++ b/linalg/Makefile.in
@@ -34,7 +34,7 @@ liblinalg_TARGETS = assign_add_mul_r_add_mul \
 	assign_mul_bra_add_mul_ket_add_r \
 	scalar_prod_r scalar_prod_i \
 	square_and_prod_r assign_mul_bra_add_mul_r mul_r mul_r_32 \
-	diff_and_square_norm assign \
+	diff_and_square_norm square_and_minmax assign \
 	scalar_prod mul_diff_r mul_diff_mul assign_add_mul assign_mul_add add \
 	assign_diff_mul mul_add_mul mul assign_add_mul_add_mul \
 	assign_mul_bra_add_mul_ket_add assign_mul_add_mul_add_mul_add_mul_r \
@@ -46,8 +46,8 @@ liblinalg_TARGETS = assign_add_mul_r_add_mul \
 	assign_mul_add_r_and_square \
 	addto_32 scalar_prod_r_32 assign_mul_add_r_32 assign_add_mul_r_32 \
 	square_norm_32 assign_to_32 diff_32 \
-	convert_odd_to_lexic set_even_to_zero mul_gamma5 \
-	mul_r_gamma5
+	convert_odd_to_lexic convert_even_to_lexic set_even_to_zero \
+	mul_gamma5 mul_r_gamma5
 
 liblinalg_STARGETS = diff assign_add_mul_r assign_mul_add_r square_norm
 
diff --git a/linalg/assign_add_mul_r_32.c b/linalg/assign_add_mul_r_32.c
index e60706ea1..15b7626f1 100644
--- a/linalg/assign_add_mul_r_32.c
+++ b/linalg/assign_add_mul_r_32.c
@@ -37,13 +37,8 @@
 #include "su3.h"
 #include "assign_add_mul_r_32.h"
 
-
 #if (defined BGQ && defined XLC)
-void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N) {
-#ifdef TM_USE_OMP
-#pragma omp parallel
-  {
-#endif
+void assign_add_mul_r_32_orphaned(spinor32 * const R, spinor32 * const S, const float c, const int N) {
   vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
   vector4double z0, z1, z2, z3, z4, z5, k;
   float *s, *r;
@@ -93,20 +88,13 @@ void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c,
     vec_st(z4, 0, r+16);
     vec_st(z5, 0, r+20);
   }
-#ifdef TM_USE_OMP
-  } /* OpenMP closing brace */
-#endif
   return;
 }
 
 #else
 
-void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N)
+void assign_add_mul_r_32_orphaned(spinor32 * const R, spinor32 * const S, const float c, const int N)
 {
-#ifdef TM_USE_OMP
-#pragma omp parallel
-  {
-#endif
   spinor32 *r,*s;
 
 #ifdef TM_USE_OMP
@@ -134,10 +122,20 @@ void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c,
     r->s3.c2 += c * s->s3.c2;
   }
 
+}
+
+#endif
+
+void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N)
+{
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  assign_add_mul_r_32_orphaned(R,S,c,N);
 #ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
-
+return;
 }
 
-#endif
diff --git a/linalg/convert_even_to_lexic.c b/linalg/convert_even_to_lexic.c
new file mode 100644
index 000000000..1979e5deb
--- /dev/null
+++ b/linalg/convert_even_to_lexic.c
@@ -0,0 +1,108 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifdef MPI
+# include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+#endif
+#include "global.h"
+#include "su3.h"
+#include "convert_even_to_lexic.h"
+/* Copies the even-site spinors held in r (indexed through g_lexic2eosub) to their lexicographic slots in P. */
+void convert_even_to_lexic(spinor * const P, spinor * const r) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* declared inside the parallel region, so each thread works on its own copy */
+  int x, y, z, t;
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(x = 0; x < LX; x++) {
+    for(y = 0; y < LY; y++) {
+      for(z = 0; z < LZ; z++) {
+        for(t = 0; t < T; t++) {
+          const int lex = g_ipt[t][x][y][z];
+          const int sub = g_lexic2eosub[lex];
+          /* site parity: local coordinates shifted by the g_proc_coords offsets */
+          const int parity = (t+x+y+z+g_proc_coords[3]*LZ+g_proc_coords[2]*LY
+                              + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2;
+          if(parity == 0) {
+            memcpy(&P[lex], &r[sub], sizeof(spinor));
+          }
+        }
+      }
+    }
+  }
+
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+
+  return;
+}
+
+/*
+ *      P: full-volume spinor field in lexicographic order (only read)
+ *      r: receives the spinors of the even sites of P, indexed through g_lexic2eosub
+ */
+void convert_lexic_to_even(spinor * const r, spinor * const P) {
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  /* declared inside the parallel region, so each thread works on its own copy */
+  int x, y, z, t;
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(x = 0; x < LX; x++) {
+    for(y = 0; y < LY; y++) {
+      for(z = 0; z < LZ; z++) {
+        for(t = 0; t < T; t++) {
+          const int lex = g_ipt[t][x][y][z];
+          const int sub = g_lexic2eosub[lex];
+          /* site parity: local coordinates shifted by the g_proc_coords offsets */
+          const int parity = (t+x+y+z+g_proc_coords[3]*LZ+g_proc_coords[2]*LY
+                              + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2;
+          if(parity == 0) {
+            memcpy(&r[sub], &P[lex], sizeof(spinor));
+          }
+        }
+      }
+    }
+  }
+
+#ifdef TM_USE_OMP
+  } /* OpenMP closing brace */
+#endif
+
+  return;
+}
diff --git a/linalg/convert_even_to_lexic.h b/linalg/convert_even_to_lexic.h
new file mode 100644
index 000000000..04eb066c0
--- /dev/null
+++ b/linalg/convert_even_to_lexic.h
@@ -0,0 +1,26 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _CONVERT_EVEN_TO_LEXIC_H
+#define _CONVERT_EVEN_TO_LEXIC_H
+
+void convert_even_to_lexic(spinor * const P, spinor * const r);
+void convert_lexic_to_even(spinor * const r, spinor * const P);
+
+#endif
diff --git a/linalg/convert_odd_to_lexic.c b/linalg/convert_odd_to_lexic.c
index 4280dad20..84155a92f 100644
--- a/linalg/convert_odd_to_lexic.c
+++ b/linalg/convert_odd_to_lexic.c
@@ -26,7 +26,7 @@
 #ifdef MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -34,7 +34,7 @@
 #include "convert_odd_to_lexic.h"
 
 void convert_odd_to_lexic(spinor * const P, spinor * const r) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -42,7 +42,7 @@ void convert_odd_to_lexic(spinor * const P, spinor * const r) {
   int x, y, z, t, i, ix;
   spinor * p = NULL;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(x = 0; x < LX; x++) {
@@ -61,7 +61,7 @@ void convert_odd_to_lexic(spinor * const P, spinor * const r) {
     }
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /*OpenMP closing brace */
 #endif
 
@@ -73,7 +73,7 @@ void convert_odd_to_lexic(spinor * const P, spinor * const r) {
  *      r: new spinor odd 
  */
 void convert_lexic_to_odd(spinor * const r, spinor * const P) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -81,7 +81,7 @@ void convert_lexic_to_odd(spinor * const r, spinor * const P) {
   int x, y, z, t, i, ix;
   spinor * p = NULL;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(x = 0; x < LX; x++) {
@@ -100,7 +100,7 @@ void convert_lexic_to_odd(spinor * const r, spinor * const P) {
     }
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /* OpenMP closing brace */
 #endif
 
diff --git a/linalg/mul_gamma5.c b/linalg/mul_gamma5.c
index 37c229b73..a8c77d1ac 100644
--- a/linalg/mul_gamma5.c
+++ b/linalg/mul_gamma5.c
@@ -28,7 +28,7 @@
 #ifdef HAVE_CONFIG_H
 # include<config.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include <stdlib.h>
@@ -38,7 +38,7 @@
 #include "mul_r.h"
 
 void mul_gamma5(spinor * const R, const int N){
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -46,7 +46,7 @@ void mul_gamma5(spinor * const R, const int N){
   int ix;
   spinor *r;
   
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for (ix = 0; ix < N; ix++){
@@ -60,7 +60,7 @@ void mul_gamma5(spinor * const R, const int N){
     r->s3.c1 = -1.0*r->s3.c1;
     r->s3.c2 = -1.0*r->s3.c2;
   }
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /*OpenMP closing brace */
 #endif
 
diff --git a/linalg/set_even_to_zero.c b/linalg/set_even_to_zero.c
index 1cebe3eb3..f0e39ac97 100644
--- a/linalg/set_even_to_zero.c
+++ b/linalg/set_even_to_zero.c
@@ -26,7 +26,7 @@
 #ifdef MPI
 # include <mpi.h>
 #endif
-#ifdef OMP
+#ifdef TM_USE_OMP
 # include <omp.h>
 #endif
 #include "global.h"
@@ -34,7 +34,7 @@
 #include "set_even_to_zero.h"
 
 void set_even_to_zero(spinor * const P) {
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp parallel
   {
 #endif
@@ -42,7 +42,7 @@ void set_even_to_zero(spinor * const P) {
   int x, y, z, t, i, ix;
   spinor * p = NULL;
 
-#ifdef OMP
+#ifdef TM_USE_OMP
 #pragma omp for
 #endif
   for(x = 0; x < LX; x++) {
@@ -78,7 +78,7 @@ void set_even_to_zero(spinor * const P) {
     }
   }
 
-#ifdef OMP
+#ifdef TM_USE_OMP
   } /*OpenMP closing brace */
 #endif
 
diff --git a/linalg/square_and_minmax.c b/linalg/square_and_minmax.c
new file mode 100644
index 000000000..0b1b5a41d
--- /dev/null
+++ b/linalg/square_and_minmax.c
@@ -0,0 +1,424 @@
+/***********************************************************************
+ * copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
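+ * File square_and_minmax.c
+ *
+ *   void square_and_minmax(double * const sum, double * const min, double * const max, const spinor * const P, const int N)
+ *     Stores the square norm of *P in *sum and the minimal and maximal local (per-site) square norm in *min and *max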
+ *
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#ifdef TM_USE_MPI
+# include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+# include <omp.h>
+# include "global.h"
+#endif
+#include <complex.h>
+#include "su3.h"
+#include "su3adj.h"
+#include "su3spinor.h"
+#include "square_and_minmax.h"
+/* Stores in sum the square norm of P over N spinors, in min and max the smallest and largest per-site square norm (min stays -1 for N == 0). */
+void square_and_minmax(double * const sum, double * const min, double * const max, const spinor * const P, const int N)
+{
+  int ix;
+  double ALIGN ks,kc,ds,tr,ts,tt;
+  spinor *s;
+  /* ks: running sum, kc: correction term of the compensated (Kahan-style) summation */
+  ks=0.0;
+  kc=0.0;
+  *max = 0.0;
+  *min = -1; /* negative value marks "no site seen yet", see the update in the loop */
+
+#if (defined BGL && defined XLC)
+  /* alignment assertion on the input field; the former arguments S and R do not exist in this function */
+  __alignx(16, P);
+#endif
+
+  for (ix = 0; ix < N; ix++)
+  {
+    s=(spinor *) P + ix;
+    /* ds: sum of the squared moduli of the 12 complex components of this site */
+    ds=s->s0.c0 * conj(s->s0.c0) + s->s0.c1 * conj(s->s0.c1) + s->s0.c2 * conj(s->s0.c2) +  
+      s->s1.c0 * conj(s->s1.c0) + s->s1.c1 * conj(s->s1.c1) + s->s1.c2 * conj(s->s1.c2) +  
+      s->s2.c0 * conj(s->s2.c0) + s->s2.c1 * conj(s->s2.c1) + s->s2.c2 * conj(s->s2.c2) +
+      s->s3.c0 * conj(s->s3.c0) + s->s3.c1 * conj(s->s3.c1) + s->s3.c2 * conj(s->s3.c2);
+
+    tr=ds + kc;
+    ts=tr + ks;
+    tt=ts-ks;
+    ks=ts;
+    kc=tr-tt;
+
+    if(ds > *max) *max = ds;
+    if(ds < *min || *min < 0) *min = ds;
+  }
+  kc=ks + kc;
+  *sum=kc;
+
+#if defined TM_USE_MPI
+  /* combine the per-process results over MPI_COMM_WORLD; kc is reused as receive buffer for min and max */
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min = kc;
+
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max = kc;
+
+#endif
+
+  return;
+}
+/* Like square_and_minmax, but for the site-wise ratio |P(x)|^2 / |Q(x)|^2: sum of the ratios over N sites plus their minimum and maximum. */
+void square_and_minmax_rel(double * const sum, double * const min, double * const max, const spinor * const P, const spinor * const Q, const int N)
+{
+  int ix;
+  double ALIGN ks,kc,ds,dr,tr,ts,tt;
+  spinor *s, *r;
+  /* ks: running sum, kc: correction term of the compensated (Kahan-style) summation */
+  ks=0.0;
+  kc=0.0;
+  *max = 0.0;
+  *min = -1;
+
+#if (defined BGL && defined XLC)
+  __alignx(16, P);
+  __alignx(16, Q);
+#endif
+
+  for (ix = 0; ix < N; ix++)
+  {
+    s=(spinor *) P + ix;
+    r=(spinor *) Q + ix;
+
+    ds=s->s0.c0 * conj(s->s0.c0) + s->s0.c1 * conj(s->s0.c1) + s->s0.c2 * conj(s->s0.c2) +  
+      s->s1.c0 * conj(s->s1.c0) + s->s1.c1 * conj(s->s1.c1) + s->s1.c2 * conj(s->s1.c2) +  
+      s->s2.c0 * conj(s->s2.c0) + s->s2.c1 * conj(s->s2.c1) + s->s2.c2 * conj(s->s2.c2) +
+      s->s3.c0 * conj(s->s3.c0) + s->s3.c1 * conj(s->s3.c1) + s->s3.c2 * conj(s->s3.c2);
+
+    dr=r->s0.c0 * conj(r->s0.c0) + r->s0.c1 * conj(r->s0.c1) + r->s0.c2 * conj(r->s0.c2) +  
+      r->s1.c0 * conj(r->s1.c0) + r->s1.c1 * conj(r->s1.c1) + r->s1.c2 * conj(r->s1.c2) +  
+      r->s2.c0 * conj(r->s2.c0) + r->s2.c1 * conj(r->s2.c1) + r->s2.c2 * conj(r->s2.c2) +
+      r->s3.c0 * conj(r->s3.c0) + r->s3.c1 * conj(r->s3.c1) + r->s3.c2 * conj(r->s3.c2);
+    /* NOTE(review): dr == 0 (Q vanishing on a site) is not guarded against */
+    ds = ds/dr;
+
+    tr=ds + kc;
+    ts=tr + ks;
+    tt=ts-ks;
+    ks=ts;
+    kc=tr-tt;
+
+    if(ds > *max) *max = ds;
+    if(ds < *min || *min < 0) *min = ds;
+  }
+  kc=ks + kc;
+  *sum=kc;
+
+#if defined TM_USE_MPI
+  /* combine the per-process results over MPI_COMM_WORLD; kc is reused as receive buffer for min and max */
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min = kc;
+
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max = kc;
+
+#endif
+
+  return;
+}
+/* Like square_and_minmax; additionally min_abs and max_abs receive the smallest and largest squared modulus of a single spinor component. */
+void square_and_minmax_abs(double * const sum, double * const min, double * const max,  double * const min_abs, double * const max_abs, const spinor * const P, const int N)
+{
+  int ix;
+  double ALIGN ks,kc,ds,dds,tr,ts,tt;
+  spinor *s;
+  /* ks: running sum, kc: correction term of the compensated (Kahan-style) summation */
+  ks=0.0;
+  kc=0.0;
+  *max = 0.0;
+  *min = -1;
+  *max_abs = 0.0;
+  *min_abs = -1;
+
+#if (defined BGL && defined XLC)
+  /* alignment assertion on the input field; the former arguments S and R do not exist in this function */
+  __alignx(16, P);
+#endif
+
+  for (ix = 0; ix < N; ix++)
+  {
+    s=(spinor *) P + ix;
+    /* per component: min and max are tested independently, so a value that raises the maximum can still set the minimum */
+    dds=s->s0.c0 * conj(s->s0.c0);
+    ds=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s0.c1 * conj(s->s0.c1);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s0.c2 * conj(s->s0.c2);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c0 * conj(s->s1.c0);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c1 * conj(s->s1.c1);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c2 * conj(s->s1.c2);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c0 * conj(s->s2.c0);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c1 * conj(s->s2.c1);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c2 * conj(s->s2.c2);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c0 * conj(s->s3.c0);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c1 * conj(s->s3.c1);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c2 * conj(s->s3.c2);
+    ds+=dds;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    tr=ds + kc;
+    ts=tr + ks;
+    tt=ts-ks;
+    ks=ts;
+    kc=tr-tt;
+
+    if(ds > *max) *max = ds;
+    if(ds < *min || *min < 0) *min = ds;
+  }
+  kc=ks + kc;
+  *sum=kc;
+
+#if defined TM_USE_MPI
+  /* combine the per-process results over MPI_COMM_WORLD; kc is reused as receive buffer for the extrema */
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min = kc;
+
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max = kc;
+
+  MPI_Allreduce(min_abs, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min_abs = kc;
+
+  MPI_Allreduce(max_abs, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max_abs = kc;
+
+#endif
+
+  return;
+}
+/* Like square_and_minmax_rel; additionally min_abs and max_abs receive the extrema of the component-wise ratio of squared moduli of P and Q. */
+void square_and_minmax_rel_abs(double * const sum, double * const min, double * const max, double * const min_abs, double * const max_abs, const spinor * const P, const spinor * const Q, const int N)
+{
+  int ix;
+  double ALIGN ks,kc,ds,dds,dr,ddr,tr,ts,tt;
+  spinor *s, *r;
+  /* ks: running sum, kc: correction term of the compensated (Kahan-style) summation */
+  ks=0.0;
+  kc=0.0;
+  *max = 0.0;
+  *min = -1;
+  *max_abs = 0.0;
+  *min_abs = -1;
+
+#if (defined BGL && defined XLC)
+  __alignx(16, P);
+  __alignx(16, Q);
+#endif
+
+  for (ix = 0; ix < N; ix++)
+  {
+    s=(spinor *) P + ix;
+    r=(spinor *) Q + ix;
+    /* per component: min and max are tested independently; NOTE(review): ddr == 0 is not guarded against */
+    dds=s->s0.c0 * conj(s->s0.c0);
+    ddr=r->s0.c0 * conj(r->s0.c0);
+    ds=dds;
+    dr=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s0.c1 * conj(s->s0.c1);
+    ddr=r->s0.c1 * conj(r->s0.c1);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s0.c2 * conj(s->s0.c2);
+    ddr=r->s0.c2 * conj(r->s0.c2);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c0 * conj(s->s1.c0);
+    ddr=r->s1.c0 * conj(r->s1.c0);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c1 * conj(s->s1.c1);
+    ddr=r->s1.c1 * conj(r->s1.c1);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s1.c2 * conj(s->s1.c2);
+    ddr=r->s1.c2 * conj(r->s1.c2);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c0 * conj(s->s2.c0);
+    ddr=r->s2.c0 * conj(r->s2.c0);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c1 * conj(s->s2.c1);
+    ddr=r->s2.c1 * conj(r->s2.c1);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s2.c2 * conj(s->s2.c2);
+    ddr=r->s2.c2 * conj(r->s2.c2);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c0 * conj(s->s3.c0);
+    ddr=r->s3.c0 * conj(r->s3.c0);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c1 * conj(s->s3.c1);
+    ddr=r->s3.c1 * conj(r->s3.c1);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    dds=s->s3.c2 * conj(s->s3.c2);
+    ddr=r->s3.c2 * conj(r->s3.c2);
+    ds+=dds;
+    dr+=ddr;
+    dds/=ddr;
+    if(dds > *max_abs) *max_abs = dds;
+    if(dds < *min_abs || *min_abs < 0) *min_abs = dds;
+
+    ds = ds/dr;
+
+    tr=ds + kc;
+    ts=tr + ks;
+    tt=ts-ks;
+    ks=ts;
+    kc=tr-tt;
+
+    if(ds > *max) *max = ds;
+    if(ds < *min || *min < 0) *min = ds;
+  }
+  kc=ks + kc;
+  *sum=kc;
+
+#if defined TM_USE_MPI
+  /* combine the per-process results over MPI_COMM_WORLD; kc is reused as receive buffer for the extrema */
+  MPI_Allreduce(&kc, sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+  MPI_Allreduce(min, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min = kc;
+
+  MPI_Allreduce(max, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max = kc;
+
+  MPI_Allreduce(min_abs, &kc, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+  *min_abs = kc;
+
+  MPI_Allreduce(max_abs, &kc, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+  *max_abs = kc;
+
+#endif
+
+  return;
+}
diff --git a/linalg/square_and_minmax.h b/linalg/square_and_minmax.h
new file mode 100644
index 000000000..9f2f85f67
--- /dev/null
+++ b/linalg/square_and_minmax.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _SQUARE_AND_MAX_H
+#define _SQUARE_AND_MAX_H
+
+#include "su3.h"
+
+/* void square_and_minmax(sum, min, max, P, N)
+ *     Stores the square norm of *P in *sum and the minimal and maximal local (per-site) square norm in *min and *max */
+
+/* void square_and_minmax_rel(sum, min, max, P, Q, N)
+ *     Same for the site-wise ratio |P(x)|^2 / |Q(x)|^2 of the local square norms of *P and *Q */
+
+void square_and_minmax(double * const sum, double * const min, double * const max, const spinor * const P, const int N);
+void square_and_minmax_rel(double * const sum, double * const min, double * const max, const spinor * const P,  const spinor * const Q, const int N);
+void square_and_minmax_abs(double * const sum, double * const min, double * const max, double * const min_abs, double * const max_abs, const spinor * const P, const int N);
+void square_and_minmax_rel_abs(double * const sum, double * const min, double * const max, double * const min_abs, double * const max_abs, const spinor * const P,  const spinor * const Q, const int N);
+
+
+
+#endif
+
+
+
diff --git a/linalg_eo.h b/linalg_eo.h
index 020f0483a..2bba98c4f 100644
--- a/linalg_eo.h
+++ b/linalg_eo.h
@@ -30,6 +30,7 @@
 #include "linalg/scalar_prod_r_32.h"
 #include "linalg/scalar_prod_i.h"
 #include "linalg/square_and_prod_r.h"
+#include "linalg/square_and_minmax.h"
 #include "linalg/assign_add_mul_r.h"
 #include "linalg/assign_add_mul_r_32.h"
 #include "linalg/assign_mul_bra_add_mul_r.h"
@@ -66,7 +67,7 @@
 #include "linalg/mattimesvec.h"
 
 #include "linalg/convert_eo_to_lexic.h"
-
+#include "linalg/convert_even_to_lexic.h"
 #include "linalg/convert_odd_to_lexic.h"
 #include "linalg/set_even_to_zero.h"
 #include "linalg/mul_gamma5.h"
diff --git a/meas/correlators.c b/meas/correlators.c
index f01180263..1185594ba 100644
--- a/meas/correlators.c
+++ b/meas/correlators.c
@@ -240,8 +240,8 @@ void correlators_measurement(const int traj, const int id, const int ieo) {
 #else
       free(Cpp); free(Cpa); free(Cp4);
 #endif
-    }
-  } 
+    } // for(max_time_slices)
+  } // for(max_samples)
   etime = gettime();
   if(g_proc_id == 0 && g_debug_level > 0) {
     printf("ONLINE: measurement done int t/s = %1.4e\n", etime - atime);
diff --git a/monomial/cloverdetratio_rwmonomial.c b/monomial/cloverdetratio_rwmonomial.c
index 6653a047b..fc95575ef 100644
--- a/monomial/cloverdetratio_rwmonomial.c
+++ b/monomial/cloverdetratio_rwmonomial.c
@@ -45,7 +45,10 @@
 #include "monomial/monomial.h"
 #include "boundary.h"
 #include "cloverdetratio_rwmonomial.h"
-
+#include "expo.h"
+#include "xchange/xchange.h"
+#include "init/init_gauge_tmp.h"
+#include "DDalphaAMG_interface.h"
 
 double cloverdetratio_rwacc(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
diff --git a/monomial/monomial.c b/monomial/monomial.c
index ba206a934..ced50be24 100644
--- a/monomial/monomial.c
+++ b/monomial/monomial.c
@@ -144,7 +144,7 @@ int add_monomial(const int type) {
   monomial_list[no_monomials].rat.crange[1] = 11;
 
   monomial_list[no_monomials].initialised = 1;
-  if(monomial_list[no_monomials].type == NDDETRATIO || monomial_list[no_monomials].type == CLOVERDETRATIORW) {
+  if(monomial_list[no_monomials].type == NDDETRATIO || monomial_list[no_monomials].type == NDCLOVERDETRATIO || monomial_list[no_monomials].type == CLOVERDETRATIORW) {
     monomial_list[no_monomials].timescale = -5;
   }
 
@@ -159,10 +159,13 @@ int init_monomials(const int V, const int even_odd_flag) {
   spinor * __pf = NULL;
   double sw_mu=0., sw_k=0., sw_c=0.;
   double swn_mubar=0., swn_epsbar = 0., swn_k=0., swn_c=0.;
+
+  if (g_exposu3_no_c == 0) init_exposu3();
+  
   for(int i = 0; i < no_monomials; i++) {
     if((monomial_list[i].type != GAUGE) && (monomial_list[i].type != SFGAUGE)) no++;
     /* non-degenerate monomials need two pseudo fermion fields */
-    if((monomial_list[i].type == NDPOLY) || (monomial_list[i].type == NDDETRATIO) || 
+    if((monomial_list[i].type == NDPOLY) || (monomial_list[i].type == NDDETRATIO) || (monomial_list[i].type == NDCLOVERDETRATIO) || 
        (monomial_list[i].type == NDCLOVER) || (monomial_list[i].type == NDRAT)||
        (monomial_list[i].type == NDRATCOR) || (monomial_list[i].type == NDCLOVERRATCOR) ||
        (monomial_list[i].type == NDCLOVERRAT)) no++;
@@ -460,6 +463,17 @@ int init_monomials(const int V, const int even_odd_flag) {
 	  printf("# Initialised monomial of type NDDETRATIO, no_monomials= %d, currently only available for reweighting!\n", no_monomials);
 	}
       }
+      else if(monomial_list[i].type == NDCLOVERDETRATIO) {
+	monomial_list[i].hbfunction = &dummy_heatbath;
+	monomial_list[i].accfunction = &nddetratio_acc;
+	monomial_list[i].derivativefunction = NULL;
+	monomial_list[i].pf2 = __pf+no*V;
+	monomial_list[i].timescale = -5;
+	no++;
+	if(g_proc_id == 0 && g_debug_level > 1) {
+	  printf("# Initialised monomial of type NDCLOVERDETRATIO, no_monomials= %d, currently only available for reweighting!\n", no_monomials);
+	}
+      }
     }
     else {
       monomial_list[i].pf = NULL;
diff --git a/monomial/monomial.h b/monomial/monomial.h
index c2321956f..00c25a1c8 100644
--- a/monomial/monomial.h
+++ b/monomial/monomial.h
@@ -50,6 +50,7 @@
 #define CLOVERRAT 19
 #define CLOVERRATCOR 20
 #define CLOVERDETRATIORW 21
+#define NDCLOVERDETRATIO 22
 
 #define max_no_monomials 30
 
diff --git a/monomial/nddetratio_monomial.c b/monomial/nddetratio_monomial.c
index 81f96cfc1..773817599 100644
--- a/monomial/nddetratio_monomial.c
+++ b/monomial/nddetratio_monomial.c
@@ -38,6 +38,8 @@
 #include "operator/Hopping_Matrix.h"
 #include "phmc.h"
 #include "boundary.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
 #include "gamma.h"
 #include "operator/tm_operators_nd.h"
 #include "chebyshev_polynomial_nd.h"
@@ -47,6 +49,7 @@
 #include "monomial/monomial.h"
 #include "hamiltonian_field.h"
 #include "nddetratio_monomial.h"
+#include "DDalphaAMG_interface.h"
 
 
 
@@ -54,24 +57,43 @@ double nddetratio_acc(const int id, hamiltonian_field_t * const hf) {
   int iter;
   monomial * mnl = &monomial_list[id];
   double atime, etime;
+  matrix_mult_nd Q_pm_ndpsi = Qtm_pm_ndpsi, Q_dagger_ndpsi = Qtm_dagger_ndpsi, Q_ndpsi = Qtm_ndpsi;
   atime = gettime();
   
   g_mubar = mnl->mubar;
   g_epsbar = mnl->epsbar;
   boundary(mnl->kappa);
 
-  iter = cg_her_nd(mnl->w_fields[0], mnl->w_fields[1], mnl->pf, mnl->pf2,
-		   mnl->maxiter, mnl->accprec, g_relative_precision_flag, 
-		   VOLUME/2, &Qtm_pm_ndpsi);
-  Qtm_dagger_ndpsi(mnl->w_fields[2], mnl->w_fields[3],
-			mnl->w_fields[0], mnl->w_fields[1]);
+  if(mnl->type == NDCLOVERDETRATIO) {
+    Q_pm_ndpsi = Qsw_pm_ndpsi;
+    Q_dagger_ndpsi = Qsw_dagger_ndpsi;
+    Q_ndpsi = Qsw_ndpsi;
+    init_sw_fields();
+    sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); 
+    sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+  }
+  if( mnl->solver == MG ) {
+    iter = MG_solver_nd(mnl->w_fields[2], mnl->w_fields[3], mnl->pf, mnl->pf2,
+                        mnl->accprec, mnl->maxiter, g_relative_precision_flag, 
+                        VOLUME/2, g_gauge_field, Q_ndpsi);
+  } else {
+    iter = cg_her_nd(mnl->w_fields[0], mnl->w_fields[1], mnl->pf, mnl->pf2,
+                     mnl->maxiter, mnl->accprec, g_relative_precision_flag, 
+                     VOLUME/2, Q_pm_ndpsi);
+    Q_dagger_ndpsi(mnl->w_fields[2], mnl->w_fields[3],
+                   mnl->w_fields[0], mnl->w_fields[1]);
+  }
 
   g_mubar = mnl->mubar2;
   g_epsbar = mnl->epsbar2;
   boundary(mnl->kappa2);
 
-  Qtm_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
-		  mnl->w_fields[2], mnl->w_fields[3]);
+  if(mnl->type == NDCLOVERDETRATIO) {
+    sw_term((const su3**) hf->gaugefield, mnl->kappa2, mnl->c_sw); 
+    sw_invert_nd(mnl->mubar2*mnl->mubar2 - mnl->epsbar2*mnl->epsbar2);
+  }
+  Q_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
+            mnl->w_fields[2], mnl->w_fields[3]);
   
   mnl->energy1  = scalar_prod_r(mnl->pf , mnl->w_fields[0], VOLUME/2, 1);
   mnl->energy1 += scalar_prod_r(mnl->pf2, mnl->w_fields[1], VOLUME/2, 1);
diff --git a/monomial/ndrat_monomial.c b/monomial/ndrat_monomial.c
index ddf76c7cf..d3a84299b 100644
--- a/monomial/ndrat_monomial.c
+++ b/monomial/ndrat_monomial.c
@@ -47,6 +47,7 @@
 #include "rational/rational.h"
 #include "phmc.h"
 #include "ndrat_monomial.h"
+#include "default_input_values.h"
 
 void nd_set_global_parameter(monomial * const mnl) {
 
@@ -57,7 +58,7 @@ void nd_set_global_parameter(monomial * const mnl) {
   boundary(g_kappa);
   phmc_cheb_evmin = mnl->EVMin;
   phmc_invmaxev = mnl->EVMaxInv;
-  phmc_cheb_evmax = 1.;
+  phmc_cheb_evmax = mnl->EVMax;
   phmc_Cpol = 1.;
   // used for preconditioning in cloverdetrat
   g_mu3 = 0.;
@@ -99,7 +100,6 @@ void ndrat_derivative(const int id, hamiltonian_field_t * const hf) {
   mnl->solver_params.shifts = mnl->rat.mu;
   mnl->solver_params.rel_prec = g_relative_precision_flag;
   mnl->solver_params.type = mnl->solver; 
-
   mnl->solver_params.M_ndpsi = &Qtm_pm_ndpsi;
   mnl->solver_params.M_ndpsi32 = &Qtm_pm_ndpsi_32;    
   if(mnl->type == NDCLOVERRAT) {
@@ -107,10 +107,11 @@ void ndrat_derivative(const int id, hamiltonian_field_t * const hf) {
     mnl->solver_params.M_ndpsi32 = &Qsw_pm_ndpsi_32;
   }
   mnl->solver_params.sdim = VOLUME/2;
+
   // this generates all X_j,o (odd sites only) -> g_chi_up|dn_spinor_field
   mnl->iter1 += solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
                              mnl->pf, mnl->pf2, &(mnl->solver_params) );
-  
+
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     if(mnl->type == NDCLOVERRAT) {
       // multiply with Q_h * tau^1 + i mu_j to get Y_j,o (odd sites)
@@ -229,28 +230,16 @@ void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
   }
   mnl->solver_params.sdim = VOLUME/2;
   mnl->solver_params.rel_prec = g_relative_precision_flag;
-  mnl->iter0 = solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
-                            mnl->pf, mnl->pf2, &(mnl->solver_params) );
+  mnl->iter0 = solve_mms_nd_plus(g_chi_up_spinor_field, g_chi_dn_spinor_field,
+                                 mnl->pf, mnl->pf2, &(mnl->solver_params) );
 
   assign(mnl->w_fields[2], mnl->pf, VOLUME/2);
   assign(mnl->w_fields[3], mnl->pf2, VOLUME/2);
 
   // apply C to the random field to generate pseudo-fermion fields
   for(int j = (mnl->rat.np-1); j > -1; j--) {
-    // Q_h * tau^1 - i nu_j
-    // this needs phmc_Cpol = 1 to work!
-    if(mnl->type == NDCLOVERRAT) {
-      Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np],
-			       g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], 
-			       I*mnl->rat.nu[j], 1., mnl->EVMaxInv);
-    }
-    else {
-      Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np],
-			     g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], 
-			     I*mnl->rat.nu[j], 1., mnl->EVMaxInv);
-    }
-    assign_add_mul(mnl->pf, g_chi_up_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2);
-    assign_add_mul(mnl->pf2, g_chi_dn_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2);
+      assign_add_mul(mnl->pf, g_chi_up_spinor_field[j], I*mnl->rat.rnu[j], VOLUME/2);
+      assign_add_mul(mnl->pf2, g_chi_dn_spinor_field[j], I*mnl->rat.rnu[j], VOLUME/2);
   }
 
   etime = gettime();
@@ -293,7 +282,7 @@ double ndrat_acc(const int id, hamiltonian_field_t * const hf) {
   mnl->solver_params.sdim = VOLUME/2;
   mnl->solver_params.rel_prec = g_relative_precision_flag;
   mnl->iter0 += solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field,
-                             mnl->pf, mnl->pf2, &(mnl->solver_params) );
+                            mnl->pf, mnl->pf2, &(mnl->solver_params) );
 
   // apply R to the pseudo-fermion fields
   assign(mnl->w_fields[0], mnl->pf, VOLUME/2);
@@ -322,26 +311,40 @@ double ndrat_acc(const int id, hamiltonian_field_t * const hf) {
 
 int init_ndrat_monomial(const int id) {
   monomial * mnl = &monomial_list[id];  
+  int scale = 0;
 
-  mnl->EVMin = mnl->StildeMin / mnl->StildeMax;
-  mnl->EVMax = 1.;
-  mnl->EVMaxInv = 1./(sqrt(mnl->StildeMax));
+  if(mnl->type == RAT || mnl->type == CLOVERRAT ||
+     mnl->type == RATCOR || mnl->type == CLOVERRATCOR) 
+    scale = 1;
+
+  if(scale) {
+    // When scale = 1
+    //   the rational approximation is done for the standard operator,
+    //   which has eigenvalues between EVMin and EVMax. The parameters
+    //   of the rational approximation are scaled instead, so no
+    //   additional scaling of the operator is required (EVMaxInv = 1).
+    mnl->EVMin = mnl->StildeMin;
+    mnl->EVMax = mnl->StildeMax;
+    mnl->EVMaxInv = 1.;
+  } else {
+    // When scale = 0
+    //   the rational approximation is done for the normalized operator,
+    //   which has eigenvalues between StildeMin/StildeMax and 1. Thus the
+    //   operator needs to be scaled by EVMaxInv = 1/sqrt(StildeMax).
+    mnl->EVMin = mnl->StildeMin / mnl->StildeMax;
+    mnl->EVMax = 1.;
+    mnl->EVMaxInv = 1./sqrt(mnl->StildeMax);
+  }
+
+  init_rational(&mnl->rat, scale);
 
   if(mnl->type == RAT || mnl->type == CLOVERRAT ||
      mnl->type == RATCOR || mnl->type == CLOVERRATCOR) {
-    init_rational(&mnl->rat, 1);
-
     if(init_chi_spinor_field(VOLUMEPLUSRAND/2, (mnl->rat.np+2)/2) != 0) {
       fprintf(stderr, "Not enough memory for Chi fields! Aborting...\n");
       exit(0);
     }
-  }
-  else {
-    init_rational(&mnl->rat, 0);
-    mnl->EVMin = mnl->StildeMin / mnl->StildeMax;
-    mnl->EVMax = 1.;
-    mnl->EVMaxInv = 1./(sqrt(mnl->StildeMax));
-    
+  } else {
     if(init_chi_spinor_field(VOLUMEPLUSRAND/2, (mnl->rat.np+1)) != 0) {
       fprintf(stderr, "Not enough memory for Chi fields! Aborting...\n");
       exit(0);
diff --git a/monomial/ndratcor_monomial.c b/monomial/ndratcor_monomial.c
index 481e4bb08..e0f7aaf55 100644
--- a/monomial/ndratcor_monomial.c
+++ b/monomial/ndratcor_monomial.c
@@ -81,7 +81,7 @@ void ndratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
   }
   // we measure before the trajectory!
   if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) {
-    if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi);
+    if(mnl->type != NDCLOVERRATCOR) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi);
     else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi);
   }
 
@@ -233,7 +233,6 @@ double ndratcor_acc(const int id, hamiltonian_field_t * const hf) {
     up1 = tup; dn1 = tdn;
   }
 
-
   etime = gettime();
   if(g_proc_id == 0) {
     if(g_debug_level > 1) {
diff --git a/monomial/rat_monomial.c b/monomial/rat_monomial.c
index 6d419be57..c9d807204 100644
--- a/monomial/rat_monomial.c
+++ b/monomial/rat_monomial.c
@@ -55,7 +55,7 @@
 
 void rat_derivative(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
-  double atime, etime, dummy;
+  double atime, etime;
   atime = gettime();
   mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_kappa = mnl->kappa;
@@ -85,12 +85,12 @@ void rat_derivative(const int id, hamiltonian_field_t * const hf) {
   mnl->solver_params.no_shifts = mnl->rat.np;
   mnl->solver_params.shifts = mnl->rat.mu;
   mnl->solver_params.rel_prec = g_relative_precision_flag;
-  mnl->solver_params.type = CGMMS;
+  mnl->solver_params.type = mnl->solver;
   mnl->solver_params.M_psi = mnl->Qsq;
   mnl->solver_params.sdim = VOLUME/2;
   // this generates all X_j,o (odd sites only) -> g_chi_up_spinor_field
-  mnl->iter1 += solve_mshift_oneflavour(g_chi_up_spinor_field, mnl->pf,
-                                        &(mnl->solver_params) );
+  mnl->iter1 += solve_mms_tm(g_chi_up_spinor_field, mnl->pf,
+                             &(mnl->solver_params) );
   
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     mnl->Qp(mnl->w_fields[0], g_chi_up_spinor_field[j]);
@@ -148,7 +148,7 @@ void rat_derivative(const int id, hamiltonian_field_t * const hf) {
 
 void rat_heatbath(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
-  double atime, etime, dummy;
+  double atime, etime;
   atime = gettime();
   mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_kappa = mnl->kappa;
@@ -181,12 +181,13 @@ void rat_heatbath(const int id, hamiltonian_field_t * const hf) {
   mnl->solver_params.squared_solver_prec = mnl->accprec;
   mnl->solver_params.no_shifts = mnl->rat.np;
   mnl->solver_params.shifts = mnl->rat.nu;
-  mnl->solver_params.type = CGMMS;
+  mnl->solver_params.type = mnl->solver;
   mnl->solver_params.M_psi = mnl->Qsq;
   mnl->solver_params.sdim = VOLUME/2;
   mnl->solver_params.rel_prec = g_relative_precision_flag;
-  mnl->iter0 = solve_mshift_oneflavour(g_chi_up_spinor_field, mnl->pf,
-			               &(mnl->solver_params) );
+
+  mnl->iter0 = solve_mms_tm(g_chi_up_spinor_field, mnl->pf,
+                            &(mnl->solver_params) );
 
   assign(mnl->w_fields[2], mnl->pf, VOLUME/2);
 
@@ -214,7 +215,7 @@ void rat_heatbath(const int id, hamiltonian_field_t * const hf) {
 
 double rat_acc(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
-  double atime, etime, dummy;
+  double atime, etime;
   atime = gettime();
   mnl_backup_restore_globals(TM_BACKUP_GLOBALS);
   g_kappa = mnl->kappa;
@@ -233,12 +234,12 @@ double rat_acc(const int id, hamiltonian_field_t * const hf) {
   mnl->solver_params.squared_solver_prec = mnl->accprec;
   mnl->solver_params.no_shifts = mnl->rat.np;
   mnl->solver_params.shifts = mnl->rat.mu;
-  mnl->solver_params.type = CGMMS;
+  mnl->solver_params.type = mnl->solver;
   mnl->solver_params.M_psi = mnl->Qsq;
   mnl->solver_params.sdim = VOLUME/2;
   mnl->solver_params.rel_prec = g_relative_precision_flag;
-  mnl->iter0 += solve_mshift_oneflavour(g_chi_up_spinor_field, mnl->pf,
-			                &(mnl->solver_params) );
+  mnl->iter0 += solve_mms_tm(g_chi_up_spinor_field, mnl->pf,
+                             &(mnl->solver_params) );
 
   // apply R to the pseudo-fermion fields
   assign(mnl->w_fields[0], mnl->pf, VOLUME/2);
diff --git a/monomial/ratcor_monomial.c b/monomial/ratcor_monomial.c
index b29f11f11..fb725e877 100644
--- a/monomial/ratcor_monomial.c
+++ b/monomial/ratcor_monomial.c
@@ -61,7 +61,8 @@ void ratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
   double atime, etime, delta;
   spinor * up0, * up1, * tup;
-  double coefs[6] = {1./4., -3./32., 7./128., -77./2048., 231./8192., -1463./65536.};
+  double coefs[6] = {1./4., -3./32., 7./128., -77./2048., 231./8192., -1463./65536.}; // series of (1+x)^(1/4)
+  double coefs_check[6] = {1./2., -1./8., 1./16., -5./128., 7./256., -21./1024.}; // series of (1+x)^(1/2)
   atime = gettime();
   nd_set_global_parameter(mnl);
   g_mu = 0.;
@@ -90,7 +91,7 @@ void ratcor_heatbath(const int id, hamiltonian_field_t * const hf) {
   mnl->solver_params.squared_solver_prec = mnl->accprec;
   mnl->solver_params.no_shifts = mnl->rat.np;
   mnl->solver_params.shifts = mnl->rat.mu;
-  mnl->solver_params.type = CGMMS;
+  mnl->solver_params.type = mnl->solver;
   mnl->solver_params.M_psi = mnl->Qsq;
   mnl->solver_params.sdim = VOLUME/2;
   mnl->solver_params.rel_prec = g_relative_precision_flag;
@@ -185,8 +186,8 @@ double apply_Z_psi(spinor * const k_up,	spinor * const l_up,
 		     solver_params_t * solver_params) {
   monomial * mnl = &monomial_list[id];
 
-  mnl->iter0 += solve_mshift_oneflavour(g_chi_up_spinor_field, l_up,
-                                        solver_params);  
+  mnl->iter0 += solve_mms_tm(g_chi_up_spinor_field, l_up,
+                             solver_params);  
   
   // apply R to the pseudo-fermion fields
   assign(k_up, l_up, VOLUME/2);
@@ -196,8 +197,9 @@ double apply_Z_psi(spinor * const k_up,	spinor * const l_up,
   }
 
   // apply R a second time
-  solve_mshift_oneflavour(g_chi_up_spinor_field, k_up,
-                          solver_params);
+  mnl->iter0 += solve_mms_tm(g_chi_up_spinor_field, k_up,
+                             solver_params);
+
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     assign_add_mul_r(k_up, g_chi_up_spinor_field[j], 
 		     mnl->rat.rmu[j], VOLUME/2);
@@ -223,7 +225,8 @@ void check_C_psi(spinor * const k_up, spinor * const l_up,
 		 const int id, hamiltonian_field_t * const hf,
 		 solver_params_t * solver_params) {
   monomial * mnl = &monomial_list[id];
-  mnl->iter0 = solve_mshift_oneflavour(g_chi_up_spinor_field, l_up, solver_params);
+
+  mnl->iter0 = solve_mms_tm(g_chi_up_spinor_field, l_up, solver_params);
 
   assign(k_up, l_up, VOLUME/2);
 
@@ -242,15 +245,17 @@ void check_C_psi(spinor * const k_up, spinor * const l_up,
   }
   //apply R
   solver_params->shifts = mnl->rat.mu;
-  solve_mshift_oneflavour(g_chi_up_spinor_field, k_up,
-                          solver_params);
+  mnl->iter0 += solve_mms_tm(g_chi_up_spinor_field, k_up,
+                             solver_params);
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     assign_add_mul_r(k_up, g_chi_up_spinor_field[j], 
 		     mnl->rat.rmu[j], VOLUME/2);
   }
   // apply C^dagger
   solver_params->shifts = mnl->rat.nu;
-  solve_mshift_oneflavour(g_chi_up_spinor_field, k_up, solver_params);
+  mnl->iter0 += solve_mms_tm(g_chi_up_spinor_field, k_up,
+	    solver_params);
+
   for(int j = (mnl->rat.np-1); j > -1; j--) {
     if(mnl->type == NDCLOVERRATCOR || mnl->type == NDCLOVERRAT) {
       //Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np],
diff --git a/operator.c b/operator.c
index eebaccd48..7a7196727 100644
--- a/operator.c
+++ b/operator.c
@@ -421,6 +421,10 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
                                               optr->solver, optr->rel_prec,
                                               optr->solver_params, optr->external_inverter, 
                                               optr->sloppy_precision, optr->compression_type);
+        // checking solution
+        M_full_ndpsi( g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2],
+                      g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4],
+                      optr->prop0, optr->prop1, optr->prop2, optr->prop3 );
       }
       else {
         optr->iterations = invert_cloverdoublet_eo( optr->prop0, optr->prop1, optr->prop2, optr->prop3,
@@ -429,27 +433,12 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
                                                     optr->solver, optr->rel_prec,
                                                     optr->solver_params, optr->external_inverter, 
                                                     optr->sloppy_precision, optr->compression_type);
+        // checking solution
+        Msw_full_ndpsi( g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2],
+                        g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4],
+                        optr->prop0, optr->prop1, optr->prop2, optr->prop3 );
       }
-      g_mu = optr->mubar;
-      if(optr->type != DBCLOVER) {
-        M_full(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], optr->prop0, optr->prop1);
-      }
-      else {
-        Msw_full(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], optr->prop0, optr->prop1);
-      }
-      assign_add_mul_r(g_spinor_field[DUM_DERI+1], optr->prop2, -optr->epsbar, VOLUME/2);
-      assign_add_mul_r(g_spinor_field[DUM_DERI+2], optr->prop3, -optr->epsbar, VOLUME/2);
-    
-      g_mu = -g_mu;
-      if(optr->type != DBCLOVER) {
-        M_full(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4], optr->prop2, optr->prop3);
-      }
-      else {
-        Msw_full(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4], optr->prop2, optr->prop3);
-      }
-      assign_add_mul_r(g_spinor_field[DUM_DERI+3], optr->prop0, -optr->epsbar, VOLUME/2);
-      assign_add_mul_r(g_spinor_field[DUM_DERI+4], optr->prop1, -optr->epsbar, VOLUME/2);
-
+ 
       diff(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], optr->sr0, VOLUME/2); 
       diff(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI+2], optr->sr1, VOLUME/2); 
       diff(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+3], optr->sr2, VOLUME/2); 
diff --git a/operator/tm_operators_nd.c b/operator/tm_operators_nd.c
index 3f437a17e..73ba9132e 100644
--- a/operator/tm_operators_nd.c
+++ b/operator/tm_operators_nd.c
@@ -37,6 +37,7 @@
 #include "phmc.h"
 #include "gamma.h"
 #include "linalg_eo.h"
+#include "operator/D_psi.h"
 #include "operator/tm_operators.h"
 #include "operator/clovertm_operators.h"
 #include "operator/tm_operators_nd.h"
@@ -52,6 +53,74 @@ void M_oo_sub_g5_ndpsi(spinor * const l_s, spinor * const l_c,
 
 /* external functions */
 
+
+/******************************************
+ *
+ * This is the implementation of
+ *
+ *  M_full_ndpsi = D_w I_f + i gamma5 mubar tau3 - epsbar tau1
+ *  the full (even + odd) two-flavour operator, used to check solutions
+ ******************************************/
+void M_full_ndpsi(spinor * const Even_new_s, spinor * const Odd_new_s, 
+                  spinor * const Even_new_c, spinor * const Odd_new_c, 
+                  spinor * const Even_s, spinor * const Odd_s,
+                  spinor * const Even_c, spinor * const Odd_c) {
+  /* _s flavour: g_mu is set to +g_mubar for this call (M_full presumably reads the global g_mu) */
+  double mu = g_mu;
+  g_mu = g_mubar;
+  M_full(Even_new_s, Odd_new_s, Even_s, Odd_s);
+  /* -epsbar tau1 term: -g_epsbar and the other-flavour (_c) input go to assign_add_mul_r */
+  assign_add_mul_r(Even_new_s, Even_c, -g_epsbar, VOLUME/2);
+  assign_add_mul_r(Odd_new_s, Odd_c, -g_epsbar, VOLUME/2);
+  /* _c flavour: same call with the sign of g_mu flipped (tau3 term) */
+  g_mu = -g_mu;
+  M_full(Even_new_c, Odd_new_c, Even_c, Odd_c);
+  /* -epsbar tau1 term, this time with the _s input */
+  assign_add_mul_r(Even_new_c, Even_s, -g_epsbar, VOLUME/2);
+  assign_add_mul_r(Odd_new_c, Odd_s, -g_epsbar, VOLUME/2);
+  /* put back the value g_mu had on entry */
+  g_mu = mu;
+}
+
+void Msw_full_ndpsi(spinor * const Even_new_s, spinor * const Odd_new_s, 
+                    spinor * const Even_new_c, spinor * const Odd_new_c, 
+                    spinor * const Even_s, spinor * const Odd_s,
+                    spinor * const Even_c, spinor * const Odd_c) {
+  /* Same sequence as M_full_ndpsi, but built on Msw_full; g_mu is saved here and restored on exit */
+  double mu = g_mu;
+  g_mu = g_mubar;
+  Msw_full(Even_new_s, Odd_new_s, Even_s, Odd_s);
+  /* flavour mixing: -g_epsbar and the other-flavour (_c) input go to assign_add_mul_r */
+  assign_add_mul_r(Even_new_s, Even_c, -g_epsbar, VOLUME/2);
+  assign_add_mul_r(Odd_new_s, Odd_c, -g_epsbar, VOLUME/2);
+  /* _c flavour: same call with the sign of g_mu flipped */
+  g_mu = -g_mu;
+  Msw_full(Even_new_c, Odd_new_c, Even_c, Odd_c);
+  /* flavour mixing, this time with the _s input */
+  assign_add_mul_r(Even_new_c, Even_s, -g_epsbar, VOLUME/2);
+  assign_add_mul_r(Odd_new_c, Odd_s, -g_epsbar, VOLUME/2);
+  /* put back the value g_mu had on entry */
+  g_mu = mu;
+}
+
+// full VOLUME operator; it uses D_psi, which works with tm and tm+clover
+void D_ndpsi(spinor * const l_strange, spinor * const l_charm,
+             spinor * const k_strange, spinor * const k_charm) {
+  /* g_mu is overwritten below and restored before returning */
+  double mu = g_mu;
+  g_mu = g_mubar;
+  D_psi(l_strange,k_strange);
+  /* flavour mixing: -g_epsbar and the charm input go to assign_add_mul_r over VOLUME sites */
+  assign_add_mul_r(l_strange, k_charm, -g_epsbar, VOLUME);
+  /* charm component: same call with the sign of g_mu flipped */
+  g_mu = -g_mu;
+  D_psi(l_charm,k_charm);
+  /* flavour mixing, this time with the strange input */
+  assign_add_mul_r(l_charm, k_strange, -g_epsbar, VOLUME);
+  /* put back the value g_mu had on entry */
+  g_mu = mu;
+}
+
 /******************************************
  *
  * This is the implementation of
@@ -110,6 +179,63 @@ void Qsw_ndpsi(spinor * const l_strange, spinor * const l_charm,
   return;
 }
 
+/******************************************
+ *
+ * This is the implementation of 
+ *
+ *  Q{tm,sw}_tau1_ndpsi_add/sub_Ishift =  ( M +/- I z_k )
+ *
+ *  with M = Qhat(2x2) tau_1   and z_k = sqrt(g_shift)
+ *
+ *
+ *  needed in the evaluation of the heatbath when 
+ *  the Rational approximation is used
+ *
+ *
+ * For details, see documentation and comments of the
+ * above mentioned routines
+ *
+ * k_charm and k_strange are the input fields
+ * l_* the output fields
+ *
+ * it acts only on the odd part or only
+ * on a half spinor
+ ******************************************/
+
+
+void Qtm_tau1_ndpsi_add_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange, spinor * const k_charm) {
+  /* "add" variant: the constant handed to Q_tau1_sub_const_ndpsi is -I*sqrt(g_shift) */
+  const _Complex double minus_i_z = -I*sqrt(g_shift);
+
+  Q_tau1_sub_const_ndpsi(l_strange, l_charm, k_strange, k_charm, minus_i_z, 1., phmc_invmaxev);
+}
+
+void Qtm_tau1_ndpsi_sub_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange, spinor * const k_charm) {
+  /* "sub" variant: the constant handed to Q_tau1_sub_const_ndpsi is +I*sqrt(g_shift) */
+  const _Complex double plus_i_z = I*sqrt(g_shift);
+
+  Q_tau1_sub_const_ndpsi(l_strange, l_charm, k_strange, k_charm, plus_i_z, 1., phmc_invmaxev);
+}
+
+void Qsw_tau1_ndpsi_add_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange, spinor * const k_charm) {
+  /* "add" variant: the constant handed to Qsw_tau1_sub_const_ndpsi is -I*sqrt(g_shift) */
+  const _Complex double minus_i_z = -I*sqrt(g_shift);
+
+  Qsw_tau1_sub_const_ndpsi(l_strange, l_charm, k_strange, k_charm, minus_i_z, 1., phmc_invmaxev);
+}
+
+void Qsw_tau1_ndpsi_sub_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange, spinor * const k_charm) {
+  /* "sub" variant: the constant handed to Qsw_tau1_sub_const_ndpsi is +I*sqrt(g_shift) */
+  const _Complex double plus_i_z = I*sqrt(g_shift);
+
+  Qsw_tau1_sub_const_ndpsi(l_strange, l_charm, k_strange, k_charm, plus_i_z, 1., phmc_invmaxev);
+}
+
+
 /******************************************
  *
  * This is the implementation of
@@ -237,6 +363,14 @@ void Qtm_pm_ndpsi(spinor * const l_strange, spinor * const l_charm,
   return;
 }
 
+void Qtm_pm_ndpsi_shift(spinor * const l_strange, spinor * const l_charm,
+                        spinor * const k_strange, spinor * const k_charm) {
+  /* Qtm_pm_ndpsi first, then g_shift * k per flavour via assign_add_mul_r (presumably l += g_shift*k) */
+  Qtm_pm_ndpsi(l_strange, l_charm, k_strange, k_charm);
+  assign_add_mul_r(l_strange, k_strange, g_shift, VOLUME/2);
+  assign_add_mul_r(l_charm, k_charm, g_shift, VOLUME/2);
+}
+
 void Qsw_pm_ndpsi(spinor * const l_strange, spinor * const l_charm,
 		  spinor * const k_strange, spinor * const k_charm) {
 
@@ -284,6 +418,15 @@ void Qsw_pm_ndpsi(spinor * const l_strange, spinor * const l_charm,
   return;
 }
 
+void Qsw_pm_ndpsi_shift(spinor * const l_strange, spinor * const l_charm,
+                        spinor * const k_strange, spinor * const k_charm) {
+  /* Shifted operator: run Qsw_pm_ndpsi on (k_strange, k_charm), then hand
+   * g_shift and the corresponding input to assign_add_mul_r for each flavour
+   * (presumably l += g_shift * k; the helper is not shown here). */
+  Qsw_pm_ndpsi(l_strange, l_charm, k_strange, k_charm);
+  assign_add_mul_r(l_strange, k_strange, g_shift, VOLUME/2);
+  assign_add_mul_r(l_charm, k_charm, g_shift, VOLUME/2);
+}
 
 
 /******************************************
diff --git a/operator/tm_operators_nd.h b/operator/tm_operators_nd.h
index 347f326a5..138e9b93b 100644
--- a/operator/tm_operators_nd.h
+++ b/operator/tm_operators_nd.h
@@ -23,53 +23,82 @@
 #define _TM_OPERATTORS_ND_H
 
 void mul_one_pm_itau2(spinor * const p, spinor * const q,
-		      spinor * const r, spinor * const s,
-		      const double sign, const int N);
+                      spinor * const r, spinor * const s,
+                      const double sign, const int N);
+
+void M_full_ndpsi(spinor * const Even_new_s, spinor * const Odd_new_s, 
+                  spinor * const Even_new_c, spinor * const Odd_new_c, 
+                  spinor * const Even_s, spinor * const Odd_s,
+                  spinor * const Even_c, spinor * const Odd_c);
+
+void Msw_full_ndpsi(spinor * const Even_new_s, spinor * const Odd_new_s, 
+                    spinor * const Even_new_c, spinor * const Odd_new_c, 
+                    spinor * const Even_s, spinor * const Odd_s,
+                    spinor * const Even_c, spinor * const Odd_c);
+
+//This works with tm and tm+clover 
+void D_ndpsi(spinor * const l_strange, spinor * const l_charm,
+             spinor * const k_strange,  spinor * const k_charm);
 
 void Qtm_ndpsi(spinor * const l_strange, spinor * const l_charm,
-	       spinor * const k_strange,  spinor * const k_charm);
+               spinor * const k_strange,  spinor * const k_charm);
 void Qsw_ndpsi(spinor * const l_strange, spinor * const l_charm,
-	       spinor * const k_strange, spinor * const k_charm);
+               spinor * const k_strange, spinor * const k_charm);
+
+void Qtm_tau1_ndpsi_add_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange,  spinor * const k_charm);
+void Qtm_tau1_ndpsi_sub_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange,  spinor * const k_charm);
+void Qsw_tau1_ndpsi_add_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange,  spinor * const k_charm);
+void Qsw_tau1_ndpsi_sub_Ishift(spinor * const l_strange, spinor * const l_charm,
+                               spinor * const k_strange,  spinor * const k_charm);
+
 
 void Qtm_dagger_ndpsi(spinor * const l_strange, spinor * const l_charm,
-		      spinor * const k_strange, spinor * const k_charm);
+                      spinor * const k_strange, spinor * const k_charm);
 void Qsw_dagger_ndpsi(spinor * const l_strange, spinor * const l_charm,
-		      spinor * const k_strange, spinor * const k_charm);
+                      spinor * const k_strange, spinor * const k_charm);
 
 void Qtm_pm_ndpsi(spinor * const l_strange, spinor * const l_charm,
                   spinor * const k_strange, spinor * const k_charm);
+void Qtm_pm_ndpsi_shift(spinor * const l_strange, spinor * const l_charm,
+                        spinor * const k_strange, spinor * const k_charm);
+
 void Qsw_pm_ndpsi(spinor * const l_strange, spinor * const l_charm,
-		  spinor * const k_strange, spinor * const k_charm);
+                  spinor * const k_strange, spinor * const k_charm);
+void Qsw_pm_ndpsi_shift(spinor * const l_strange, spinor * const l_charm,
+                        spinor * const k_strange, spinor * const k_charm);
 
 void Qtm_pm_ndbipsi(bispinor * const bisp_l, bispinor * const bisp_k);
 void Qsw_pm_ndbipsi(bispinor * const bisp_l, bispinor * const bisp_k);
 
 void Q_tau1_sub_const_ndpsi(spinor * const l_strange, spinor * const l_charm,
-			    spinor * const k_strange, spinor * const k_charm, 
-			    const _Complex double z, const double Cpol, const double invev);
+                            spinor * const k_strange, spinor * const k_charm, 
+                            const _Complex double z, const double Cpol, const double invev);
 void Qsw_tau1_sub_const_ndpsi(spinor * const l_strange, spinor * const l_charm,
-			      spinor * const k_strange, spinor * const k_charm, 
-			      const _Complex double z, const double Cpol, const double invev);
+                              spinor * const k_strange, spinor * const k_charm, 
+                              const _Complex double z, const double Cpol, const double invev);
 
 void H_eo_tm_ndpsi(spinor * const l_strange, spinor * const l_charm, 
-             spinor * const k_strange, spinor * const k_charm, 
-	     const int ieo);
+                   spinor * const k_strange, spinor * const k_charm, 
+                   const int ieo);
 void H_eo_sw_ndpsi(spinor * const l_strange, spinor * const l_charm, 
-		   spinor * const k_strange, spinor * const k_charm);
+                   spinor * const k_strange, spinor * const k_charm);
 
 
 void M_ee_inv_ndpsi(spinor * const l_strange, spinor * const l_charm, 
-		    spinor * const k_strange, spinor * const k_charm,
-		    const double mu, const double eps);
+                    spinor * const k_strange, spinor * const k_charm,
+                    const double mu, const double eps);
 
 void Msw_ee_inv_ndpsi(spinor * const l_strange, spinor * const l_charm, 
-		      spinor * const k_strange, spinor * const k_charm);
+                      spinor * const k_strange, spinor * const k_charm);
 
 void Q_test_epsilon(spinor * const l_strange, spinor * const l_charm,
                     spinor * const k_strange, spinor * const k_charm);
 
 void Qtau1_P_ndpsi(spinor * const l_strange, spinor * const l_charm,
-		spinor * const k_strange, spinor * const k_charm);
+                   spinor * const k_strange, spinor * const k_charm);
 
 void Qtm_pm_Ptm_pm_psi(spinor * const l, spinor * const k);
 
diff --git a/operator/tm_operators_nd_32.c b/operator/tm_operators_nd_32.c
index a0cdebb5c..54224b8c7 100644
--- a/operator/tm_operators_nd_32.c
+++ b/operator/tm_operators_nd_32.c
@@ -262,6 +262,14 @@ void Qtm_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm,
   return;
 }
 
+void Qtm_pm_ndpsi_shift_32(spinor32 * const l_strange, spinor32 * const l_charm,
+                           spinor32 * const k_strange, spinor32 * const k_charm) {
+  Qtm_pm_ndpsi_32(l_strange, l_charm, k_strange, k_charm);
+  const float shift = (float)g_shift;  /* single-precision copy of the global shift */
+  assign_add_mul_r_32(l_strange, k_strange, shift, VOLUME/2);
+  assign_add_mul_r_32(l_charm, k_charm, shift, VOLUME/2);
+}
+
 void Qsw_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm,
       spinor32 * const k_strange, spinor32 * const k_charm) {
 #ifdef TM_USE_OMP
@@ -316,3 +324,13 @@ void Qsw_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm,
 
   return;
 }
+
+void Qsw_pm_ndpsi_shift_32(spinor32* const l_strange, spinor32 * const l_charm,
+                           spinor32 * const k_strange, spinor32 * const k_charm) {
+  Qsw_pm_ndpsi_32(l_strange, l_charm, k_strange, k_charm); /* unshifted clover operator: k -> l */
+  const float shift = (float)g_shift; /* global shift narrowed to single precision */
+  assign_add_mul_r_32(l_strange, k_strange, shift, VOLUME/2 ); /* presumably l += shift*k on VOLUME/2 sites -- confirm */
+  assign_add_mul_r_32(l_charm, k_charm, shift, VOLUME/2 );
+}
+
+
diff --git a/operator/tm_operators_nd_32.h b/operator/tm_operators_nd_32.h
index fedc818f7..c9833bed6 100644
--- a/operator/tm_operators_nd_32.h
+++ b/operator/tm_operators_nd_32.h
@@ -25,6 +25,9 @@ void Q_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm, spinor3
 
 void Qtm_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm,
 		  spinor32 * const k_strange, spinor32 * const k_charm);
+void Qtm_pm_ndpsi_shift_32(spinor32 * const l_strange, spinor32 * const l_charm, spinor32 * const k_strange, spinor32 * const k_charm);
+
 void Qsw_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm,
       spinor32 * const k_strange, spinor32 * const k_charm);
+void Qsw_pm_ndpsi_shift_32(spinor32 * const l_strange, spinor32 * const l_charm, spinor32 * const k_strange, spinor32 * const k_charm);
 #endif
diff --git a/phmc.c b/phmc.c
index d3a46d691..880c9f11c 100644
--- a/phmc.c
+++ b/phmc.c
@@ -235,7 +235,7 @@ void phmc_compute_ev(const int trajectory_counter,
 	   mnl->name, trajectory_counter, temp2);
   }
   if(g_proc_id == 0) {
-    if(temp2 > 1.) {
+    if(temp2 > mnl->EVMax) {
       fprintf(stderr, "\nWarning: largest eigenvalue for monomial %s larger than upper bound!\n\n", mnl->name);
     }
     if(temp < mnl->EVMin) {
@@ -243,7 +243,7 @@ void phmc_compute_ev(const int trajectory_counter,
     }
     countfile = fopen(phmcfilename, "a");
     fprintf(countfile, "%.8d %1.5e %1.5e %1.5e %1.5e\n", 
-	    trajectory_counter, temp, temp2, mnl->EVMin, 1.);
+	    trajectory_counter, temp, temp2, mnl->EVMin, mnl->EVMax);
     fclose(countfile);
   }
   etime = gettime();
diff --git a/read_input.h b/read_input.h
index 7a7756001..ce9fd9ccb 100644
--- a/read_input.h
+++ b/read_input.h
@@ -88,6 +88,7 @@ extern "C"
   extern int bc_flag;
   extern int online_measurement_flag;
   extern int online_measurement_freq;
+  extern int restoresu3_flag;
   extern int reweighting_flag;
   extern int reweighting_samples; 
   extern int no_samples;
@@ -129,6 +130,8 @@ extern "C"
   extern int mg_blk[4];
   extern int mg_mixed_prec;
   extern int mg_setup_mu_set;
+  extern int mg_no_shifts;
+  extern double mg_mms_mass;
   extern double mg_setup_mu;
   extern double mg_cmu_factor;
   extern double mg_dtau_update;
diff --git a/read_input.l b/read_input.l
index 16c34fc2e..36e3b5ff3 100644
--- a/read_input.l
+++ b/read_input.l
@@ -385,14 +385,16 @@ static inline double fltlist_next_token(int * const list_end){
 %x CLRATCORMONOMIAL
 %x NDCLRATMONOMIAL
 %x NDRATCORMONOMIAL
+%x NDDETRATMONOMIAL
+%x NDCLDETRATMONOMIAL
 %x NDCLRATCORMONOMIAL
 %x POLYMONOMIAL
 %x CLPOLYMONOMIAL
 %x MNAME
 %x MCSTR
 %x MSOLVER
-%x NDMSOLVER
 %x RATMSOLVER
+%x NDMSOLVER
 %x GTYPE
 
 %x COMMENT
@@ -800,6 +802,20 @@ static inline double fltlist_next_token(int * const list_end){
     mg_omp_num_threads=a;
     if(myverbose) printf("  MG_omp_num_threads set to %d line %d operator %d\n", mg_omp_num_threads, line_of_file, current_operator);
   }
+  {SPC}*MGNumberOfShifts{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mg_no_shifts=a;
+    // MGNumberOfShifts overrides MGMMSMass: setting the number of shifts resets mg_mms_mass to zero
+    mg_mms_mass=0;
+    if(myverbose) printf("  MG_Num_of_shifts set to %d line %d operator %d\n", mg_no_shifts, line_of_file, current_operator);
+  }
+  {SPC}*MGMMSMass{EQL}{FLT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    mg_mms_mass=c;
+    // MGMMSMass overrides MGNumberOfShifts: setting the mass resets mg_no_shifts to zero
+    mg_no_shifts=0;
+    if(myverbose) printf("  MG_MMS_Mass set to %f line %d operator %d\n", mg_mms_mass, line_of_file, current_operator);
+  }
   EndDDalphaAMG{SPC}* {
   if(myverbose) printf("DDalphaAMG parsed in line %d\n\n", line_of_file);
   BEGIN(0);
@@ -1198,10 +1214,15 @@ static inline double fltlist_next_token(int * const list_end){
     if(myverbose) printf("  Solver set to RGMixedCG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
   }
-  MG {
+  DDalphaAMG {
+#ifdef DDalphaAMG
     optr->solver = MG;
-    if(myverbose) printf("  Solver set to MG line %d operator %d\n", line_of_file, current_operator);
+    if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
+#else
+    printf("ERROR line %d operator %d: DDalphaAMG library not included\n", line_of_file, current_operator);
+    exit(1);
+#endif
   }
   dummyhermtest {
     optr->solver=DUMMYHERMTEST;
@@ -1344,9 +1365,14 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(name_caller);
   }
   DDalphaAMG {
+#ifdef DDalphaAMG
     optr->solver = MG;
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     BEGIN(name_caller);
+#else
+    printf("ERROR line %d operator %d: DDalphaAMG library not included\n", line_of_file, current_operator);
+    exit(1);
+#endif
   }
 }
 
@@ -1612,6 +1638,11 @@ static inline double fltlist_next_token(int * const list_end){
     strcpy((*mnl).name, "NDDETRATIO");
     g_running_phmc = 1;
   }
+  else if(strcmp(yytext, "NDCLOVERDETRATIO")==0) {
+    mnl->type = NDCLOVERDETRATIO;
+    strcpy((*mnl).name, "NDCLOVERDETRATIO"); /* name mirrors the input keyword, as in the sibling branches */
+    g_running_phmc = 1;
+  }
   else if(strcmp(yytext, "NDPOLY")==0) {
     mnl->type = NDPOLY;
     strcpy((*mnl).name, "NDPOLY");
@@ -1694,10 +1725,13 @@ static inline double fltlist_next_token(int * const list_end){
   else if(mnl->type == NDRAT) BEGIN(NDRATMONOMIAL);
   else if(mnl->type == RAT) BEGIN(RATMONOMIAL);
   else if(mnl->type == NDCLOVERRAT) BEGIN(NDCLRATMONOMIAL);
+  else if(mnl->type == NDDETRATIO) BEGIN(NDDETRATMONOMIAL);
   else if(mnl->type == CLOVERRAT) BEGIN(CLRATMONOMIAL);
   else if(mnl->type == NDRATCOR) BEGIN(NDRATCORMONOMIAL);
   else if(mnl->type == RATCOR) BEGIN(RATCORMONOMIAL);
   else if(mnl->type == NDCLOVERRATCOR) BEGIN(NDCLRATCORMONOMIAL);
+  else if(mnl->type == NDDETRATIO) BEGIN(NDDETRATMONOMIAL);
+  else if(mnl->type == NDCLOVERDETRATIO) BEGIN(NDCLDETRATMONOMIAL);
   else if(mnl->type == CLOVERRATCOR) BEGIN(CLRATCORMONOMIAL);
   else if(mnl->type == POLY || mnl->type == POLYDETRATIO)  {
           fprintf(stderr,"starting to parse poly(detratio) monomial\n");
@@ -1711,9 +1745,9 @@ static inline double fltlist_next_token(int * const list_end){
 
 
 
-<DETMONOMIAL,GAUGEMONOMIAL,NDPOLYMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,CLRATMONOMIAL,RATCORMONOMIAL,CLRATCORMONOMIAL>{
+<DETMONOMIAL,GAUGEMONOMIAL,NDPOLYMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,CLRATMONOMIAL,RATCORMONOMIAL,CLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*Timescale{EQL}{DIGIT}+ {
-    if(mnl->type == NDDETRATIO) {
+    if(mnl->type == NDDETRATIO || mnl->type == NDCLOVERDETRATIO) {
       mnl->timescale = -5;
       if(myverbose) printf("  timescales set to %d line %d monomial %d since NDDETRATIO is not for MD evolution\n", a, line_of_file, current_monomial);
     }
@@ -1733,7 +1767,7 @@ static inline double fltlist_next_token(int * const list_end){
   }
 }
 
-<CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,CLPOLYMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL>{
+<CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,CLPOLYMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*CSW{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     mnl->c_sw = c;
@@ -1757,7 +1791,7 @@ static inline double fltlist_next_token(int * const list_end){
   }
 }
 
-<DETMONOMIAL,POLYMONOMIAL,NDPOLYMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,RATMONOMIAL,CLRATMONOMIAL,RATCORMONOMIAL,CLRATCORMONOMIAL>{
+<DETMONOMIAL,POLYMONOMIAL,NDPOLYMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,RATMONOMIAL,CLRATMONOMIAL,RATCORMONOMIAL,CLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*Kappa{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     mnl->kappa = c;
@@ -1778,6 +1812,14 @@ static inline double fltlist_next_token(int * const list_end){
   }
 }
 
+<NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
+  {SPC}*Kappa2{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    mnl->kappa2 = c;
+    if(myverbose) printf("  kappa2 set to %f line %d monomial %d\n", c, line_of_file, current_monomial);
+  }
+}
+
 <NDCLRATMONOMIAL,CLRATMONOMIAL>{
   {SPC}*AddTrLog{EQL}yes {
     mnl->trlog = 1;
@@ -1789,7 +1831,7 @@ static inline double fltlist_next_token(int * const list_end){
   }
 }
 
-<NDPOLYMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL>{
+<NDPOLYMONOMIAL,CLPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*2KappaMubar{EQL}{FLT} {
     sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
     mnl->mubar = c;
@@ -1802,7 +1844,19 @@ static inline double fltlist_next_token(int * const list_end){
   }
 }
 
-<DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL>{
+<NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
+  {SPC}*2KappaMubar2{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    mnl->mubar2 = c;
+    if(myverbose) printf("  2KappaMubar2 set to %f line %d monomial %d\n", c, line_of_file, current_monomial);
+  }
+  {SPC}*2KappaEpsbar2{EQL}{FLT} {
+    sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
+    mnl->epsbar2 = c;
+    if(myverbose) printf("  2KappaEpsbar2 set to %f line %d monomial %d\n", c, line_of_file, current_monomial);
+  }
+}
+<DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*UseExternalInverter{EQL}quda {
     if(myverbose) printf("  Use Quda inverter line %d monomial %d\n", line_of_file, current_monomial);
     mnl->solver_params.external_inverter = QUDA_INVERTER;
@@ -1853,7 +1907,7 @@ static inline double fltlist_next_token(int * const list_end){
   }
 }
 
-<DETMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL>{
+<DETMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*ForcePrecision{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf",name , &c);
     mnl->forceprec = c;
@@ -1876,21 +1930,21 @@ static inline double fltlist_next_token(int * const list_end){
   }
 }
 
-<NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL>{
+<NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{
   {SPC}*Solver{EQL} {
    solver_caller=YY_START;
    BEGIN(NDMSOLVER);
   }
 }
 
-<RATMONOMIAL,CLRATMONOMIAL,RATCORMONOMIAL,CLRATCORMONOMIAL>{
+<RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL>{
   {SPC}*Solver{EQL} {
-    solver_caller=YY_START;
-    BEGIN(RATMSOLVER);
+   solver_caller=YY_START;
+   BEGIN(RATMSOLVER);
   }
 }
 
-<DETMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL>{
+<DETMONOMIAL,POLYMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL>{
   {SPC}*2KappaMu{EQL}{FLT} {
     sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c);
     mnl->mu = c;
@@ -2112,9 +2166,37 @@ static inline double fltlist_next_token(int * const list_end){
     BEGIN(name_caller);
   }
   DDalphaAMG {
+#ifdef DDalphaAMG
     if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
     mnl->solver = MG;
     BEGIN(solver_caller);
+#else
+    printf("ERROR line %d operator %d: DDalphaAMG library not included\n", line_of_file, current_operator);
+    exit(1);
+#endif
+  }
+}
+
+<RATMSOLVER>{
+  rgmixedCG {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = RGMIXEDCG;
+    BEGIN(solver_caller);
+  }
+  cgmms {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = CGMMS;
+    BEGIN(solver_caller);
+  }
+  DDalphaAMG {
+#ifdef DDalphaAMG
+    if(myverbose) printf("  Solver set to DDalphaAMG line %d monomial %d\n", line_of_file, current_monomial); /* monomial index, like the sibling RATMSOLVER rules */
+    mnl->solver = MG;
+    BEGIN(solver_caller); /* return to the monomial start condition saved when Solver= was matched */
+#else
+    printf("ERROR line %d monomial %d: DDalphaAMG library not included\n", line_of_file, current_monomial); /* built without DDalphaAMG: abort parsing */
+    exit(1);
+#endif
   }
 }
 
@@ -2124,6 +2206,11 @@ static inline double fltlist_next_token(int * const list_end){
     mnl->solver = CGMMSND;
     BEGIN(solver_caller);
   }
+  rgmixedcg {
+    if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
+    mnl->solver = RGMIXEDCG;
+    BEGIN(solver_caller);
+  }
   mixedCGmmsnd {
     if(myverbose) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
     mnl->solver = MIXEDCGMMSND;
@@ -2134,13 +2221,15 @@ static inline double fltlist_next_token(int * const list_end){
     mnl->solver = 14;
     BEGIN(solver_caller);
   }
-}
-
-<RATMSOLVER>{
-  cgmms {
-    if( myverbose ) printf("  Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial);
-    mnl->solver = CGMMS;
+  DDalphaAMG {
+#ifdef DDalphaAMG
+    if(myverbose) printf("  Solver set to DDalphaAMG line %d operator %d\n", line_of_file, current_operator);
+    mnl->solver = MG;
     BEGIN(solver_caller);
+#else
+    printf("ERROR line %d operator %d: DDalphaAMG library not included\n", line_of_file, current_operator);
+    exit(1);
+#endif
   }
 }
 
@@ -2908,15 +2997,15 @@ static inline double fltlist_next_token(int * const list_end){
 }
 <REWEIGH>yes {
   reweighting_flag = 1;
-  if(myverbose!=0) fprintf(stderr, "Compute reweighting factor\n");
+  if(myverbose!=0) printf("Compute reweighting factor\n");
 }
 <REWEIGH>no {
   reweighting_flag = 0;
-  if(myverbose!=0) fprintf(stderr, "Do not compute reweighting factor\n");
+  if(myverbose!=0) printf("Do not compute reweighting factor\n");
 }
 <REWSAMPLES>{DIGIT}+ {
   reweighting_samples = atoi(yytext);
-  if(myverbose!=0) fprintf(stderr, "Number of reweighting samples set to %d\n", reweighting_samples);
+  if(myverbose!=0) printf("Number of reweighting samples set to %d\n", reweighting_samples);
 }
 
 <MIXCGIT>{DIGIT}+ {
@@ -2941,7 +3030,7 @@ static inline double fltlist_next_token(int * const list_end){
   BEGIN(comment_caller);
 }
 
-<INITMONOMIAL,DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLPOLYMONOMIAL,GAUGEMONOMIAL,INTEGRATOR,INITINTEGRATOR,INITMEASUREMENT,PIONNORMMEAS,ONLINEMEAS,ORIENTEDPLAQUETTESMEAS,GRADIENTFLOWMEAS,INITOPERATOR,TMOP,DBTMOP,OVERLAPOP,WILSONOP,CLOVEROP,DBCLOVEROP,POLYMONOMIAL,PLOOP,INITGPU,GPU,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,INITDEFLATION,DEFLATION,INITMULTIGRID,MULTIGRID,INITEXTERNALINVERTER,QUDAINVERTER,QPHIXINVERTER>{SPC}*\n   {
+<INITMONOMIAL,DETMONOMIAL,CLDETMONOMIAL,CLDETRATMONOMIAL,CLDETRATRWMONOMIAL,NDPOLYMONOMIAL,NDRATMONOMIAL,NDRATCORMONOMIAL,NDCLRATMONOMIAL,NDCLRATCORMONOMIAL,CLPOLYMONOMIAL,GAUGEMONOMIAL,INTEGRATOR,INITINTEGRATOR,INITMEASUREMENT,PIONNORMMEAS,ONLINEMEAS,ORIENTEDPLAQUETTESMEAS,GRADIENTFLOWMEAS,INITOPERATOR,TMOP,DBTMOP,OVERLAPOP,WILSONOP,CLOVEROP,DBCLOVEROP,POLYMONOMIAL,PLOOP,INITGPU,GPU,RATMONOMIAL,RATCORMONOMIAL,CLRATMONOMIAL,CLRATCORMONOMIAL,INITDEFLATION,DEFLATION,INITMULTIGRID,MULTIGRID,INITEXTERNALINVERTER,QUDAINVERTER,QPHIXINVERTER,NDDETRATMONOMIAL,NDCLDETRATMONOMIAL>{SPC}*\n   {
   line_of_file++;
 }
 <*>{SPC}*\n                       {
@@ -3019,6 +3108,7 @@ int read_input(char * conf_file){
   g_mu1 = _default_g_mu1;
   g_mu2 = _default_g_mu2;
   g_mu3 = _default_g_mu3;
+  g_shift = _default_g_shift;
   g_dbw2rand = 0;
   g_running_phmc = 0;
   g_beta = _default_g_beta;
diff --git a/reweighting_factor.c b/reweighting_factor.c
index fe044485c..7c6a2afca 100644
--- a/reweighting_factor.c
+++ b/reweighting_factor.c
@@ -54,24 +54,32 @@ void reweighting_factor(const int N, const int nstore) {
     mnl = &monomial_list[j];
     if(mnl->even_odd_flag) {
       init_sw_fields();
-      double c_sw = mnl->c_sw;
-      if(c_sw < 0.) c_sw = 0.;
 
-      sw_term( (const su3**) hf.gaugefield, mnl->kappa, c_sw); 
-      if(mnl->type != NDDETRATIO) {
-        trlog[j] = -sw_trace(0, mnl->mu);
-      }
-      else {
-        trlog[j] = -sw_trace_nd(0, mnl->mubar, mnl->epsbar);
-      }
+      if(mnl->type != NDCLOVERRATCOR && (mnl->kappa != mnl->kappa2
+                                       || (mnl->type == NDDETRATIO 
+                                           && (mnl->mubar != mnl->mubar2 || mnl->epsbar != mnl->epsbar2))
+                                       || (mnl->type != NDDETRATIO
+                                           && (mnl->mu != mnl->mu2)))) {
+        double c_sw = mnl->c_sw;
+        if(c_sw < 0.) c_sw = 0.;
         
-      sw_term( (const su3**) hf.gaugefield, mnl->kappa2, c_sw);
-      if(mnl->type != NDDETRATIO) {
-        trlog[j] -= -sw_trace(0, mnl->mu2);
-      }
-      else {
-        trlog[j] -= -sw_trace_nd(0, mnl->mubar2, mnl->epsbar2);
-      }
+        sw_term( (const su3**) hf.gaugefield, mnl->kappa, c_sw); 
+        if(mnl->type != NDDETRATIO) {
+          trlog[j] = -sw_trace(0, mnl->mu);
+        }
+        else {
+          trlog[j] = -sw_trace_nd(0, mnl->mubar, mnl->epsbar);
+        }
+        
+        sw_term( (const su3**) hf.gaugefield, mnl->kappa2, c_sw);
+        if(mnl->type != NDDETRATIO) {
+          trlog[j] -= -sw_trace(0, mnl->mu2);
+        }
+        else {
+          trlog[j] -= -sw_trace_nd(0, mnl->mubar2, mnl->epsbar2);
+        }
+      } else
+        trlog[j] = 0.;
     }
     else {
       trlog[j] = 0.;
@@ -96,19 +104,19 @@ void reweighting_factor(const int N, const int nstore) {
           random_spinor_field_lexic(mnl->pf, mnl->rngrepro, RN_GAUSS);
           mnl->energy0 = square_norm(mnl->pf, n, 1);
         }
-	if(g_proc_id == 0 && g_debug_level > 1) {
-	  printf("# monomial[%d] %s, energy0 = %e\n", j, mnl->name, mnl->energy0);
-	}
-	if(mnl->type == NDDETRATIO) {
+	if(mnl->type == NDDETRATIO || mnl->type == NDCLOVERRATCOR) {
 	  if(mnl->even_odd_flag) {
 	    random_spinor_field_eo(mnl->pf2, mnl->rngrepro, RN_GAUSS);
-            mnl->energy0 += square_norm(mnl->pf, n/2, 1);
+            mnl->energy0 += square_norm(mnl->pf2, n/2, 1);
 	  }
 	  else {
-            random_spinor_field_lexic(mnl->pf, mnl->rngrepro, RN_GAUSS);
+            random_spinor_field_lexic(mnl->pf2, mnl->rngrepro, RN_GAUSS);
             mnl->energy0 += square_norm(mnl->pf2, n, 1);
           }
 	}
+	if(g_proc_id == 0 && g_debug_level > 1) {
+	  printf("# monomial[%d] %s, energy0 = %e\n", j, mnl->name, mnl->energy0);
+	}
       }
     }
 
diff --git a/smearing/hex_stout_exclude_one.c b/smearing/hex_stout_exclude_one.c
index 4071e29a8..5327fde2e 100644
--- a/smearing/hex_stout_exclude_one.c
+++ b/smearing/hex_stout_exclude_one.c
@@ -1,9 +1,10 @@
 #include "hex.ih"
+#include "global.h"
 
 void stout_exclude_one(su3_tuple **buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in)
 {
   static su3 tmp;
-  
+
 #define _MULTIPLY_AND_EXPONENTIATE(x, principal, component) \
   { \
     _su3_times_su3d(tmp, staples[component / 4][x][component % 4], buff_in[x][principal]); \
diff --git a/solver/Makefile.in b/solver/Makefile.in
index 7c74903d2..acc712794 100644
--- a/solver/Makefile.in
+++ b/solver/Makefile.in
@@ -45,7 +45,7 @@ libsolver_TARGETS = bicgstab_complex gmres incr_eigcg eigcg restart_X ortho \
                     dirac_operator_eigenvectors	spectral_proj \
                     jdher_su3vect cg_her_su3vect eigenvalues_Jacobi \
 		    mcr cr mcr4complex bicg_complex monomial_solve \
-		    solver_types
+		    solver_types init_guess
 
 libsolver_OBJECTS = $(addsuffix .o, ${libsolver_TARGETS})
 
diff --git a/solver/cg_her_nd.c b/solver/cg_her_nd.c
index e0fe53411..ab3bfe608 100644
--- a/solver/cg_her_nd.c
+++ b/solver/cg_her_nd.c
@@ -48,6 +48,7 @@
 #include "su3.h"
 #include "linalg_eo.h"
 #include "start.h"
+#include "gettime.h"
 #include "solver/matrix_mult_typedef_nd.h"
 #include "sub_low_ev.h"
 #include "solver_field.h"
@@ -60,6 +61,7 @@ int cg_her_nd(spinor * const P_up,spinor * P_dn, spinor * const Q_up, spinor * c
   double normsp, normsq, pro, err, alpha_cg, beta_cg, squarenorm;
   int iteration;
   double err1, err2;
+  double atime, etime, flops;
   spinor ** up_field = NULL;
   spinor ** dn_field = NULL;  
   const int nr_sf = 5;
@@ -67,6 +69,7 @@ int cg_her_nd(spinor * const P_up,spinor * P_dn, spinor * const Q_up, spinor * c
   init_solver_field(&up_field, VOLUMEPLUSRAND, nr_sf);
   init_solver_field(&dn_field, VOLUMEPLUSRAND, nr_sf);
 
+  atime = gettime();
   squarenorm = square_norm(Q_up, N, 1);
   squarenorm+= square_norm(Q_dn, N, 1);
   /*        !!!!   INITIALIZATION    !!!! */
@@ -130,12 +133,7 @@ int cg_her_nd(spinor * const P_up,spinor * P_dn, spinor * const Q_up, spinor * c
     }
 
     if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) {
-      assign(P_up, up_field[0], N);
-      assign(P_dn, dn_field[0], N);
-      g_sloppy_precision = 0;
-      finalize_solver(up_field, nr_sf);
-      finalize_solver(dn_field, nr_sf);
-      return(iteration+1);
+      break;
     }
 #ifdef _USE_HALFSPINOR
     if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*squarenorm) && (rel_prec == 1))) {
@@ -156,10 +154,16 @@ int cg_her_nd(spinor * const P_up,spinor * P_dn, spinor * const Q_up, spinor * c
   assign(P_up, up_field[0], N);
   assign(P_dn, dn_field[0], N);
   g_sloppy_precision = 0;  
+
+  etime = gettime();
+  if(g_debug_level > 0 && g_proc_id == 0) {
+    printf("# CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iteration, eps_sq, etime-atime); 
+  }
   
   finalize_solver(up_field, nr_sf);
   finalize_solver(dn_field, nr_sf);
-  return(-1);
+  if(iteration > max_iter) return(-1);
+  return(iteration);
 }
 
 
diff --git a/solver/cg_mms_tm.c b/solver/cg_mms_tm.c
index c438e03ca..a1616affb 100644
--- a/solver/cg_mms_tm.c
+++ b/solver/cg_mms_tm.c
@@ -63,7 +63,7 @@ static void free_mms_tm();
 
 /* P output = solution , Q input = source */
 int cg_mms_tm(spinor ** const P, spinor * const Q,
-		 solver_params_t * solver_params, double * cgmms_reached_prec) {
+		 solver_params_t * solver_params) {
 
   static double normsq, pro, err, squarenorm;
   int iteration, N = solver_params->sdim, no_shifts = solver_params->no_shifts;
@@ -143,13 +143,22 @@ int cg_mms_tm(spinor ** const P, spinor * const Q,
       // falls below a threshold
       // this is useful for computing time and needed, because otherwise
       // zita might get smaller than DOUBLE_EPS and, hence, zero
-      if(iteration > 0 && (iteration % 20 == 0) && (im == no_shifts-1)) {
-	double sn = square_norm(ps_mms_solver[im-1], N, 1);
-	if(alphas[no_shifts-1]*alphas[no_shifts-1]*sn <= solver_params->squared_solver_prec) {
+      if(iteration > 0 && (iteration % 10 == 0) && (im == no_shifts-1)) {
+	double sn = square_norm(ps_mms_solver[no_shifts-2], N, 1);
+        err = alphas[no_shifts-1]*alphas[no_shifts-1]*sn;
+        // while because more than one shift could be converged
+	while(((err <= solver_params->squared_solver_prec) && (solver_params->rel_prec == 0)) ||
+              ((err <= solver_params->squared_solver_prec*squarenorm) && (solver_params->rel_prec > 0))) {
 	  no_shifts--;
 	  if(g_debug_level > 2 && g_proc_id == 0) {
 	    printf("# CGMMS: at iteration %d removed one shift, %d remaining\n", iteration, no_shifts);
       	  }
+          if(no_shifts>1) {
+            sn = square_norm(ps_mms_solver[no_shifts-2], N, 1);
+            err = alphas[no_shifts-1]*alphas[no_shifts-1]*sn;
+          } else {
+            break;
+          }
 	}
       }
     }
@@ -170,9 +179,7 @@ int cg_mms_tm(spinor ** const P, spinor * const Q,
     if( ((err <= solver_params->squared_solver_prec) && (solver_params->rel_prec == 0)) ||
         ((err <= solver_params->squared_solver_prec*squarenorm) && (solver_params->rel_prec > 0)) ||
         (iteration == solver_params->max_iter -1) ) {
-      /* FIXME temporary output of precision until a better solution can be found */
-      *cgmms_reached_prec = err;
-      break;
+        break;
     }
 
     /* Compute betas[0](i+1) = (r(i+1),r(i+1))/(r(i),r(i))
@@ -195,7 +202,7 @@ int cg_mms_tm(spinor ** const P, spinor * const Q,
   if(g_debug_level > 0 && g_proc_id == 0) {
     printf("# CGMMS (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_params->no_shifts, iteration, solver_params->squared_solver_prec, etime - atime); 
   }
-  
+
   finalize_solver(solver_field, nr_sf);
   return(iteration);
 }
diff --git a/solver/cg_mms_tm.h b/solver/cg_mms_tm.h
index b4adae1be..914e928c6 100644
--- a/solver/cg_mms_tm.h
+++ b/solver/cg_mms_tm.h
@@ -28,6 +28,6 @@
 #include "matrix_mult_typedef.h"
 #include "su3.h"
 
-int cg_mms_tm(spinor ** const P,spinor * const Q, solver_params_t * const params, double * reached_prec);
+int cg_mms_tm(spinor ** const P,spinor * const Q, solver_params_t * const params);
 
 #endif
diff --git a/solver/cg_mms_tm_nd.c b/solver/cg_mms_tm_nd.c
index 7af67d6a5..55050041c 100644
--- a/solver/cg_mms_tm_nd.c
+++ b/solver/cg_mms_tm_nd.c
@@ -72,6 +72,8 @@ int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn,
   double atime, etime;
   const int nr_sf = 4;
 
+  if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: solving %d shifts\n", shifts);
+
   atime = gettime();
   if(solver_params->sdim == VOLUME) {
     init_solver_field(&solver_field, VOLUMEPLUSRAND, 2*nr_sf);
@@ -91,12 +93,12 @@ int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn,
   alphas[0] = 1.0;
   betas[0] = 0.0;
   sigma[0] = solver_params->shifts[0]*solver_params->shifts[0];
-  if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: shift %d is %e\n", 0, sigma[0]);
+  if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: shift %d is %e\n", 0, solver_params->shifts[0]);
 
   /* currently only implemented for P=0 */
   for(int im = 1; im < shifts; im++) {
     sigma[im] = solver_params->shifts[im]*solver_params->shifts[im] - sigma[0];
-    if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: shift %d is %e\n", im, sigma[im]);
+    if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: shift %d is %e\n", im, solver_params->shifts[im]);
     // these will be the result spinor fields
     zero_spinor_field(Pup[im], N);
     zero_spinor_field(Pdn[im], N);
@@ -155,14 +157,36 @@ int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn,
       // falls below a threshold
       // this is useful for computing time and needed, because otherwise
       // zita might get smaller than DOUBLE_EPS and, hence, zero
-      if(iteration > 0 && (iteration % 20 == 0) && (im == shifts-1)) {
-	double sn = square_norm(ps_mms_solver[2*im], N, 1);
-	sn += square_norm(ps_mms_solver[2*im+1], N, 1);
-	if(alphas[shifts-1]*alphas[shifts-1]*sn <= solver_params->squared_solver_prec) {
+      if(iteration > 0 && (iteration % 10 == 0) && (im == shifts-1)) {
+        double sn = square_norm(ps_mms_solver[2*(shifts-1)], N, 1);
+        sn += square_norm(ps_mms_solver[2*(shifts-1)+1], N, 1);
+        err = alphas[shifts-1]*alphas[shifts-1]*sn;
+	while(((err <= solver_params->squared_solver_prec) && (solver_params->rel_prec == 0)) ||
+              ((err <= solver_params->squared_solver_prec*squarenorm) && (solver_params->rel_prec > 0))) {
 	  shifts--;
+          // debug only (g_debug_level > 3): list the residual of every shift, starting with the one just dropped
+	  if(g_debug_level > 3) {
+	    if (g_proc_id == 0) printf("# CGMMSND: residual of remaining shifts\n");
+	    if (g_proc_id == 0) printf("#\t id\t\t shift\t residual\n");
+            for(int is = shifts; is>0; is--) {
+              sn = square_norm(ps_mms_solver[2*is], N, 1);
+              sn += square_norm(ps_mms_solver[2*is+1], N, 1);
+              const double res = alphas[is]*alphas[is]*sn; // local, so err still holds the dropped shift's residual for the message below
+              if (g_proc_id == 0) printf("#\t %d\t\t %e\t %e\n", is, solver_params->shifts[is], solver_params->rel_prec ? res/squarenorm : res);
+            }
+            if (g_proc_id == 0) printf("#\t %d\t\t %e\t %e\n", 0, solver_params->shifts[0], solver_params->rel_prec ? normsq/squarenorm : normsq);
+	  }
 	  if(g_debug_level > 2 && g_proc_id == 0) {
-	    printf("# CGMMSND: at iteration %d removed one shift, %d remaining\n", iteration, shifts);
+	    printf("# CGMMSND: at iteration %d removed one shift with residual %e. %d shifts remaining\n", iteration, solver_params->rel_prec ? err/squarenorm : err, shifts);
 	  }
+          // computing next shift residual and looping for all the converged
+          if(shifts>1) {
+            sn = square_norm(ps_mms_solver[2*(shifts-1)], N, 1);
+            sn += square_norm(ps_mms_solver[2*(shifts-1)+1], N, 1);
+            err = alphas[shifts-1]*alphas[shifts-1]*sn;
+          } else {
+            break;
+          }
 	}
       }
     }
@@ -208,9 +232,9 @@ int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn,
   if(iteration == solver_params->max_iter -1) iteration = -1;
   else iteration++;
   if(g_debug_level > 0 && g_proc_id == 0) {
-    printf("# CGMMS (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_params->no_shifts, iteration, solver_params->squared_solver_prec, etime - atime); 
+    printf("# CGMMSND (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_params->no_shifts, iteration, solver_params->squared_solver_prec, etime - atime); 
   }
-  
+
   finalize_solver(solver_field, 2*nr_sf);
   return(iteration);
 }
diff --git a/solver/dirac_operator_eigenvectors.c b/solver/dirac_operator_eigenvectors.c
index bdef1ec24..102c944bc 100644
--- a/solver/dirac_operator_eigenvectors.c
+++ b/solver/dirac_operator_eigenvectors.c
@@ -1137,7 +1137,7 @@ void spinorStructEigenvecQtm(spinor *fv,double kappa,double mu,int epsilon,int k
   double q[8];
   double p_mu[4];
   double p_mu_t[4];
-  double psq,psq_tilde,M_wilson,prefactor,beta,norm_factor,swap_dummy;
+  double psq,psq_tilde,M_wilson,prefactor,beta,norm_factor;
   double *fv_=(double*)fv;
   int index;
 
@@ -1165,10 +1165,10 @@ void spinorStructEigenvecQtm(spinor *fv,double kappa,double mu,int epsilon,int k
 
   /* multiply with i ... */
   /* .. so first swap re <-> im .. */
-  SWAP(q[0],q[1],swap_dummy);
-  SWAP(q[2],q[3],swap_dummy);
-  SWAP(q[4],q[5],swap_dummy);
-  SWAP(q[6],q[7],swap_dummy);
+  SWAP(q[0],q[1]);
+  SWAP(q[2],q[3]);
+  SWAP(q[4],q[5]);
+  SWAP(q[6],q[7]);
 
   /* and multiply new real part (former imag part) with -1 */
   q[0]*=-prefactor; q[1]*=prefactor; q[2]*=-prefactor; q[3]*=prefactor;
@@ -1216,7 +1216,7 @@ void spinorStructEigenvecQtmSu3Vector(spinor *fv,double kappa,double mu,int epsi
   double q[8];
   double p_mu[4];
   double p_mu_t[4];
-  double psq,psq_tilde,M_wilson,prefactor,beta,norm_factor,swap_dummy;
+  double psq,psq_tilde,M_wilson,prefactor,beta,norm_factor;
 
   calcPmuLattice(rawp,p_mu,tt,ll);
   psq=p_mu[0]*p_mu[0]+
@@ -1242,10 +1242,10 @@ void spinorStructEigenvecQtmSu3Vector(spinor *fv,double kappa,double mu,int epsi
 
   /* multiply with i ... */
   /* .. so first swap re <-> im .. */
-  SWAP(q[0],q[1],swap_dummy);
-  SWAP(q[2],q[3],swap_dummy);
-  SWAP(q[4],q[5],swap_dummy);
-  SWAP(q[6],q[7],swap_dummy);
+  SWAP(q[0],q[1]);
+  SWAP(q[2],q[3]);
+  SWAP(q[4],q[5]);
+  SWAP(q[6],q[7]);
 
   /* and multiply new real part (former imag part) with -1 */
   q[0]*=-prefactor; q[1]*=prefactor; q[2]*=-prefactor; q[3]*=prefactor;
@@ -2092,7 +2092,7 @@ int * makeDiagFalloffPmuMap(int n,int maxdmanhat){
 
     for(int i = 0;i<10;i++){
       ranlxd(r,2);
-      SWAP(drawp[(int)(r[0]*4.)],drawp[(int)(r[1]*4.)],r[2]);
+      SWAP(drawp[(int)(r[0]*4.)],drawp[(int)(r[1]*4.)]);
 
   }
     fprintf(drawpStatFile," %d %d %d %d\n",drawp[0],drawp[1],drawp[2],drawp[3]);
diff --git a/solver/dirac_operator_eigenvectors.h b/solver/dirac_operator_eigenvectors.h
index da8f10187..cc27dc8f8 100644
--- a/solver/dirac_operator_eigenvectors.h
+++ b/solver/dirac_operator_eigenvectors.h
@@ -40,11 +40,6 @@
 #define M_PI  3.14159265358979323846
 #endif
 
-#define SWAP(x,y,d)\
-  d=x;\
-  x=y;\
-  y=d;
-
 #define min(x,y)\
   ((x<y)?x:y)
 #define max(x,y)\
diff --git a/solver/init_guess.c b/solver/init_guess.c
new file mode 100644
index 000000000..9934759bf
--- /dev/null
+++ b/solver/init_guess.c
@@ -0,0 +1,190 @@
+/***********************************************************************
+ *
+ *
+ * Copyright (C) 2016 Simone Bacchio
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "gamma.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "gettime.h"
+#include "solver/solver.h"
+#include "solver_field.h"
+#include "operator/tm_operators.h"
+#include "operator/tm_operators_nd.h"
+#include "init_guess.h"
+#include <io/params.h>
+
+/* Initial guess for P[shift]: polynomial extrapolation, in the squared shift,
+ * through the fields P[shift+1..no_shifts-1] (zero field for the last shift).
+ * With g_debug_level > 2 the relative residual of the guess w.r.t. Q is
+ * printed on process 0. Always returns 0. */
+int init_guess_mms(spinor ** const P, spinor * const Q,
+                   int shift, solver_params_t * const solver_params) {
+  double * shifts=solver_params->shifts;
+  int no_shifts = solver_params->no_shifts;
+  if(shift==no_shifts-1) {
+    zero_spinor_field(P[shift], solver_params->sdim);
+  } else {
+    for( int j = no_shifts-1; j > shift; j-- ) {
+      // Lagrange weight of P[j], evaluated at shifts[shift]^2
+      double coeff = 1;
+      for( int k = no_shifts-1; k > shift; k-- ) {
+        if(j!=k)
+          coeff *= (shifts[k]*shifts[k]-shifts[shift]*shifts[shift])/
+                   (shifts[k]*shifts[k]-shifts[j]*shifts[j]);
+      }
+      if(j==no_shifts-1) {
+        mul_r(P[shift], coeff, P[j], solver_params->sdim);
+      } else {
+        assign_add_mul_r(P[shift], P[j], coeff, solver_params->sdim);
+      }
+    }
+  }
+  if(g_debug_level > 2){
+    double old_g_mu3 = g_mu3;
+    spinor** temp;
+    init_solver_field(&temp, (solver_params->sdim == VOLUME/2) ? VOLUMEPLUSRAND/2 : VOLUMEPLUSRAND, 1);
+
+    // g_mu3 holds this shift only while M_psi is applied, then is restored
+    g_mu3 = solver_params->shifts[shift];
+    solver_params->M_psi( temp[0], P[shift]);
+    g_mu3 = old_g_mu3;
+
+    diff( temp[0], temp[0], Q, solver_params->sdim);
+    double res = sqrt(square_norm(temp[0], solver_params->sdim, 1)/square_norm(Q, solver_params->sdim, 1));
+    finalize_solver(temp, 1);
+    if(g_proc_id == 0)
+      printf("INITIAL GUESS: shift id=%d value=%e  relative residual: %e\n",shift,shifts[shift],res);
+  }
+  return 0;
+}
+
+/* Initial guess for (Pup[shift], Pdn[shift]): polynomial extrapolation, in the
+ * squared shift, through the fields at indices shift+1..no_shifts-1 (zero
+ * fields for the last shift). With g_debug_level > 2 the relative residual
+ * w.r.t. (Qup, Qdn) is printed on process 0. Always returns 0. */
+int init_guess_mms_nd(spinor ** const Pup, spinor ** const Pdn,
+                      spinor * const Qup, spinor * const Qdn,
+                      int shift, solver_params_t * solver_params) {
+  double * shifts=solver_params->shifts;
+  int no_shifts = solver_params->no_shifts;
+  if(shift==no_shifts-1) {
+    zero_spinor_field(Pup[shift], solver_params->sdim);
+    zero_spinor_field(Pdn[shift], solver_params->sdim);
+  } else {
+    for( int j = no_shifts-1; j > shift; j-- ) {
+      // Lagrange weight of (Pup[j], Pdn[j]), evaluated at shifts[shift]^2
+      double coeff = 1;
+      for( int k = no_shifts-1; k > shift; k-- ) {
+        if(j!=k)
+          coeff *= (shifts[k]*shifts[k]-shifts[shift]*shifts[shift])/
+                   (shifts[k]*shifts[k]-shifts[j]*shifts[j]);
+      }
+      if(j==no_shifts-1) {
+        mul_r(Pup[shift], coeff, Pup[j], solver_params->sdim);
+        mul_r(Pdn[shift], coeff, Pdn[j], solver_params->sdim);
+      } else {
+        assign_add_mul_r(Pup[shift], Pup[j], coeff, solver_params->sdim);
+        assign_add_mul_r(Pdn[shift], Pdn[j], coeff, solver_params->sdim);
+      }
+    }
+  }
+  if(g_debug_level > 2){
+    double old_g_shift = g_shift;
+    matrix_mult_nd f = Qtm_pm_ndpsi_shift;
+    if( solver_params->M_ndpsi == Qsw_pm_ndpsi )
+      f = Qsw_pm_ndpsi_shift;
+    spinor** temp;
+    init_solver_field(&temp, (solver_params->sdim == VOLUME/2) ? VOLUMEPLUSRAND/2 : VOLUMEPLUSRAND, 2);
+
+    // g_shift holds the squared shift only while f is applied, then is restored
+    g_shift = shifts[shift]*shifts[shift];
+    f( temp[0], temp[1], Pup[shift], Pdn[shift]);
+    g_shift = old_g_shift;
+    diff( temp[0], temp[0], Qup, solver_params->sdim);
+    diff( temp[1], temp[1], Qdn, solver_params->sdim);
+    double res = sqrt(square_norm(temp[0], solver_params->sdim, 1)+square_norm(temp[1], solver_params->sdim, 1))/
+      sqrt(square_norm(Qup, solver_params->sdim, 1)+square_norm(Qdn, solver_params->sdim, 1));
+    finalize_solver(temp, 2);
+    if(g_proc_id == 0)
+      printf("INITIAL GUESS ND: shift id=%d value=%e  relative residual: %e\n",shift,shifts[shift],res);
+  }
+  return 0;
+}
+
+/* Initial guess for (Pup[shift], Pdn[shift]): polynomial extrapolation, in the
+ * shift itself (not its square), through the fields at indices
+ * shift+1..no_shifts-1 (zero fields for the last shift). With g_debug_level > 2
+ * the relative residual w.r.t. (Qup, Qdn) is printed. Always returns 0. */
+int init_guess_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn,
+                           spinor * const Qup, spinor * const Qdn,
+                           int shift, solver_params_t * solver_params) {
+  double * shifts=solver_params->shifts;
+  int no_shifts = solver_params->no_shifts;
+  if(shift==no_shifts-1) {
+    zero_spinor_field(Pup[shift], solver_params->sdim);
+    zero_spinor_field(Pdn[shift], solver_params->sdim);
+  } else {
+    for( int j = no_shifts-1; j > shift; j-- ) {
+      // Lagrange weight of (Pup[j], Pdn[j]), evaluated at shifts[shift]
+      double coeff = 1;
+      for( int k = no_shifts-1; k > shift; k-- ) {
+        if(j!=k)
+          coeff *= (shifts[k]-shifts[shift])/(shifts[k]-shifts[j]);
+      }
+      if(j==no_shifts-1) {
+        mul_r(Pup[shift], coeff, Pup[j], solver_params->sdim);
+        mul_r(Pdn[shift], coeff, Pdn[j], solver_params->sdim);
+      } else {
+        assign_add_mul_r(Pup[shift], Pup[j], coeff, solver_params->sdim);
+        assign_add_mul_r(Pdn[shift], Pdn[j], coeff, solver_params->sdim);
+      }
+    }
+  }
+  if(g_debug_level > 2){
+    double old_g_shift = g_shift;
+    matrix_mult_nd f = Qtm_tau1_ndpsi_add_Ishift;
+    if( solver_params->M_ndpsi == Qsw_pm_ndpsi )
+      f = Qsw_tau1_ndpsi_add_Ishift;
+    spinor** temp;
+    init_solver_field(&temp, (solver_params->sdim == VOLUME/2) ? VOLUMEPLUSRAND/2 : VOLUMEPLUSRAND, 2);
+
+    // g_shift holds the squared shift while f is applied; NOTE(review): presumably f uses its square root - confirm
+    g_shift = shifts[shift]*shifts[shift];
+    f( temp[0], temp[1], Pup[shift], Pdn[shift]);
+    g_shift = old_g_shift;
+    diff( temp[0], temp[0], Qup, solver_params->sdim);
+    diff( temp[1], temp[1], Qdn, solver_params->sdim);
+    double res = sqrt(square_norm(temp[0], solver_params->sdim, 1)+square_norm(temp[1], solver_params->sdim, 1))/
+      sqrt(square_norm(Qup, solver_params->sdim, 1)+square_norm(Qdn, solver_params->sdim, 1));
+    finalize_solver(temp, 2);
+    if(g_proc_id == 0)
+      printf("INITIAL GUESS ND PLUS: shift id=%d value=%e  relative residual: %e\n",shift,shifts[shift],res);
+  }
+  return 0;
+}
diff --git a/solver/init_guess.h b/solver/init_guess.h
new file mode 100644
index 000000000..8ae29e80c
--- /dev/null
+++ b/solver/init_guess.h
@@ -0,0 +1,39 @@
+/***********************************************************************
+ *
+ *
+ * Copyright (C) 2016 Simone Bacchio
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ***********************************************************************/
+
+#ifndef _INIT_GUESS_H
+#define _INIT_GUESS_H
+
+#include"su3.h"
+#include"solver.h"
+/* Guess for P[shift], extrapolated in the squared shift from P[shift+1..]. */
+int init_guess_mms(spinor ** const P, spinor * const Q,
+                   int shift, solver_params_t * const params);
+/* Same for the field pair (Pup[shift], Pdn[shift]), using squared shifts. */
+int init_guess_mms_nd(spinor ** const Pup, spinor ** const Pdn, 
+                      spinor * const Qup, spinor * const Qdn, 
+                      int shift, solver_params_t * solver_params);
+/* As init_guess_mms_nd, but the extrapolation is in the shift, not its square. */
+int init_guess_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn, 
+                           spinor * const Qup, spinor * const Qdn, 
+                           int shift, solver_params_t * solver_params);
+#endif
diff --git a/solver/matrix_mult_typedef_nd.h b/solver/matrix_mult_typedef_nd.h
index b9d8b814b..ce298c946 100644
--- a/solver/matrix_mult_typedef_nd.h
+++ b/solver/matrix_mult_typedef_nd.h
@@ -29,6 +29,7 @@
 #define _MATRIX_MULT_TYPEDEF_ND_H
 
 typedef void (*matrix_mult_nd)(spinor * const, spinor * const,spinor * const, spinor * const);
+typedef void (*matrix_mult_full_nd)(spinor * const, spinor * const,spinor * const, spinor * const,spinor * const, spinor * const,spinor * const, spinor * const);
 typedef void (*matrix_mult_nd32)(spinor32 * const, spinor32 * const, spinor32 * const, spinor32 * const);
 
 #endif
diff --git a/solver/monomial_solve.c b/solver/monomial_solve.c
index 1a98e800a..8b060e36e 100644
--- a/solver/monomial_solve.c
+++ b/solver/monomial_solve.c
@@ -27,7 +27,11 @@
  *
  *
  *   int solve_degenerate(spinor * const P, spinor * const Q, const int max_iter, 
-           double eps_sq, const int rel_prec, const int N, matrix_mult f)
+ *                       double eps_sq, const int rel_prec, const int N, matrix_mult f)
+ *
+ *   int solve_mms_tm(spinor ** const P, spinor * const Q,
+ *                    solver_params_t * solver_params)  
+ *
  *   int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn, 
  *                    spinor * const Qup, spinor * const Qdn, 
  *                    solver_params_t * solver_params)  
@@ -39,7 +43,9 @@
 # include<config.h>
 #endif
 #include "global.h"
+#include "start.h"
 #include "read_input.h"
+#include "default_input_values.h"
 #include "linalg/mul_gamma5.h"
 #include "linalg/diff.h"
 #include "linalg/square_norm.h"
@@ -49,6 +55,7 @@
 #include "phmc.h"
 #include "solver/solver.h"
 #include "solver/solver_field.h"
+#include "solver/init_guess.h"
 #include "solver/matrix_mult_typedef.h"
 #include "solver/solver_types.h"
 #include "solver/solver_params.h"
@@ -60,12 +67,14 @@
 #include "operator/clovertm_operators_32.h"
 #include "misc_types.h"
 #include "monomial_solve.h"
+#include "linalg_eo.h"
 #ifdef DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
 #ifdef TM_USE_QPHIX
 #include "qphix_interface.h"
 #endif
+#include "fatal_error.h"
 
 #include <io/params.h>
 #include <io/spinor.h>
@@ -87,14 +96,15 @@ int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_
                      const int max_iter, double eps_sq, const int rel_prec, 
                      const int N, matrix_mult f, int solver_type){
   int iteration_count = 0;
-  int use_solver = solver_type;
-  // set up for performing checks of the residual
-  // the temporary field is also required by the QPhiX solve
+
+  // temporary field required by the QPhiX solve or by residual check
   spinor** temp;
-  if(g_debug_level > 0 || solver_params.external_inverter == QPHIX_INVERTER){
+  if(g_debug_level > 2 || solver_params.external_inverter == QPHIX_INVERTER){
     init_solver_field(&temp, VOLUMEPLUSRAND/2, 1);
   }
 
+  solver_params.use_initial_guess = 0;
+
 #ifdef TM_USE_QPHIX
   if(solver_params.external_inverter == QPHIX_INVERTER){
     // using CG for the HMC, we always want to have the solution of (Q Q^dagger) x = b, which is equivalent to
@@ -106,21 +116,20 @@ int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_
     mul_gamma5(P, VOLUME/2);
   } else
 #endif
-
-  if(use_solver == MIXEDCG || use_solver == RGMIXEDCG){
+  if(solver_type == MIXEDCG || solver_type == RGMIXEDCG){
     // the default mixed solver is rg_mixed_cg_her
     int (*msolver_fp)(spinor * const, spinor * const, solver_params_t, 
                       const int, double, const int, const int, matrix_mult, matrix_mult32) = rg_mixed_cg_her;
 
     // but it might be necessary at some point to use the old version
-    if(use_solver == MIXEDCG){
+    if(solver_type == MIXEDCG){
       msolver_fp = mixed_cg_her;
     }
 
     // FIXME: this GPU stuff needs to go...
     if(usegpu_flag){   
       #ifdef HAVE_GPU     
-	      #ifdef TEMPORALGAUGE
+        #ifdef TEMPORALGAUGE
           to_temporalgauge(g_gauge_field, Q , P);
         #endif          
         iteration_count = linsolve_eo_gpu(P, Q, max_iter, eps_sq, rel_prec, N, f);
@@ -132,34 +141,33 @@ int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_
     }
     else{
       if(f==Qtm_pm_psi){   
-        iteration_count =  msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Qtm_pm_psi_32);
-      }
-      else if(f==Q_pm_psi){     
-	iteration_count =  msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Q_pm_psi_32);
+        iteration_count = msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Qtm_pm_psi_32);
+      } else if(f==Q_pm_psi){     
+	iteration_count = msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Q_pm_psi_32);
       } else if(f==Qsw_pm_psi){
         copy_32_sw_fields();
         iteration_count = msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Qsw_pm_psi_32);
       } else {
         if(g_proc_id==0) printf("Warning: 32 bit matrix not available. Falling back to CG in 64 bit\n"); 
-        use_solver = CG;
+        solver_type = CG;
       }
     }
   } 
-  else if(use_solver == CG){
+  else if(solver_type == CG){
     iteration_count =  cg_her(P, Q, max_iter, eps_sq, rel_prec, N, f);
   }
-  else if(use_solver == BICGSTAB){
+  else if(solver_type == BICGSTAB){
      iteration_count =  bicgstab_complex(P, Q, max_iter, eps_sq, rel_prec, N, f);     
   }
 #ifdef DDalphaAMG 
-  else if (use_solver == MG)
+  else if (solver_type == MG)
     iteration_count =  MG_solver(P, Q, eps_sq, max_iter,rel_prec, N , g_gauge_field, f);
 #endif     
   else{
-    if(g_proc_id==0) printf("Error: solver not allowed for degenerate solve. Aborting...\n");
-    exit(2);
+    fatal_error("Error: solver not allowed for degenerate solve. Aborting...\n", "solve_degenerate");
   }
-  if(g_debug_level > 0){
+
+  if(g_debug_level > 2){
     f(temp[0], P);
     diff(temp[0], temp[0], Q, VOLUME/2);
     double diffnorm = square_norm(temp[0], VOLUME/2, 1); 
@@ -167,21 +175,27 @@ int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_
       printf("# solve_degenerate residual check: %e\n", diffnorm);
     }
   }
-  if(g_debug_level > 0 || solver_params.external_inverter == QPHIX_INVERTER){
+  if(g_debug_level > 2 || solver_params.external_inverter == QPHIX_INVERTER){
     finalize_solver(temp, 1);
   }
+
   return(iteration_count);
 }
 
-int solve_mshift_oneflavour(spinor ** const P, spinor * const Q, solver_params_t* solver_params){
-  int iteration_count = 0;
+int solve_mms_tm(spinor ** const P, spinor * const Q,
+                 solver_params_t * solver_params){ 
+  int iteration_count = 0; 
+
+  solver_params->use_initial_guess = 0;
+
+  // temporary field required by the QPhiX solve or by residual check
   spinor ** temp;
-  if(g_debug_level > 0 || solver_params->external_inverter == QPHIX_INVERTER){
+  if(g_debug_level > 2 || (solver_params->external_inverter == QPHIX_INVERTER  && solver_params->type != MG)){
     init_solver_field(&temp, VOLUMEPLUSRAND/2, 1);
   }
-  
+
 #ifdef TM_USE_QPHIX
-  if( solver_params->external_inverter == QPHIX_INVERTER ){
+  if( solver_params->external_inverter == QPHIX_INVERTER && solver_params->type != MG ){
     gamma5(temp[0], Q, VOLUME/2);
     iteration_count = invert_eo_qphix_oneflavour_mshift(P, temp[0],
                                                         solver_params->max_iter, solver_params->squared_solver_prec,
@@ -194,39 +208,157 @@ int solve_mshift_oneflavour(spinor ** const P, spinor * const Q, solver_params_t
     }
   } else
 #endif // TM_USE_QPHIX
-  if( solver_params->external_inverter == NO_EXT_INV ){
-    double reached_prec = -1.0;
-    iteration_count = cg_mms_tm(P, Q, solver_params, &reached_prec);
+  if (solver_params->type == CGMMS){
+    iteration_count = cg_mms_tm(P, Q, solver_params);
   }
+#ifdef DDalphaAMG
+  else if (solver_params->type == MG) {
+    // if the mg_mms_mass is larger than the smallest shift we use MG
+    if (mg_no_shifts > 0 || mg_mms_mass >= solver_params->shifts[0]) { 
+      int nshifts = solver_params->no_shifts;
+      int mg_nshifts = mg_no_shifts > nshifts ? nshifts:mg_no_shifts;
+      // if the mg_mms_mass is smaller than the larger shifts, we use CGMMS for those
+      // in case mg_no_shifts is used, then mg_mms_mass = 0
+      if(mg_mms_mass >= solver_params->shifts[0]) {
+        mg_nshifts = solver_params->no_shifts;
+        while (mg_mms_mass < solver_params->shifts[mg_nshifts-1]) { mg_nshifts--; }
+      }
+      // Number of initial guesses provided by CGMMS
+      // NOTE: tunable value; 1 is fine for now.
+      int  no_cgmms_init_guess = 1;
+      if(no_cgmms_init_guess > mg_nshifts) {
+        no_cgmms_init_guess = mg_nshifts;
+      }
+#ifdef TM_USE_QPHIX
+      if( solver_params->external_inverter == QPHIX_INVERTER && mg_nshifts < nshifts ) {
+        // TODO: no initial guess option with QphiX
+        no_cgmms_init_guess = 0;
+        spinor ** P_cg = P+(mg_nshifts - no_cgmms_init_guess);
+        double * shifts_start = solver_params->shifts;
+        solver_params->no_shifts = nshifts - (mg_nshifts - no_cgmms_init_guess);
+        solver_params->shifts += (mg_nshifts - no_cgmms_init_guess);
+        solver_params->type = CGMMS;
+        gamma5(temp[0], Q, VOLUME/2);
+        iteration_count = invert_eo_qphix_oneflavour_mshift(P, temp[0],
+                                                            solver_params->max_iter, solver_params->squared_solver_prec,
+                                                            solver_params->type, solver_params->rel_prec,
+                                                            *solver_params,
+                                                            solver_params->sloppy_precision,
+                                                            solver_params->compression_type);
+        for( int shift = 0; shift < solver_params->no_shifts; shift++) {
+          mul_gamma5(P[shift], VOLUME/2);
+        }
+        // Restoring solver_params
+        solver_params->no_shifts = nshifts;
+        solver_params->shifts = shifts_start;
+        solver_params->type = MG;
+        } else
+#endif // TM_USE_QPHIX  
+      if (mg_nshifts < nshifts) {
+        spinor ** P_cg = P+(mg_nshifts - no_cgmms_init_guess);
+        double * shifts_start = solver_params->shifts;
+        solver_params->no_shifts = nshifts - (mg_nshifts - no_cgmms_init_guess);
+        solver_params->shifts += (mg_nshifts - no_cgmms_init_guess);
+        solver_params->type = CGMMS;
+        // switching last shift. We run CGMMS for the shift we want to solve.
+        if (no_cgmms_init_guess > 0) {
+          SWAP(solver_params->shifts[0], solver_params->shifts[no_cgmms_init_guess]);
+          SWAP(P_cg[0], P_cg[no_cgmms_init_guess]);
+        }
+        iteration_count = solve_mms_tm( P_cg, Q, solver_params );
+        // Switching back last shift
+        if (no_cgmms_init_guess > 0) {
+          SWAP(solver_params->shifts[0], solver_params->shifts[no_cgmms_init_guess]);
+          SWAP(P_cg[0], P_cg[no_cgmms_init_guess]);
+        }
+        // Restoring solver_params
+        solver_params->no_shifts = nshifts;
+        solver_params->shifts = shifts_start;
+        solver_params->type = MG;
+      } else {
+        no_cgmms_init_guess = 0;
+      }
 
-  if(g_debug_level > 0){
-    // FIXME: in the shift-by-shift branch, the shifted operator exists explicitly and could be used to 
-    // truly check the residual here
-    solver_params->M_psi(temp[0], P[0]);
-    diff(temp[0], temp[0], Q, VOLUME/2);
-    double diffnorm = square_norm(temp[0], VOLUME/2, 1); 
-    if( g_proc_id == 0 ){
-      printf("# solve_mshift_oneflavour residual check: %e\n", diffnorm);
-      printf("# NOTE that this currently repors the residual for the *unishfted* operator!\n");
+      for(int i = mg_nshifts-1; i>=0; i--){
+        // preparing initial guess
+        if(i<mg_nshifts-no_cgmms_init_guess)
+          init_guess_mms(P, Q, i, solver_params);
+        g_mu3 = solver_params->shifts[i]; 
+        iteration_count += MG_solver( P[i], Q, solver_params->squared_solver_prec, solver_params->max_iter,
+                                         solver_params->rel_prec, solver_params->sdim, g_gauge_field, solver_params->M_psi );
+        g_mu3 = _default_g_mu3;
+      }
+    } else {
+      iteration_count = cg_mms_tm( P, Q, solver_params );
+    }
+  }
+#endif
+  else if (solver_params->type == RGMIXEDCG){
+    matrix_mult32 f32  = Qtm_pm_psi_32;
+    if( solver_params->M_psi == Qsw_pm_psi ){ 
+      f32  = Qsw_pm_psi_32;
+    }
+    iteration_count = 0;
+    // solver_params_t struct needs to be passed to all solvers except for cgmms, so we need to construct it here
+    // and set the one relevant parameter
+    solver_params_t temp_params;
+    temp_params.mcg_delta = _default_mixcg_innereps;
+    double iter_local = 0;
+    for(int i = solver_params->no_shifts-1; i>=0; i--){
+      // preparing initial guess
+      init_guess_mms(P, Q, i, solver_params); 
+      solver_params->use_initial_guess = 1;
+     
+      // inverting
+      g_mu3 = solver_params->shifts[i]; 
+      iter_local = rg_mixed_cg_her( P[i], Q, temp_params, solver_params->max_iter,
+                                    solver_params->squared_solver_prec, solver_params->rel_prec, solver_params->sdim,
+                                    solver_params->M_psi, f32);
+      g_mu3 = _default_g_mu3;
+      solver_params->use_initial_guess = 0;
+      if(iter_local == -1){
+        return(-1);
+      } else {
+        iteration_count += iter_local;
+      }
     }
+  } else {
+    fatal_error("Error: solver not allowed for TM mms solve. Aborting...\n", "solve_mms_tm");
   }
-  if(g_debug_level > 0 || solver_params->external_inverter == QPHIX_INVERTER){
+
+  if(g_debug_level > 2){
+    for( int shift = 0; shift < solver_params->no_shifts; shift++){
+      g_mu3 = solver_params->shifts[shift]; 
+      solver_params->M_psi(temp[0], P[shift]);
+      g_mu3 = _default_g_mu3;
+      diff(temp[0], temp[0], Q, VOLUME/2);
+      double diffnorm = square_norm(temp[0], VOLUME/2, 1); 
+      if( g_proc_id == 0 ){
+        printf("# solve_mms_tm residual check: shift %d, res. %e\n", shift, diffnorm);
+      }
+    }
+  }
+  if(g_debug_level > 2 || (solver_params->external_inverter == QPHIX_INVERTER && solver_params->type != MG)){
     finalize_solver(temp, 1);
   }
-  return iteration_count;
+
+  return(iteration_count);
 }
 
 int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn, 
                  spinor * const Qup, spinor * const Qdn, 
                  solver_params_t * solver_params){ 
-  int iteration_count = 0;
-  
-  spinor** temp;
-  if(g_debug_level > 0 || solver_params->external_inverter == QPHIX_INVERTER){
+  int iteration_count = 0; 
+  solver_params->use_initial_guess = 0;
+
+  // temporary field required by the QPhiX solve or by residual check
+  spinor ** temp;
+  if(g_debug_level > 2 || (solver_params->external_inverter == QPHIX_INVERTER && solver_params->type != MG)){
     init_solver_field(&temp, VOLUMEPLUSRAND/2, 2);
   }
+
 #ifdef TM_USE_QPHIX
-  if(solver_params->external_inverter == QPHIX_INVERTER){
+  if(solver_params->external_inverter == QPHIX_INVERTER && solver_params->type != MG){
     //  gamma5 (M.M^dagger)^{-1} gamma5 = [ Q(+mu,eps) Q(-mu,eps) ]^{-1}
     gamma5(temp[0], Qup, VOLUME/2);
     gamma5(temp[1], Qdn, VOLUME/2);
@@ -248,43 +380,221 @@ int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn,
     }
   } else
 #endif //TM_USE_QPHIX
-  if(solver_params->external_inverter == NO_EXT_INV){
-    if(solver_params->type==MIXEDCGMMSND){
-      if(usegpu_flag){
-	#ifdef HAVE_GPU      
-	  #ifdef TEMPORALGAUGE
-	    to_temporalgauge_mms(g_gauge_field , Qup, Qdn, Pup, Pdn, solver_params->no_shifts);
-	  #endif        
-	  iteration_count = dev_cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);  
-	  #ifdef TEMPORALGAUGE
-	    from_temporalgauge_mms(Qup, Qdn, Pup, Pdn, solver_params->no_shifts);
-	  #endif 
-	#endif
+  if(solver_params->type==MIXEDCGMMSND){
+    if(usegpu_flag){
+    #ifdef HAVE_GPU      
+      #ifdef TEMPORALGAUGE
+      to_temporalgauge_mms(g_gauge_field , Qup, Qdn, Pup, Pdn, solver_params->no_shifts);
+      #endif        
+      iteration_count = dev_cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);  
+      #ifdef TEMPORALGAUGE
+      from_temporalgauge_mms(Qup, Qdn, Pup, Pdn, solver_params->no_shifts);
+      #endif 
+    #endif
+    } else {
+      iteration_count = mixed_cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);
+    }
+  } else if (solver_params->type == CGMMSND){
+    iteration_count = cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);
+  }
+#ifdef DDalphaAMG
+  else if (solver_params->type == MG) {
+    // if the mg_mms_mass is larger than the smallest shift we use MG
+    if (mg_no_shifts > 0 || mg_mms_mass >= solver_params->shifts[0]) { 
+
+      int nshifts = solver_params->no_shifts;
+      int mg_nshifts = mg_no_shifts > nshifts ? nshifts:mg_no_shifts;
+      // if the mg_mms_mass is smaller than the larger shifts, we use CGMMS for those
+      // in case mg_no_shifts is used, then mg_mms_mass = 0
+      if(mg_mms_mass >= solver_params->shifts[0]) {
+        mg_nshifts = nshifts;
+        while (mg_mms_mass < solver_params->shifts[mg_nshifts-1]) { mg_nshifts--; }
       }
-      else{
-	iteration_count = mixed_cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);
+      // Number of initial guesses provided by cgmms
+      // NOTE: tunable value; 2 is fine for now.
+      int no_cgmms_init_guess = 2;
+      if(no_cgmms_init_guess > mg_nshifts) {
+        no_cgmms_init_guess = mg_nshifts;
+      }
+#ifdef TM_USE_QPHIX
+      if(solver_params->external_inverter == QPHIX_INVERTER && mg_nshifts < nshifts){
+        // TODO: no initial guess option with QphiX
+        no_cgmms_init_guess = 0;
+        spinor ** Pup_cg = Pup+(mg_nshifts - no_cgmms_init_guess);
+        spinor ** Pdn_cg = Pdn+(mg_nshifts - no_cgmms_init_guess);
+        double * shifts_start = solver_params->shifts;
+        solver_params->no_shifts = nshifts - (mg_nshifts - no_cgmms_init_guess);
+        solver_params->shifts += (mg_nshifts - no_cgmms_init_guess);
+        solver_params-> type = CGMMSND;
+        spinor ** qphix_src; // private source fields: 'temp' is not allocated for type == MG unless g_debug_level > 2
+        init_solver_field(&qphix_src, VOLUMEPLUSRAND/2, 2);
+        //  gamma5 (M.M^dagger)^{-1} gamma5 = [ Q(+mu,eps) Q(-mu,eps) ]^{-1}
+        gamma5(qphix_src[0], Qup, VOLUME/2);
+        gamma5(qphix_src[1], Qdn, VOLUME/2);
+        iteration_count = invert_eo_qphix_twoflavour_mshift(Pup_cg, Pdn_cg, qphix_src[0], qphix_src[1],
+                                                            solver_params->max_iter, solver_params->squared_solver_prec,
+                                                            solver_params->type, solver_params->rel_prec,
+                                                            *solver_params,
+                                                            solver_params->sloppy_precision,
+                                                            solver_params->compression_type);
+        finalize_solver(qphix_src, 2);
+        // the tmLQCD ND operator used for HMC is normalised by the inverse of the maximum eigenvalue, so the
+        // inverse of Q^2 is rescaled by maxev^2 (QPhiX also normalises the shifts); solutions are in P{up,dn}_cg
+        const double maxev_sq = (1.0/phmc_invmaxev)*(1.0/phmc_invmaxev);
+        for( int shift = 0; shift < solver_params->no_shifts; shift++){
+          mul_r_gamma5(Pup_cg[shift], maxev_sq, VOLUME/2);
+          mul_r_gamma5(Pdn_cg[shift], maxev_sq, VOLUME/2);
+        }
+        // Restoring solver_params
+        solver_params->no_shifts = nshifts;
+        solver_params->shifts = shifts_start;
+        solver_params-> type = MG;
+      } else
+#endif //TM_USE_QPHIX
+      if (mg_nshifts < nshifts) {
+        spinor ** Pup_cg = Pup+(mg_nshifts - no_cgmms_init_guess);
+        spinor ** Pdn_cg = Pdn+(mg_nshifts - no_cgmms_init_guess);
+        double * shifts_start = solver_params->shifts;
+        solver_params->no_shifts = nshifts - (mg_nshifts - no_cgmms_init_guess);
+        solver_params->shifts += (mg_nshifts - no_cgmms_init_guess);
+        solver_params-> type = CGMMSND;
+        if (no_cgmms_init_guess > 0) {
+          SWAP(solver_params->shifts[0], solver_params->shifts[no_cgmms_init_guess]);
+          SWAP(Pup_cg[0], Pup_cg[no_cgmms_init_guess]);
+          SWAP(Pdn_cg[0], Pdn_cg[no_cgmms_init_guess]);
+        }
+        iteration_count = solve_mms_nd( Pup_cg, Pdn_cg, Qup, Qdn, solver_params );
+        // Switching back last shift
+        if (no_cgmms_init_guess > 0) {
+          SWAP(solver_params->shifts[0], solver_params->shifts[no_cgmms_init_guess]);
+          SWAP(Pup_cg[0], Pup_cg[no_cgmms_init_guess]);
+          SWAP(Pdn_cg[0], Pdn_cg[no_cgmms_init_guess]);
+        }
+        // Restoring solver_params
+        solver_params->no_shifts = nshifts;
+        solver_params->shifts = shifts_start;
+        solver_params-> type = MG;
+      } else {
+        no_cgmms_init_guess = 0;
+      }
+
+      matrix_mult_nd f = Qtm_pm_ndpsi_shift;
+      if( solver_params->M_ndpsi == Qsw_pm_ndpsi ) 
+        f = Qsw_pm_ndpsi_shift;
+
+      for(int i = mg_nshifts-1; i>=0; i--){
+        // preparing initial guess
+        if(i<mg_nshifts-no_cgmms_init_guess)
+          init_guess_mms_nd(Pup, Pdn, Qup, Qdn, i, solver_params);
+        g_shift = solver_params->shifts[i]*solver_params->shifts[i]; 
+        iteration_count += MG_solver_nd( Pup[i], Pdn[i], Qup, Qdn, solver_params->squared_solver_prec, solver_params->max_iter,
+                                         solver_params->rel_prec, solver_params->sdim, g_gauge_field, f );
+        g_shift = _default_g_shift;
       }
-    } else if (solver_params->type==CGMMSND){
-      iteration_count = cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_params);
     } else {
-      if(g_proc_id==0) printf("Error: solver not allowed for ND mms solve. Aborting...\n");
-      exit(2);      
+      iteration_count = cg_mms_tm_nd( Pup, Pdn, Qup, Qdn, solver_params );
     }
   }
-  if( g_debug_level > 0 ){
-    // FIXME: in the shift-by-shift branch, the shifted operator exists explicitly and could be used to 
-    // truly check the residual here
-    solver_params->M_ndpsi(temp[0], temp[1], Pup[0], Pdn[0]);
-    diff(temp[0], temp[0], Qup, VOLUME/2);
-    diff(temp[1], temp[1], Qdn, VOLUME/2);
-    double diffnorm = square_norm(temp[0], VOLUME/2, 1) + square_norm(temp[1], VOLUME/2, 1); 
-    if( g_proc_id == 0 ){
-      printf("# solve_mms_nd residual check: %e\n", diffnorm);
-      printf("# NOTE that this currently repors the residual for the *unishfted* operator!\n");
+#endif
+  else if (solver_params->type == RGMIXEDCG){
+    // shift-by-shift solve with rg_mixed_cg_her_nd, starting from the last shift
+    matrix_mult_nd   f    = Qtm_pm_ndpsi_shift;
+    matrix_mult_nd32 f32  = Qtm_pm_ndpsi_shift_32;
+    if( solver_params->M_ndpsi == Qsw_pm_ndpsi ){ 
+      f    = Qsw_pm_ndpsi_shift;
+      f32  = Qsw_pm_ndpsi_shift_32;
+    }
+    iteration_count = 0;
+    // rg_mixed_cg_her_nd takes its solver_params_t by value, so the fields it reads have to be set
+    // on the copy that is actually passed: changing *solver_params would not reach the solver
+    solver_params_t temp_params;
+    temp_params.mcg_delta = _default_mixcg_innereps;
+    temp_params.use_initial_guess = 1;
+    for(int i = solver_params->no_shifts-1; i>=0; i--){
+      // preparing initial guess
+      init_guess_mms_nd(Pup, Pdn, Qup, Qdn, i, solver_params);
+
+      // inverting
+      g_shift = solver_params->shifts[i]*solver_params->shifts[i]; 
+      const int iter_local = rg_mixed_cg_her_nd( Pup[i], Pdn[i], Qup, Qdn, temp_params, solver_params->max_iter,
+                                                 solver_params->squared_solver_prec, solver_params->rel_prec, solver_params->sdim, f, f32);
+      g_shift = _default_g_shift;
+      if(iter_local == -1){
+        // report the failure, but leave through the common exit so that 'temp' is released
+        iteration_count = -1;
+        break;
+      }
+      iteration_count += iter_local;
+    }
+  } else {
+    fatal_error("Error: solver not allowed for ND mms solve. Aborting...\n", "solve_mms_nd");
+  }
+
+  if( g_debug_level > 2 ){
+    for( int shift = 0; shift < solver_params->no_shifts; shift++){
+      matrix_mult_nd f = Qtm_pm_ndpsi_shift;
+      if( solver_params->M_ndpsi == Qsw_pm_ndpsi ) 
+        f = Qsw_pm_ndpsi_shift;
+      g_shift = solver_params->shifts[shift]*solver_params->shifts[shift]; 
+      f(temp[0], temp[1], Pup[shift], Pdn[shift]);
+      g_shift = _default_g_shift;
+      diff(temp[0], temp[0], Qup, VOLUME/2);
+      diff(temp[1], temp[1], Qdn, VOLUME/2);
+      double diffnorm = square_norm(temp[0], VOLUME/2, 1) + square_norm(temp[1], VOLUME/2, 1); 
+      if( g_proc_id == 0 ){
+        printf("# solve_mms_nd residual check: %e\n", diffnorm);
+      }
     }
   }
-  if( g_debug_level > 0 || solver_params->external_inverter == QPHIX_INVERTER ){
+  if(g_debug_level > 2 || (solver_params->external_inverter == QPHIX_INVERTER  && solver_params->type != MG)){
     finalize_solver(temp, 2);
   }
+
   return(iteration_count);
 }
+
+int solve_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn, 
+                      spinor * const Qup, spinor * const Qdn, 
+                      solver_params_t * solver_params){ 
+  // Multi-shift ND solve for the unsquared operator: source Qup/Qdn, one solution per shift in Pup[i]/Pdn[i].
+  int iteration_count = 0; 
+  // Returns the iteration count reported by the solver(s) that were called.
+#ifdef DDalphaAMG
+  // With MG we can solve directly the unsquared operator
+  if( solver_params->type == MG ){
+    matrix_mult_nd f = Qtm_tau1_ndpsi_add_Ishift;
+    if( solver_params->M_ndpsi == Qsw_pm_ndpsi )
+      f = Qsw_tau1_ndpsi_add_Ishift;
+    for(int i = solver_params->no_shifts-1; i>=0; i--){
+      // preparing initial guess
+      init_guess_mms_nd_plus(Pup, Pdn, Qup, Qdn, i, solver_params);
+  
+      // g_shift = shift^2 and then in Qsw_tau1_ndpsi_add_Ishift the square root is taken
+      g_shift = solver_params->shifts[i]*solver_params->shifts[i]; 
+      iteration_count += MG_solver_nd( Pup[i], Pdn[i], Qup, Qdn, solver_params->squared_solver_prec,
+                                       solver_params->max_iter, solver_params->rel_prec, solver_params->sdim,
+                                       g_gauge_field, f );
+      g_shift = _default_g_shift; // put the global shift back once this shift is solved
+    }
+  } else 
+#endif
+  {
+    iteration_count = solve_mms_nd(Pup, Pdn, Qup, Qdn, solver_params);
+    
+    // apply operator for retrieving unsquared solution (Pup[i]/Pdn[i] are overwritten in place via temp)
+    matrix_mult_nd f = Qtm_tau1_ndpsi_sub_Ishift;
+    if( solver_params->M_ndpsi == Qsw_pm_ndpsi )
+      f = Qsw_tau1_ndpsi_sub_Ishift;
+    spinor** temp;
+    init_solver_field(&temp, VOLUMEPLUSRAND/2, 2);
+    for(int i = solver_params->no_shifts-1; i>=0; i--){
+      g_shift = solver_params->shifts[i]*solver_params->shifts[i]; 
+      f(temp[0],temp[1],Pup[i],Pdn[i]);
+      assign(Pup[i], temp[0], VOLUME/2);
+      assign(Pdn[i], temp[1], VOLUME/2);
+      g_shift = _default_g_shift; // put the global shift back before the next shift
+    }
+    finalize_solver(temp, 2);
+  }
+  return iteration_count;
+}
diff --git a/solver/monomial_solve.h b/solver/monomial_solve.h
index c069efa2f..6a42c4558 100644
--- a/solver/monomial_solve.h
+++ b/solver/monomial_solve.h
@@ -23,15 +23,18 @@
 #include "solver/matrix_mult_typedef.h"
 #include "solver/solver_params.h"
 #include "su3.h"
-
-int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_params,
-                     const int max_iter, double eps_sq, const int rel_prec, 
-                     const int N, matrix_mult f, int solver_type);
-
-int solve_mshift_oneflavour(spinor ** const P, spinor * const Q, solver_params_t* solver_params);
-
+
+/* Solver entry points: degenerate solve and the TM / ND mms solves. */
+
+int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_params, const int max_iter, 
+                     double eps_sq, const int rel_prec, const int N, matrix_mult f, int solver_type);
+int solve_mms_tm(spinor ** const P, spinor * const Q,
+                 solver_params_t * solver_params);
 int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn, 
                  spinor * const Qup, spinor * const Qdn, 
                  solver_params_t * solver_params);
+int solve_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn, 
+                      spinor * const Qup, spinor * const Qdn, 
+                      solver_params_t * solver_params);
 
 #endif
diff --git a/solver/rg_mixed_cg_her_nd.c b/solver/rg_mixed_cg_her_nd.c
index de5643f74..cf30c1eb2 100644
--- a/solver/rg_mixed_cg_her_nd.c
+++ b/solver/rg_mixed_cg_her_nd.c
@@ -28,8 +28,8 @@
  *
  * in:
  *   Q: source
- * inout:
- *   P: result (initial guess currently not supported)
+ * inout:
+ *   P: result (read as the initial guess when solver_params.use_initial_guess != 0)
  *
  * POSSIBLE IMPROVEMENTS
  * There are still quite a few things that can be tried to make it better,
@@ -193,7 +193,7 @@ int rg_mixed_cg_her_nd(spinor * const P_up, spinor * const P_dn, spinor * const
   int iter_in_sp = 0, iter_in_dp = 0, iter_out = 0;
   float rho_sp, delta = solver_params.mcg_delta;
   double beta_dp, rho_dp;
-  double sourcesquarenorm, target_eps_sq;
+  double sourcesquarenorm, guesssquarenorm, target_eps_sq;
 
   spinor *xhigh_up, *xhigh_dn, *rhigh_up, *rhigh_dn, *qhigh_up, *qhigh_dn, *phigh_up, *phigh_dn;
   spinor32 *x_up, *x_dn, *p_up, *p_dn, *q_up, *q_dn, *r_up, *r_dn;
@@ -248,17 +248,24 @@ int rg_mixed_cg_her_nd(spinor * const P_up, spinor * const P_dn, spinor * const
   if(g_debug_level > 0 && g_proc_id==0) 
     printf("#RG_Mixed CG_ND: N_outer: %d \n", N_outer);
   
-  // should compute real residual here, for now we always use a zero guess
   zero_spinor_field_32(x_up,N); zero_spinor_field_32(x_dn,N);
-  zero_spinor_field(P_up,N); zero_spinor_field(P_dn,N);
-  assign(phigh_up,Q_up,N); assign(phigh_dn,Q_dn,N);
-  assign(rhigh_up,Q_up,N); assign(rhigh_dn,Q_dn,N);
-  
-  rho_dp = ( square_norm(rhigh_up,N,1) + square_norm(rhigh_dn,N,1) );
+
+  if(solver_params.use_initial_guess == 0) {
+    assign(phigh_up,Q_up,N); assign(phigh_dn,Q_dn,N);
+    assign(rhigh_up,Q_up,N); assign(rhigh_dn,Q_dn,N);
+    rho_dp = sourcesquarenorm;
+  } else {
+    // residual of the initial guess: r = Q - f(P); p starts as a copy of r
+    f(rhigh_up,rhigh_dn,P_up,P_dn);
+    diff(rhigh_up,Q_up,rhigh_up,N); diff(rhigh_dn,Q_dn,rhigh_dn,N);
+    assign(phigh_up,rhigh_up,N); assign(phigh_dn,rhigh_dn,N);
+    rho_dp = ( square_norm(rhigh_up,N,1) + square_norm(rhigh_dn,N,1) );
+  }
+
   assign_to_32(r_up,rhigh_up,N); assign_to_32(r_dn,rhigh_dn,N);
   rho_sp = rho_dp;
   assign_32(p_up,r_up,N); assign_32(p_dn,r_dn,N);
-  
+
   iter_in_sp += inner_loop(x_up, x_dn, p_up, p_dn, q_up, q_dn, r_up, r_dn, &rho_sp, delta, 
                            f32, (float)target_eps_sq, 
                            N, iter_out+iter_in_sp+iter_in_dp, max_iter, 0.0, 0.0, MCG_NO_PIPELINED, MCG_NO_PR);
diff --git a/solver/solver_params.h b/solver/solver_params.h
index fac350515..8c301e46d 100644
--- a/solver/solver_params.h
+++ b/solver/solver_params.h
@@ -95,7 +95,8 @@ typedef struct {
   CompressionType compression_type;
   SloppyPrecision sloppy_precision;
   ExternalInverter external_inverter;
-  
+
+  int use_initial_guess;  
 } solver_params_t;
 
 #endif
diff --git a/update_gauge.c b/update_gauge.c
index d45123b4a..b8d91d99e 100644
--- a/update_gauge.c
+++ b/update_gauge.c
@@ -85,7 +85,8 @@ void update_gauge(const double step, hamiltonian_field_t * const hf) {
       exposu3(&w,&deriv);
       restoresu3(&v,&w);
       _su3_times_su3(w, v, *z);
-      _su3_assign(*z, w);
+      restoresu3(&v,&w);
+      _su3_assign(*z, v);
     }
   }
 
diff --git a/update_momenta_fg.c b/update_momenta_fg.c
index df89f5de1..15bb92fa5 100644
--- a/update_momenta_fg.c
+++ b/update_momenta_fg.c
@@ -1,8 +1,7 @@
 /***********************************************************************
  *
- * Copyright (C) 2001 Martin Hasebusch
- *
- * some changes by C. Urbach 2002-2008,2012
+ * Copyright (C) 2017 Jacob Finkenrath
+ *               2018 Bartosz Kostrzewa
  *
  * This file is part of tmLQCD.
  *
@@ -49,6 +48,76 @@
 #ifdef DDalphaAMG
 #include "DDalphaAMG_interface.h"
 #endif
+
+/* Saves every link of hf->gaugefield in gauge_fg, then overwrites the link U with
+ * restoresu3( restoresu3(exposu3(step_fg * hf->derivative)) * U ). */
+static inline void calculate_fg(const double step_fg,
+                                hamiltonian_field_t * const hf){
+#ifdef TM_USE_OMP
+#define static
+#pragma omp parallel
+  {
+#endif
+
+  static su3 v,w;  /* TM_USE_OMP defines 'static' away, so these become per-thread automatics */
+  su3 *z, *ztmp;
+  static su3adj deriv;
+  su3adj *Fm;
+
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int i = 0; i < VOLUME; i++) { 
+    for(int mu = 0; mu < 4; mu++){
+      /* Copy gauge field to be temporarily updated */
+      z = &hf->gaugefield[i][mu];
+      ztmp = &gauge_fg[i][mu];
+      _su3_assign(*ztmp,*z);  
+      /* Calculate approximated force gradient term and update temporary gauge field */
+      Fm = &hf->derivative[i][mu];
+      _zero_su3adj(deriv);
+      _su3adj_assign_const_times_su3adj(deriv, step_fg, *Fm);
+      exposu3(&w,&deriv);
+      restoresu3(&v,&w);
+      _su3_times_su3(w, v, *z);
+      restoresu3(&v,&w);
+      _su3_assign(*z, v);
+    }
+  }
+#ifdef TM_USE_OMP
+  } // OpenMP parallel section closing brace
+#undef static
+#endif
+}
+
+/* Updates hf->momenta from hf->derivative scaled by step (_su3adj_minus_const_times_su3adj)
+ * and copies the links saved in gauge_fg back into hf->gaugefield. */
+static inline void fg_update_momenta_reset_gaugefield(const double step,
+                                                      hamiltonian_field_t * const hf){
+#ifdef TM_USE_OMP
+#pragma omp parallel
+  {
+#endif
+  su3 *z, *ztmp;
+#ifdef TM_USE_OMP
+#pragma omp for
+#endif
+  for(int i = 0; i < VOLUME; i++) { 
+    for(int mu = 0; mu < 4; mu++){
+      /* Update momenta (the minus comes from an extra minus in trace_lambda)
+       and restore initial gauge field */
+      _su3adj_minus_const_times_su3adj(hf->momenta[i][mu], step, hf->derivative[i][mu]);
+
+      z = &hf->gaugefield[i][mu];
+      ztmp = &gauge_fg[i][mu];
+      _su3_assign(*z,*ztmp);
+    }
+  }
+#ifdef TM_USE_OMP
+  } // OpenMP parallel section closing brace
+#endif
+}
+
 /*******************************************************
  *
  * Temporarily updates the gauge field corresponding to 
@@ -56,7 +125,6 @@
  * the momenta
  *
  *******************************************************/
-
 void update_momenta_fg(int * mnllist, double step, const int no,
 		       hamiltonian_field_t * const hf, double step0) {
   double atime, etime;
@@ -64,39 +132,20 @@ void update_momenta_fg(int * mnllist, double step, const int no,
 #ifdef DDalphaAMG
   MG_update_gauge(0.0);
 #endif
+  if (g_exposu3_no_c == 0) init_exposu3();
 
-  /* #ifdef TM_USE_OMP
-     #define static
-     #pragma omp parallel
-     {
-     #endif
-  */
-
-  int i,mu;
-  double step_fg;
-  static su3 v,w;
-  su3 *z;
-  su3 *ztmp;
-  static su3adj deriv;
-  su3adj *Fm;
-
-  step_fg=-step0*step0/24;
-  /*
-     #ifdef _KOJAK_INST
-     #pragma pomp inst begin(updategauge)
-     #endif
-
-     #ifdef TM_USE_OMP
-     #pragma omp parallel for
-     #endif
-  */
+  double step_fg=-step0*step0/24;
 
+#ifdef TM_USE_OMP
+#pragma omp parallel for
+#endif
   for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) {
     for(int mu=0;mu<4;mu++) {
       _zero_su3adj(hf->derivative[i][mu]);
     }
   }
 
+  // calculate derivatives to estimate force gradient
   for(int k = 0; k < no; k++) {
     if(monomial_list[ mnllist[k] ].derivativefunction != NULL) {
       monomial_list[ mnllist[k] ].derivativefunction(mnllist[k], hf);
@@ -106,31 +155,8 @@ void update_momenta_fg(int * mnllist, double step, const int no,
 #ifdef TM_USE_MPI
   xchange_deri(hf->derivative);
 #endif
-
-
-  /* #ifdef TM_USE_OMP
-     #pragma omp parallel for
-     #endif
-  */
-
-  for(i = 0; i < VOLUME; i++) { 
-    for(mu = 0; mu < 4; mu++){
-      /* Cope gauge field to be temporarily updated */
-      z = &hf->gaugefield[i][mu];
-      ztmp = &gauge_fg[i][mu];
-      _su3_assign(*ztmp,*z);  
- 
-      /* Calculate approximated force gradient term and update temporary gauge field */
-      Fm = &hf->derivative[i][mu];
-      _zero_su3adj(deriv);
-      _su3adj_assign_const_times_su3adj(deriv, step_fg, *Fm);
-      /*_su3adj_assign_const_times_su3adj(deriv, 0.0, *Fm);*/
-      exposu3(&w,&deriv);
-      restoresu3(&v,&w);
-      _su3_times_su3(w, v, *z);
-      _su3_assign(*z, w);
-    }
-  }
+  // estimate force gradient and propagate to gauge field
+  calculate_fg(step_fg, hf);
 
 #ifdef TM_USE_MPI
      /* for parallelization */
@@ -148,17 +174,10 @@ void update_momenta_fg(int * mnllist, double step, const int no,
    g_update_gauge_copy = 1;
    g_update_gauge_copy_32 = 1;
 
-
-   /* #ifdef TM_USE_OMP
-      #pragma omp parallel for
-      #endif
-   */
-   /* Calculate derivate based on the temporary updated
-      gauge field U'=ztmp:
-      1) Set derivative to zero
-      2) Recalcuate derivate
-   */
-    
+  // calculate forces with force-gradient updated gauge field
+#ifdef TM_USE_OMP
+#pragma omp parallel for
+#endif
   for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) {
     for(int mu=0;mu<4;mu++) {
       _zero_su3adj(hf->derivative[i][mu]);
@@ -174,25 +193,10 @@ void update_momenta_fg(int * mnllist, double step, const int no,
 #ifdef TM_USE_MPI
   xchange_deri(hf->derivative);
 #endif
-
-  for(i = 0; i < VOLUME; i++) { 
-    for(mu = 0; mu < 4; mu++){
-      /* Update momenta (the minus comes from an extra minus in trace_lambda)
-	 and restore initial gauge field */
-      _su3adj_minus_const_times_su3adj(hf->momenta[i][mu], step, hf->derivative[i][mu]);
-
-      z = &hf->gaugefield[i][mu];
-      ztmp = &gauge_fg[i][mu];
-      _su3_assign(*z,*ztmp);
-
-    }
-  }
-
-  /* #ifdef TM_USE_OMP
-     } /* OpenMP parallel closing brace /
-     #endif
-  */
   
+  // and finally update the momenta and reset the gauge field 
+  fg_update_momenta_reset_gaugefield(step, hf);
+
 #ifdef TM_USE_MPI
   /* for parallelization */
   xchange_gauge(hf->gaugefield);
@@ -218,9 +222,4 @@ void update_momenta_fg(int * mnllist, double step, const int no,
     printf("# Time gauge update: %e s\n", etime-atime); 
   } 
   return;
-
-  /* #ifdef _KOJAK_INST
-     #pragma pomp inst end(updategauge)
-     #endif
-  */
 }
diff --git a/xchange/xchange_deri.c b/xchange/xchange_deri.c
index e11441c1a..7edab924c 100644
--- a/xchange/xchange_deri.c
+++ b/xchange/xchange_deri.c
@@ -38,7 +38,7 @@
 #include "su3adj.h"
 #include "xchange_deri.h"
 
-inline void addup_ddummy(su3adj** const df, const int ix, const int iy) {
+static inline void addup_ddummy(su3adj** const df, const int ix, const int iy) {
   for(int mu = 0; mu < 4; mu++) {
     df[ix][mu].d1 += ddummy[iy][mu].d1;
     df[ix][mu].d2 += ddummy[iy][mu].d2;