
Commit

Merge remote-tracking branch 'origin/quda_work' into quda_work_eigensolver
aniketsen committed Jun 28, 2023
2 parents 721f3ba + 6ac0c6e commit 9c27ed5
Showing 4 changed files with 54 additions and 21 deletions.
2 changes: 1 addition & 1 deletion default_input_values.h
@@ -210,7 +210,7 @@
#define _default_subprocess_flag 0
#define _default_lowmem_flag 0

#define _default_g_barrier_monomials_convergence 0
#define _default_g_barrier_monomials_convergence 1

/* default input values for QUDA interface */
/* These follow the recommendations of https://github.com/lattice/quda/wiki/Multigrid-Solver */
24 changes: 16 additions & 8 deletions doc/quda.tex
@@ -384,7 +384,7 @@ \subsubsection{QUDA-MG interface}
\item{ \texttt{MGEigSolverUsePolynomialAcceleration}: Whether or not to use polynomial acceleration in the eigensolver on a per-level basis. (comma-separated list of booleans, default: \texttt{yes} everywhere)}
\item{ \texttt{MGEigSolverPolynomialDegree}: Degree of the polynomial used for polynomial acceleration of the eigensolver, specified on a per-level basis. This is ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set on the given level. Values between $50$ and $100$ seem to work well. (comma-separated list of positive integers, default: $100$ on all levels)}
\item{ \texttt{MGEigSolverPolyMin}: Smallest eigenvalue to suppress via polynomial acceleration. It should not be set too low and only lowered step by step from its default value. Has a large impact on the rate of convergence of the eigensolver when set correctly. Ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set on the corresponding level. (comma-separated list of positive real numbers, default: $1.0$ on all levels)}
\item{ \texttt{MGEigSolverPolyMax}: Largest eigenvalue which should be suppressed via polynomial acceleration. This should be set about 10\% larger than the actual largest eigenvalue of the operator. Ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set. (comma-separated list of positive real numbers, default: $5.0$ on all levels)}
\item{ \texttt{MGEigSolverPolyMax}: Largest eigenvalue which should be suppressed via polynomial acceleration. This should be set about 20\% larger than the actual largest eigenvalue of the operator. Ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set. (comma-separated list of positive real numbers, default: $5.0$ on all levels)}
\end{itemize}

A typical coarse-grid-deflated setup might look something like:
@@ -421,18 +421,26 @@ \subsubsection{QUDA-MG interface}
MGEigSolverType = tr_lanczos, tr_lanczos, tr_lanczos
MGEigSolverSpectrum = smallest_real, smallest_real, smallest_real
MGEigPreserveDeflationSubspace = yes
MGEigSolverNumberOfVectors = 24, 32, 1024
MGEigSolverKrylovSubspaceSize = 24, 32, 2048
MGEigSolverNumberOfVectors = 0, 0, 1024
MGEigSolverKrylovSubspaceSize = 0, 0, 3072
MGEigSolverRequireConvergence = no, no, yes
MGEigSolverMaxRestarts = 100, 100, 100
MGEigSolverMaxRestarts = 0, 0, 100
MGEigSolverTolerance = 1e-4, 1e-4, 1e-4
MGEigSolverUseNormOp = no, no, yes
MGEigSolverUseDagger = no, no, no
MGEigSolverUsePolynomialAcceleration = no, no, yes
MGEigSolverPolynomialDegree = 100, 100, 100
MGEigSolverPolyMin = 0.6, 0.6, 0,6
MGEigSolverPolyMax = 8.0, 8.0, 8.0
MGCoarseSolverCABasisSize = 8, 8, 8
MGEigSolverPolynomialDegree = 0, 0, 100
MGEigSolverPolyMin = 0.0, 0.0, 0.01
MGEigSolverPolyMax = 0.0, 0.0, 3.6
MGCoarseSolverCABasisSize = 4, 4, 4
EndExternalInverter
\end{verbatim}

In order to determine appropriate values for \texttt{MGEigSolverPolyMin} and \texttt{MGEigSolverPolyMax}, one should run the eigensolver once without polynomial acceleration (\texttt{MGEigSolverUsePolynomialAcceleration = no}) and run tmLQCD at \texttt{DebugLevel = 3} so that the eigensolver prints the computed eigenvalues.
Then, setting \texttt{MGEigSolverSpectrum = largest\_real}, \texttt{MGEigSolverNumberOfVectors = 1} and \texttt{MGEigSolverKrylovSubspaceSize = 16} (all on the coarsest grid, the third number in the example above), one can easily obtain the largest eigenvalue of the coarse grid operator for a particular configuration or set of configurations.
Increasing this number by 20\% or so gives a good value for \texttt{MGEigSolverPolyMax}.
Subsequently, setting \texttt{MGEigSolverSpectrum = smallest\_real}, \texttt{MGEigSolverNumberOfVectors = 0, 0, 1024}, \texttt{MGEigSolverKrylovSubspaceSize = 0, 0, 3072} will give the requisite 1024 smallest eigenvalues.
Noting the largest of these smallest eigenvalues, one can use the next larger order of magnitude for \texttt{MGEigSolverPolyMin}.
In other words, if the largest of these smallest eigenvalues is $4\cdot10^{-3}$, for example, then \texttt{MGEigSolverPolyMin} can be set to 0.01.
This ensures that the desired (smallest) part of the spectrum is smaller than \texttt{MGEigSolverPolyMin} and that the entire spectrum is contained in the range up to \texttt{MGEigSolverPolyMax}.
After this, polynomial acceleration can be enabled, which should reduce setup time significantly.
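
For concreteness, the largest-eigenvalue probe described above might look like the following sketch. Only the eigensolver-related lines are shown (they belong in the same input section as the example above), the values are purely illustrative rather than recommendations, and comment lines are indicated here with \texttt{\#}:
\begin{verbatim}
# probe only the largest eigenvalue of the coarse-grid operator;
# run with DebugLevel = 3 so that the eigensolver prints the eigenvalues
MGEigSolverUsePolynomialAcceleration = no, no, no
MGEigSolverSpectrum = largest_real, largest_real, largest_real
MGEigSolverNumberOfVectors = 0, 0, 1
MGEigSolverKrylovSubspaceSize = 0, 0, 16
\end{verbatim}
The largest eigenvalue reported by this run, increased by roughly 20\%, then serves as \texttt{MGEigSolverPolyMax}, while the subsequent \texttt{smallest\_real} run determines \texttt{MGEigSolverPolyMin} as described above.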
26 changes: 20 additions & 6 deletions quda_interface.c
@@ -117,6 +117,8 @@
#include "tm_debug_printf.h"
#include "phmc.h"
#include "quda_gauge_paths.inc"
#include "io/gauge.h"
#include "measure_gauge_action.h"

// nstore is generally like a gauge id, for measurements it identifies the gauge field
// uniquely
@@ -1842,15 +1844,18 @@ void _setMGInvertParam(QudaInvertParam * mg_inv_param, const QudaInvertParam * c
mg_inv_param->cuda_prec = inv_param->cuda_prec;
mg_inv_param->cuda_prec_sloppy = inv_param->cuda_prec_sloppy;
mg_inv_param->cuda_prec_refinement_sloppy = inv_param->cuda_prec_refinement_sloppy;
mg_inv_param->cuda_prec_precondition = inv_param->cuda_prec_precondition;
mg_inv_param->cuda_prec_eigensolver = inv_param->cuda_prec_eigensolver;

mg_inv_param->clover_cpu_prec = inv_param->clover_cpu_prec;
mg_inv_param->clover_cuda_prec = inv_param->clover_cuda_prec;
mg_inv_param->clover_cuda_prec_sloppy = inv_param->clover_cuda_prec_sloppy;
mg_inv_param->clover_cuda_prec_refinement_sloppy = inv_param->clover_cuda_prec_refinement_sloppy;
mg_inv_param->clover_cuda_prec_precondition = inv_param->clover_cuda_prec_precondition;
mg_inv_param->clover_cuda_prec_eigensolver = inv_param->clover_cuda_prec_eigensolver;

// it seems that the MG-internal preconditioner and eigensolver need to be
// consistent with sloppy precision
mg_inv_param->cuda_prec_precondition = inv_param->cuda_prec_sloppy;
mg_inv_param->cuda_prec_eigensolver = inv_param->cuda_prec_sloppy;
mg_inv_param->clover_cuda_prec_precondition = inv_param->clover_cuda_prec_sloppy;
mg_inv_param->clover_cuda_prec_eigensolver = inv_param->clover_cuda_prec_sloppy;

mg_inv_param->clover_order = inv_param->clover_order;
mg_inv_param->gcrNkrylov = inv_param->gcrNkrylov;
@@ -2046,7 +2051,9 @@ void _setQudaMultigridParam(QudaMultigridParam* mg_param) {

mg_eig_param[level].n_ev = quda_input.mg_eig_nEv[level];
mg_eig_param[level].n_kr = quda_input.mg_eig_nKr[level];
mg_eig_param[level].n_conv = quda_input.mg_n_vec[level];
mg_eig_param[level].n_conv = quda_input.mg_eig_nEv[level]; // require convergence of all eigenvalues
mg_eig_param[level].n_ev_deflate = mg_eig_param[level].n_conv; // deflate all converged eigenvalues
// TODO expose this setting: mg_eig_param[level].batched_rotate = 128;
mg_eig_param[level].require_convergence = quda_input.mg_eig_require_convergence[level];

mg_eig_param[level].tol = quda_input.mg_eig_tol[level];
@@ -2236,9 +2243,16 @@ int invert_eo_degenerate_quda(spinor * const out,
rel_prec, even_odd_flag, solver_params,
sloppy_precision, compression, QpQm);
if (ret_value >= max_iter) {
char outname[200];
snprintf(outname, 200, "conf_mg_refresh_fail.%.6f.%04d", g_gauge_state.gauge_id, nstore);
paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(
measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
int status = write_gauge_field(outname, 64, xlfInfo);
free(xlfInfo);

char errmsg[200];
snprintf(errmsg, 200, "QUDA-MG solver failed to converge in %d iterations even after forced setup refresh. Terminating!",
max_iter);
fatal_error(errmsg, __func__);
return -1;
} else {
23 changes: 17 additions & 6 deletions solver/monomial_solve.c
@@ -82,13 +82,24 @@
#endif
#include "fatal_error.h"

#include <io/params.h>
#include <io/spinor.h>
#include "io/params.h"
#include "io/spinor.h"
#include "io/gauge.h"
#include "measure_gauge_action.h"

#ifdef TM_USE_QUDA
# include "quda_interface.h"
#endif

void solve_fail_write_config_and_abort(const char * const solver) {
char outname[200];
snprintf(outname, 200, "conf_monomial_solve_fail.%.6f.%04d", g_gauge_state.gauge_id, nstore);
paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
int status = write_gauge_field(outname, 64, xlfInfo);
free(xlfInfo);
fatal_error("Error: solver reported -1 iterations.", solver);
}

int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_params,
const int max_iter, double eps_sq, const int rel_prec,
const int N, matrix_mult f, int solver_type){
@@ -216,7 +227,7 @@ int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_
tm_stopwatch_pop(&g_timers, 0, 1, "");

if (iteration_count == -1 && g_barrier_monomials_convergence) {
fatal_error("Error: solver reported -1 iterations.", "solve_degenerate");
solve_fail_write_config_and_abort("solve_degenerate");
}

return (iteration_count);
@@ -425,7 +436,7 @@ int solve_mms_tm(spinor ** const P, spinor * const Q,
tm_stopwatch_pop(&g_timers, 0, 1, "");

if (iteration_count == -1 && g_barrier_monomials_convergence) {
fatal_error("Error: solver reported -1 iterations.", "solve_mms_tm");
solve_fail_write_config_and_abort("solve_mms_tm");
}

return(iteration_count);
@@ -671,7 +682,7 @@ int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn,
tm_stopwatch_pop(&g_timers, 0, 1, "");

if (iteration_count == -1 && g_barrier_monomials_convergence) {
fatal_error("Error: solver reported -1 iterations.", "solve_mms_nd");
solve_fail_write_config_and_abort("solve_mms_nd");
}

return (iteration_count);
@@ -726,7 +737,7 @@ int solve_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn,
tm_stopwatch_pop(&g_timers, 0, 1, "");

if (iteration_count == -1 && g_barrier_monomials_convergence) {
fatal_error("Error: solver reported -1 iterations.", "solve_mms_nd_plus");
solve_fail_write_config_and_abort("solve_mms_nd_plus");
}

return iteration_count;
