From ccbb1e82af78a734101c18f5bb2a54b32c116c73 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Thu, 12 Dec 2024 00:24:02 -0500
Subject: [PATCH] GaussianConvolution1D supports range restriction on kernel

---
 src/madness/mra/convolution1d.h | 151 ++++++++++++++++++++++++--------
 1 file changed, 114 insertions(+), 37 deletions(-)
diff --git a/src/madness/mra/convolution1d.h b/src/madness/mra/convolution1d.h
index 9ed2d28b553..dc96b67ccc5 100644
--- a/src/madness/mra/convolution1d.h
+++ b/src/madness/mra/convolution1d.h
@@ -41,10 +41,11 @@
 #include <madness/mra/twoscale.h>
 #include <madness/tensor/aligned.h>
 #include <madness/tensor/tensor_lapack.h>
+#include <madness/misc/kahan_accumulator.h>
 #include <algorithm>
 
 /// \file mra/convolution1d.h
-/// \brief Compuates most matrix elements over 1D operators (including Gaussians)
+/// \brief Computes most matrix elements over 1D operators (including Gaussians)
 
 /// \ingroup function
 
@@ -260,28 +261,38 @@ namespace madness {
         int k;          ///< Wavelet order
         int npt;        ///< Number of quadrature points (is this used?)
         int maxR;       ///< Number of lattice translations for sum
+        double bloch_k;  ///< k in exp(i k R) Bloch phase factor folded into lattice sum
+        unsigned int D;  ///< kernel range limited to [-D/2,D/2] (in simulation cell units), useful for finite-range convolutions with periodic functions; for infinite-range use lattice summation (maxR > 0)
         Tensor<double> quad_x;
         Tensor<double> quad_w;
         Tensor<double> c;
         Tensor<double> hgT, hg;
         Tensor<double> hgT2k;
-        double bloch_k;  ///< k in exp(i k R) Bloch phase factor folded into lattice sum
 
         mutable SimpleCache<Tensor<Q>, 1> rnlp_cache;
         mutable SimpleCache<Tensor<Q>, 1> rnlij_cache;
         mutable SimpleCache<ConvolutionData1D<Q>, 1> ns_cache;
         mutable SimpleCache<ConvolutionData1D<Q>, 2> mod_ns_cache;
 
+        static unsigned int maxD() { return std::numeric_limits<unsigned int>::max(); }  ///< sentinel value of D meaning "kernel range is not limited"
+        bool lattice_summed() const { return maxR != 0; }  ///< true if maxR is nonzero, i.e. lattice translations are summed over
+        bool range_limited() const { return D != maxD(); }  ///< true if D differs from the maxD() sentinel, i.e. the kernel range is restricted
+
         virtual ~Convolution1D() {};
 
-        Convolution1D(int k, int npt, int maxR, double bloch_k = 0.0)
+        Convolution1D(int k, int npt, int maxR,
+                      double bloch_k = 0.0,
+                      unsigned int D = maxD())
                 : k(k)
                 , npt(npt)
                 , maxR(maxR)
                 , quad_x(npt)
                 , quad_w(npt)
                 , bloch_k(bloch_k)
+                , D(D)
         {
+            if (range_limited()) MADNESS_CHECK(!lattice_summed());
+
             auto success = autoc(k,&c);
             MADNESS_CHECK(success);
 
@@ -306,22 +317,39 @@ namespace madness {
 
         /// Returns true if the block of rnlp is expected to be small including periodicity
         bool get_issmall(Level n, Translation lx) const {
-            if (maxR == 0) {
-                return issmall(n, lx);
+          if (lattice_summed()) {  // small only if every translated image R*2^n + lx, |R| <= maxR, is small
+            Translation twon = Translation(1) << n;
+            for (int R = -maxR; R <= maxR; ++R) {
+              if (!issmall(n, R * twon + lx))
+                return false;
             }
+            return true;
+          } else { // !lattice_summed
+            if (!range_limited())  // no lattice sum and no range limit: defer to issmall() alone
+              return issmall(n, lx);
             else {
-                Translation twon = Translation(1)<<n;
-                for (int R=-maxR; R<=maxR; ++R) {
-                    if (!issmall(n, R*twon+lx)) return false;
-                }
-                return true;
+              return outside_the_range(n, lx) || issmall(n, lx);  // range-limited: boxes outside the range are treated as small
+            }
+          }
+        }
+
+        /// @return true if box \p lx at level \p n is treated as outside of the kernel range limit \p D; always false if the kernel is not range-limited
+        bool outside_the_range(Level n, Translation lx) const {
+          bool result = false;  // not range-limited => nothing is outside the range (was left uninitialized)
+          if (range_limited()) {
+            if (n == 0) {  // level 0: only lx = -1 and lx = 0 are treated as in range
+              result = lx > 0 || lx < -1;
+            } else { // n > 0; shift in Translation width (as get_issmall does) so that 2^(n-1) cannot overflow int
+              if (lx >= 0)
+                result = (Translation(1) << (n - 1)) * Translation(D) <= lx;
+              else
+                result = (-(Translation(1) << (n - 1)) * Translation(D)) > lx;
             }
+          }
+          return result;
         }
 
         /// Returns the level for projection
-        //virtual Level natural_level() const {
-        //    return 13;
-        //}
         virtual Level natural_level() const {return 13;}
 
         /// Computes the transition matrix elements for the convolution for n,l
@@ -333,6 +361,7 @@ namespace madness {
         /// This is computed from the matrix elements over the correlation
         /// function which in turn are computed from the matrix elements
         /// over the double order legendre polynomials.
+        /// \note if `this->range_limited()==true`, `θ(D/2 - |x-y|) K(x-y)` is used as the kernel
         const Tensor<Q>& rnlij(Level n, Translation lx, bool do_transpose=false) const {
             const Tensor<Q>* p=rnlij_cache.getptr(n,lx);
             if (p) return *p;
@@ -517,7 +546,7 @@ namespace madness {
             else {
                 // PROFILE_BLOCK(Convolution1Drnlp); // Too fine grain for routine profiling
 
-                if (maxR > 0) {
+                if (lattice_summed()) {
                     Translation twon = Translation(1)<<n;
                     r = Tensor<Q>(2*k);
                     for (int R=-maxR; R<=maxR; ++R) {
@@ -631,7 +660,7 @@ namespace madness {
             }
         }
 
-        virtual Level natural_level() const {return op.natural_level();}
+        virtual Level natural_level() const final {return op.natural_level();}
 
         struct Shmoo {
             typedef Tensor<Q> returnT;
@@ -654,12 +683,12 @@ namespace madness {
             }
         };
 
-        Tensor<Q> rnlp(Level n, Translation lx) const {
+        Tensor<Q> rnlp(Level n, Translation lx) const final {
             return adq1(lx, lx+1, Shmoo(n, lx, this), 1e-12,
                         this->npt, this->quad_x.ptr(), this->quad_w.ptr(), 0);
         }
 
-        bool issmall(Level n, Translation lx) const {
+        bool issmall(Level n, Translation lx) const final {
             if (lx < 0) lx = 1 - lx;
             // Always compute contributions to nearest neighbor coupling
             // ... we are two levels below so 0,1 --> 0,1,2,3 --> 0,...,7
@@ -696,8 +725,9 @@ namespace madness {
         const int m;            ///< Order of derivative (0, 1, or 2 only)
 
         explicit GaussianConvolution1D(int k, Q coeff, double expnt,
-        		int m, bool periodic, double bloch_k = 0.0)
-            : Convolution1D<Q>(k,k+11,maxR(periodic,expnt),bloch_k)
+        		int m, bool periodic, double bloch_k = 0.0,
+                        unsigned int D = Convolution1D<Q>::maxD())
+            : Convolution1D<Q>(k,k+11,maxR(periodic,expnt),bloch_k, D)
             , coeff(coeff)
             , expnt(expnt)
             , natlev(Level(0.5*log(expnt)/log(2.0)+1))
@@ -715,7 +745,7 @@ namespace madness {
 
         virtual ~GaussianConvolution1D() {}
 
-        virtual Level natural_level() const {
+        virtual Level natural_level() const final {
             return natlev;
         }
 
@@ -736,12 +766,37 @@ namespace madness {
         /// \code
         /// beta = alpha * 2^(-2*n)
         /// \endcode
-        Tensor<Q> rnlp(Level n, Translation lx) const {
+        Tensor<Q> rnlp(Level n, const Translation lx) const final {
             int twok = 2*this->k;
             Tensor<Q> v(twok);       // Can optimize this away by passing in
-
-            Translation lkeep = lx;
-            if (lx<0) lx = -lx-1;
+            KahanAccumulator<Q> v_accumulator[twok];  // NOTE(review): twok = 2*k is a runtime value, so this is a variable-length array (compiler extension, not ISO C++), and it is constructed even while use_kahan is false
+            constexpr bool use_kahan = false;  // change to true to use Kahan accumulator
+
+            // if outside the range, early return, else update the integration limits
+            std::pair<double, double> integration_limits{0,1};  // limits are offsets from lx; default covers the whole box [lx, lx+1]
+            if (this->range_limited()) {
+              const auto two_to_nm1 = (1ul << n) * 0.5;  // 2^(n-1) as a double
+              if (lx < 0) {
+                integration_limits = std::make_pair(
+                    std::min(std::max(-two_to_nm1 * this->D - lx, 0.), 1.), 1.);  // lower limit: -2^(n-1)*D - lx clamped to [0,1]
+              } else {
+                integration_limits = std::make_pair(
+                    0., std::max(std::min(two_to_nm1 * this->D - lx, 1.), 0.));  // upper limit: 2^(n-1)*D - lx clamped to [0,1]
+              }
+              // early return if empty integration range (this indicates that
+              // the range restriction makes the kernel zero everywhere in the box)
+              if (integration_limits.first == integration_limits.second) {
+                MADNESS_ASSERT(this->outside_the_range(n, lx));  // an empty range is expected to coincide with outside_the_range()
+                return v;
+              }
+              else {
+                MADNESS_ASSERT(!this->outside_the_range(n, lx));
+              }
+            }
+            // integration range lower bound, upper bound, length
+            const auto x0 = integration_limits.first;
+            const auto x1 = integration_limits.second;
+            const auto L = x1 - x0;
 
             /* Apply high-order Gauss Legendre onto subintervals
 
@@ -780,7 +835,7 @@ namespace madness {
             double h = 1.0/sqrt(beta);  // 2.0*sqrt(0.5/beta);
             long nbox = long(1.0/h);
             if (nbox < 1) nbox = 1;
-            h = 1.0/nbox;
+            h = L/nbox;
 
             // Find argmax such that h*scaledcoeff*exp(-argmax)=1e-22 ... if
             // beta*xlo*xlo is already greater than argmax we can neglect this
@@ -793,10 +848,29 @@ namespace madness {
             else if (m == 2) sch *= expnt*expnt;
             double argmax = std::abs(log(1e-22/sch)); // perhaps should be -log(1e-22/sch) ?
 
-            for (long box=0; box<nbox; ++box) {
-                double xlo = box*h + lx;
-                if (beta*xlo*xlo > argmax) break;
-                for (long i=0; i<this->npt; ++i) {
+            // to screen need to iterate over boxes in the order of decreasing kernel values
+            const bool left_to_right = lx >= 0;
+            // if going left-to-right, start at left, else at right
+            const double xstartedge = left_to_right ? x0+lx : lx + 1;
+
+            // with oscillatory integrands the heuristic for reducing roundoff
+            // is to sum from large to small, i.e. proceed in same direction as the order of boxes
+            // WARNING: the grid points in quad_{x,w} are in order of decreasing x!
+            // hence decrement grid point indices for left_to_right, increment otherwise
+            const long first_pt = left_to_right ? this->npt-1: 0;  // index of the first quadrature point visited in each box
+            const long sentinel_pt = left_to_right ? -1 : this->npt;  // one-past-the-last index in the traversal direction
+            const auto next_pt = [left_to_right](auto i) { return left_to_right ? i-1 : i+1; };  // steps one point in the traversal direction; only left_to_right is needed, so lx is no longer captured
+
+            double xlo = left_to_right ? xstartedge : xstartedge-h;
+            double xhi;
+            for (long box=0; box!=nbox; ++box, xlo = (left_to_right ? xhi : xlo-h)) {
+
+                // can ignore this and rest of boxes if the Gaussian has decayed enough at the side of the box closest to the origin
+                xhi=xlo+h;
+                const auto xabs_min = std::min(std::abs(xhi),std::abs(xlo));
+                if (beta*xabs_min*xabs_min > argmax) break;
+
+                for (long i=first_pt; i!=sentinel_pt; i=next_pt(i)) {
 #ifdef IBMXLC
                     double phix[80];
 #else
@@ -814,22 +888,25 @@ namespace madness {
                     }
 
                     legendre_scaling_functions(xx-lx,twok,phix);
-                    for (long p=0; p<twok; ++p) v(p) += ee*phix[p];
+                    for (long p=0; p<twok; ++p) {
+                      if constexpr (use_kahan)
+                        v_accumulator[p] += ee * phix[p];
+                      else
+                        v(p) += ee * phix[p];
+                    }
                 }
             }
 
-            if (lkeep < 0) {
-                /* phi[p](1-z) = (-1)^p phi[p](z) */
-                if (m == 1)
-                    for (long p=0; p<twok; ++p) v(p) = -v(p);
-                for (long p=1; p<twok; p+=2) v(p) = -v(p);
+            if constexpr (use_kahan) {
+              for (long p = 0; p < twok; ++p)
+                v(p) = static_cast<Q>(v_accumulator[p]);
             }
 
             return v;
-        };
+        }
 
         /// Returns true if the block is expected to be small
-        bool issmall(Level n, Translation lx) const {
+        bool issmall(Level n, Translation lx) const final {
             double beta = expnt * pow(0.25,double(n));
             Translation ll;
             if (lx > 0)