From 657e7486132b9b49a3187c73fd36a048bb038bea Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Mon, 16 Jan 2023 17:18:46 +0100
Subject: [PATCH 01/41] Add error checking to CUDA version of getNeighborPairs

---
 src/pytorch/neighbors/TestNeighbors.py        |  2 +-
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 28 +++++++++++++++++--
 2 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index 2b1365ab..7ec159ae 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -135,7 +135,7 @@ def test_neighbor_grads(dtype, num_atoms, grad):
 # The following test is only run on the CPU.  Running it on the GPU triggers a
 # CUDA assertion, which causes all tests run after it to fail.
 
-@pytest.mark.parametrize('device', ['cpu'])
+@pytest.mark.parametrize('device', ['cpu', 'cuda'])
 @pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
 def test_too_many_neighbors(device, dtype):
 
diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 2d820a4a..63ea9454 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -26,6 +26,8 @@ template <typename scalar_t> __device__ __forceinline__ scalar_t sqrt_(scalar_t
 template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); };
 template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); };
 
+__device__ __managed__ int32_t tooManyNeighborsErrorFlag; //Error flag for forward_kernel
+
 template <typename scalar_t> __global__ void forward_kernel(
     const int32_t num_all_pairs,
     const Accessor<scalar_t, 2> positions,
@@ -64,7 +66,12 @@ template <typename scalar_t> __global__ void forward_kernel(
     if (distance2 > cutoff2) return;
 
     const int32_t i_pair = store_all_pairs ? index : atomicAdd(&i_curr_pair[0], 1);
-    assert(i_pair < neighbors.size(1));
+    //If the maximum number of neighbours is surpassed encode the
+    //number of pairs found in a flag and exit
+    if(i_pair >= neighbors.size(1)){
+      atomicMin(&tooManyNeighborsErrorFlag, -i_pair);
+      return;
+    }
 
     neighbors[0][i_pair] = row;
     neighbors[1][i_pair] = column;
@@ -135,11 +142,19 @@ public:
         const Tensor neighbors = full({2, num_pairs}, -1, options.dtype(kInt32));
         const Tensor deltas = full({num_pairs, 3}, NAN, options);
         const Tensor distances = full(num_pairs, NAN, options);
-
+	//Advice CUDA on expected usage of the error flag
+	//cudaStreamAttachMemAsync(stream, &tooManyNeighborsErrorFlag);
+	cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int),
+	 	      cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
+	cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int),
+		      cudaMemAdviseSetReadMostly, 0);
         AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() {
             const CUDAStreamGuard guard(stream);
+	    tooManyNeighborsErrorFlag = 0;
             const scalar_t cutoff_ = cutoff.to<scalar_t>();
             TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive");
+	    cudaEvent_t event;
+	    cudaEventCreateWithFlags(&event, cudaEventDisableTiming | cudaEventBlockingSync);
             forward_kernel<<<num_blocks, num_threads, 0, stream>>>(
                 num_all_pairs,
                 get_accessor<scalar_t, 2>(positions),
@@ -151,6 +166,13 @@ public:
                 get_accessor<scalar_t, 2>(deltas),
                 get_accessor<scalar_t, 1>(distances),
                 get_accessor<scalar_t, 2>(box_vectors));
+	    cudaEventRecord(event, stream);
+	    cudaEventSynchronize(event);
+	    //Check the error flag
+	    TORCH_CHECK(tooManyNeighborsErrorFlag == 0, "Some particle has too many neighbours, found " +
+			std::to_string(-tooManyNeighborsErrorFlag) + " but max is " +
+			std::to_string(max_num_neighbors.toInt()));
+	    cudaEventDestroy(event);
         });
 
         ctx->save_for_backward({neighbors, deltas, distances});
@@ -197,4 +219,4 @@ TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) {
             const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors, box_vectors);
             return make_tuple(results[0], results[1], results[2]);
     });
-}
\ No newline at end of file
+}

From 53395568cb96684a05027ad69cf0098172a1bf10 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Thu, 19 Jan 2023 14:55:09 +0100
Subject: [PATCH 02/41] Add a new bool optional parameter to getNeighborPairs,
 setting it to true will force the function to synchronize and throw an
 exception if some error was found, so it can be catched. The default will
 throw the error asynchronously, which will crash the program. In both cases a
 meaningful message is printed.

---
 src/pytorch/neighbors/TestNeighbors.py        |  4 +-
 src/pytorch/neighbors/getNeighborPairs.py     | 13 ++-
 src/pytorch/neighbors/getNeighborPairsCPU.cpp |  9 +-
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 93 ++++++++++++++-----
 src/pytorch/neighbors/neighbors.cpp           |  4 +-
 5 files changed, 91 insertions(+), 32 deletions(-)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index 7ec159ae..2391f648 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -145,9 +145,11 @@ def test_too_many_neighbors(device, dtype):
     # 4 points result into 6 pairs, but there is a storage just for 4.
     with pytest.raises(RuntimeError):
         positions = pt.zeros((4, 3,), device=device, dtype=dtype)
-        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1)
+        # Omitting check_errors makes the exception non-catchable
+        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True)
         pt.cuda.synchronize()
 
+
 @pytest.mark.parametrize('device', ['cpu', 'cuda'])
 @pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
 def test_periodic_neighbors(device, dtype):
diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 55d20d48..5b8a0f29 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -2,7 +2,7 @@
 from typing import Optional, Tuple
 
 
-def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = -1, box_vectors: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
+def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = -1, box_vectors: Optional[Tensor] = None, check_errors: Optional[bool] = False) -> Tuple[Tensor, Tensor]:
     '''
     Returns indices and distances of atom pairs within a given cutoff distance.
 
@@ -44,6 +44,12 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int =
         The vectors defining the periodic box.  This must have shape `(3, 3)`,
         where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`.
         If this is omitted, periodic boundary conditions are not applied.
+    check_errors: bool, optional
+        If set to True the function is guaranteed to throw if an error
+        is encountered, synchronizing if necessary.
+        If set to False, an error arising in this function might crash
+        the code at some point after calling it.
+        Defaults to False.
 
     Returns
     -------
@@ -75,6 +81,8 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int =
     The CUDA implementation returns the atom pairs in non-determinist order,
     if `max_num_neighbors > 0`.
 
+    The check_errors argument is forced to False if CUDA graphs are used.
+
     Examples
     --------
     >>> import torch as pt
@@ -119,8 +127,9 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int =
              [nan, nan, nan],
              [nan, nan, nan]]),
      tensor([1., 1., nan, nan, nan, nan]))
+
     '''
 
     if box_vectors is None:
         box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype)
-    return ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors)
\ No newline at end of file
+    return ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors, check_errors)
diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
index 19dfa7d8..5d6e04e2 100644
--- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp
+++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
@@ -96,5 +96,10 @@ static tuple<Tensor, Tensor, Tensor> forward(const Tensor& positions,
 }
 
 TORCH_LIBRARY_IMPL(neighbors, CPU, m) {
-    m.impl("getNeighborPairs", &forward);
-}
\ No newline at end of file
+  m.impl("getNeighborPairs",
+	   [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors,
+	      const Tensor& box_vectors, const bool &checkErrors){
+	   //The checkErrors flag is ignored, this function always checks for errors synchronously
+	   return forward(positions, cutoff, max_num_neighbors, box_vectors);
+	 });
+}
diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 63ea9454..49cb7e2d 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -1,5 +1,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
+#include <exception>
+#include <stdexcept>
 #include <torch/extension.h>
 #include <algorithm>
 #include <tuple>
@@ -104,13 +106,47 @@ template <typename scalar_t> __global__ void backward_kernel(
     atomicAdd(&grad_positions[i_atom][i_comp], grad);
 }
 
+namespace detail{
+  static std::exception_ptr tooManyNeighborsException = nullptr;
+  // Checks the  too many  neighbors flag and  stores an  exception if
+  // necessary to detail::tooManyNeighborsException.  This function is
+  // intended to be launched via cudaLaunchHostFunc.
+  //data  is a  void pointer  to a  std::tuple<int,bool>, storing  the
+  // maximum number  of neighbors and  whether to throw  an uncatchable
+  // exception here (false) or store it for later (true).
+  void CUDART_CB checkTooManyNeighbors(void *data){
+    int max_num_neighbors;
+    bool checkErrors;
+    std::tie(max_num_neighbors, checkErrors) = *static_cast<std::tuple<int, bool>*>(data);
+    // An exception thrown  in a stream callback is  not catchable (it
+    // runs in another thread), so we store it in an exception_ptr for
+    // it  to be  processed sometime  later in  the main  thread.  For
+    // performance   reasons,    the   exception   is    thrown   here
+    // asynchronously if the checkErrors flag is set to false
+    try{
+      const int tooMan = tooManyNeighborsErrorFlag;
+      TORCH_CHECK(tooMan == 0,
+		  "Some particle has too many neighbors, found " +
+		  std::to_string(-tooMan) + " but max is " +
+		  std::to_string(max_num_neighbors));
+    }
+    catch(...){
+      if(not checkErrors)
+	throw;
+      else
+	tooManyNeighborsException = std::current_exception();
+    }
+  }
+}
+
 class Autograd : public Function<Autograd> {
 public:
     static tensor_list forward(AutogradContext* ctx,
                                const Tensor& positions,
                                const Scalar& cutoff,
                                const Scalar& max_num_neighbors,
-                               const Tensor& box_vectors) {
+                               const Tensor& box_vectors,
+			       bool checkErrors) {
 
         TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions");
         TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0");
@@ -143,18 +179,15 @@ public:
         const Tensor deltas = full({num_pairs, 3}, NAN, options);
         const Tensor distances = full(num_pairs, NAN, options);
 	//Advice CUDA on expected usage of the error flag
-	//cudaStreamAttachMemAsync(stream, &tooManyNeighborsErrorFlag);
 	cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int),
-	 	      cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
+		      cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
 	cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int),
 		      cudaMemAdviseSetReadMostly, 0);
+	tooManyNeighborsErrorFlag = 0;
+	const CUDAStreamGuard guard(stream);
         AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() {
-            const CUDAStreamGuard guard(stream);
-	    tooManyNeighborsErrorFlag = 0;
-            const scalar_t cutoff_ = cutoff.to<scalar_t>();
-            TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive");
-	    cudaEvent_t event;
-	    cudaEventCreateWithFlags(&event, cudaEventDisableTiming | cudaEventBlockingSync);
+	  const scalar_t cutoff_ = cutoff.to<scalar_t>();
+	  TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive");
             forward_kernel<<<num_blocks, num_threads, 0, stream>>>(
                 num_all_pairs,
                 get_accessor<scalar_t, 2>(positions),
@@ -166,18 +199,26 @@ public:
                 get_accessor<scalar_t, 2>(deltas),
                 get_accessor<scalar_t, 1>(distances),
                 get_accessor<scalar_t, 2>(box_vectors));
-	    cudaEventRecord(event, stream);
-	    cudaEventSynchronize(event);
-	    //Check the error flag
-	    TORCH_CHECK(tooManyNeighborsErrorFlag == 0, "Some particle has too many neighbours, found " +
-			std::to_string(-tooManyNeighborsErrorFlag) + " but max is " +
-			std::to_string(max_num_neighbors.toInt()));
-	    cudaEventDestroy(event);
         });
-
-        ctx->save_for_backward({neighbors, deltas, distances});
+	//Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs
+	cudaHostFn_t h_fn = detail::checkTooManyNeighbors;
+	static std::tuple<int, bool> h_fn_data;
+	h_fn_data = {max_num_neighbors_, checkErrors};
+	cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data);
+	//Errors are thrown as exceptions  asynchronously and in a way
+	//compatible with CUDA graphs.   However, this way of throwing
+	//an exception  makes it  uncatchable, crashing the  code.  If
+	//the   checkErrors  flag   is   set  to   true  an   explicit
+	//synchronization barrier  here forces to throw  the exception
+	//from the main thread, making  it catchable at the expense of
+	//a performance penalty each time the function is called.
+	if(checkErrors){
+	  cudaStreamSynchronize(stream);
+	  if(detail::tooManyNeighborsException)
+	    std::rethrow_exception(detail::tooManyNeighborsException);
+	}
+	ctx->save_for_backward({neighbors, deltas, distances});
         ctx->saved_data["num_atoms"] = num_atoms;
-
         return {neighbors, deltas, distances};
     }
 
@@ -209,14 +250,16 @@ public:
                 get_accessor<scalar_t, 2>(grad_positions));
         });
 
-        return {grad_positions, Tensor(), Tensor(), Tensor()};
+        return {grad_positions, Tensor(), Tensor(), Tensor(), Tensor()};
       }
 };
 
 TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) {
-    m.impl("getNeighborPairs",
-        [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, const Tensor& box_vectors){
-            const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors, box_vectors);
-            return make_tuple(results[0], results[1], results[2]);
-    });
+  m.impl("getNeighborPairs",
+	 [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors,
+	    const Tensor& box_vectors, const bool &checkErrors){
+	   const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors,
+						       box_vectors, checkErrors);
+	   return make_tuple(results[0], results[1], results[2]);
+	 });
 }
diff --git a/src/pytorch/neighbors/neighbors.cpp b/src/pytorch/neighbors/neighbors.cpp
index d8dd5c5b..4614160d 100644
--- a/src/pytorch/neighbors/neighbors.cpp
+++ b/src/pytorch/neighbors/neighbors.cpp
@@ -1,5 +1,5 @@
 #include <torch/extension.h>
 
 TORCH_LIBRARY(neighbors, m) {
-    m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors) -> (Tensor neighbors, Tensor deltas, Tensor distances)");
-}
\ No newline at end of file
+    m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors, bool checkErrors) -> (Tensor neighbors, Tensor deltas, Tensor distances)");
+}

From 8c1395269ff6bf8535959a3b8fd7257da162ece1 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Thu, 19 Jan 2023 15:08:41 +0100
Subject: [PATCH 03/41] Remove unnecessarily static variable

---
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 49cb7e2d..5a229fb3 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -202,8 +202,7 @@ public:
         });
 	//Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs
 	cudaHostFn_t h_fn = detail::checkTooManyNeighbors;
-	static std::tuple<int, bool> h_fn_data;
-	h_fn_data = {max_num_neighbors_, checkErrors};
+	std::tuple<int, bool> h_fn_data = {max_num_neighbors_, checkErrors};
 	cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data);
 	//Errors are thrown as exceptions  asynchronously and in a way
 	//compatible with CUDA graphs.   However, this way of throwing

From 928f12356683d58d8145176e195ec915fcefc75e Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 3 Mar 2023 12:33:22 +0100
Subject: [PATCH 04/41] Change the error handling of getNeighborPairs. - Add a
 new optional flag, sync_exceptions on top of the current check_errors. -
 Three behaviors are possible:   1. Default (both false). Operation is
 CUDA-graph compatible and an   uncatchable exception is thrown in case of
 number of pairs being too   high.   2. check_errors=True. Operation is
 CUDA-graph compatible. No   exception is thrown and the number of found pairs
 is returned, which   can be higher than max_number_pairs.   3.
 check_errors=False and sync_exceptions=True. Operation is NOT   CUDA-graph
 compatible. The operation synchronizes to check for   errors and throws a
 catchable exception if necessary.

---
 src/pytorch/neighbors/TestNeighbors.py        | 18 +++-
 src/pytorch/neighbors/getNeighborPairs.py     | 48 ++++++---
 src/pytorch/neighbors/getNeighborPairsCPU.cpp | 27 ++---
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 98 ++++++++++---------
 src/pytorch/neighbors/neighbors.cpp           |  2 +-
 5 files changed, 115 insertions(+), 78 deletions(-)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index 2391f648..6c3ef655 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -138,17 +138,25 @@ def test_neighbor_grads(dtype, num_atoms, grad):
 @pytest.mark.parametrize('device', ['cpu', 'cuda'])
 @pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
 def test_too_many_neighbors(device, dtype):
-
     if not pt.cuda.is_available() and device == 'cuda':
         pytest.skip('No GPU')
-
     # 4 points result into 6 pairs, but there is a storage just for 4.
+    positions = pt.zeros((4, 3,), device=device, dtype=dtype)
     with pytest.raises(RuntimeError):
-        positions = pt.zeros((4, 3,), device=device, dtype=dtype)
-        # Omitting check_errors makes the exception non-catchable
-        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True)
+        # checkErrors = False will throw due to exceeding neighbours
+        # syncExceptions = True makes  this exception catchable at the
+        # expense of performance (even when no error ocurred)
+        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False, sync_exceptions=True)
         pt.cuda.synchronize()
 
+    # checkErrors = True will never throw due to exceeding neighbours,
+    # but  will return  the number  of pairs  found.
+    # syncExceptions is ignored in this case
+    neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True)
+    assert number_found_pairs == 6
+
+
+
 
 @pytest.mark.parametrize('device', ['cpu', 'cuda'])
 @pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 5b8a0f29..052741d6 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -2,9 +2,14 @@
 from typing import Optional, Tuple
 
 
-def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = -1, box_vectors: Optional[Tensor] = None, check_errors: Optional[bool] = False) -> Tuple[Tensor, Tensor]:
-    '''
-    Returns indices and distances of atom pairs within a given cutoff distance.
+def getNeighborPairs(
+        positions: Tensor,
+        cutoff: float,
+        max_num_neighbors: int = -1,
+        box_vectors: Optional[Tensor] = None,
+        check_errors: Optional[bool] = False,
+        sync_exceptions: Optional[bool] = False) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]:
+    '''Returns indices and distances of atom pairs within a given cutoff distance.
 
     If `max_num_neighbors == -1` (default), all the atom pairs are returned,
     i.e. `num_pairs = num_atoms * (num_atoms + 1) / 2`. This is intended for
@@ -45,12 +50,18 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int =
         where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`.
         If this is omitted, periodic boundary conditions are not applied.
     check_errors: bool, optional
-        If set to True the function is guaranteed to throw if an error
-        is encountered, synchronizing if necessary.
-        If set to False, an error arising in this function might crash
-        the code at some point after calling it.
+        If set to True the function does not throw due to a number of pairs larger than the maximum.
+        If set to False, an exception will be thrown in that case.
+        Defaults to False.
+    sync_exceptions: bool, optional
+        If  set to  True the  function will  synchronize to  check for
+        errors  and  throw  an  exception  in  the  caller  thread  if
+        necessary.
+        If set  to False it  is possible  that an exception  thrown by
+        this function cannot bbe catched and result in a crash.
+        This flag is ignored if check_errors is True.
+        This flag must be False for the getNeighborPairs operation to be CUDA graph compatible.
         Defaults to False.
-
     Returns
     -------
     neighbors: `torch.Tensor`
@@ -69,19 +80,26 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int =
         If an atom pair is separated by a larger distance than the cutoff,
         the distance is set to `NaN`.
 
+    number_found_pairs: `Optional[torch.Tensor]`
+        Present  if check_errors=True.  Contains the  total number  of
+        pairs    found,    which    might   exceed    the    requested
+        max_num_neighbors,  leaving  the  rest  of the  output  in  an
+        undefined state.
+
     Exceptions
     ----------
-    If `max_num_neighbors > 0` and too small, `RuntimeError` is raised.
+    If `max_num_neighbors > 0` and too small, `RuntimeError` is raised unless check_errors=True.
 
     Note
     ----
-    The operation is compatible with CUDA Grahps, i.e. the shapes of the output
-    tensors are independed of the values of input tensors.
+    The operation can be compatible with CUDA Grahps, i.e. the shapes of the output
+    tensors are independed of the values of input tensors and no synchronizing operation is performed.
+
+    For this to be the case sync_exceptions must be False.
 
     The CUDA implementation returns the atom pairs in non-determinist order,
     if `max_num_neighbors > 0`.
 
-    The check_errors argument is forced to False if CUDA graphs are used.
 
     Examples
     --------
@@ -132,4 +150,8 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int =
 
     if box_vectors is None:
         box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype)
-    return ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors, check_errors)
+    neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions)
+    if check_errors is True:
+        return neighbors, deltas, distances, number_found_pairs
+    else:
+        return neighbors, deltas, distances
diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
index 5d6e04e2..c5bfb0c5 100644
--- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp
+++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
@@ -16,10 +16,11 @@ using torch::Tensor;
 using torch::outer;
 using torch::round;
 
-static tuple<Tensor, Tensor, Tensor> forward(const Tensor& positions,
-                                             const Scalar& cutoff,
-                                             const Scalar& max_num_neighbors,
-                                             const Tensor& box_vectors) {
+static tuple<Tensor, Tensor, Tensor, Tensor> forward(const Tensor& positions,
+						     const Scalar& cutoff,
+						     const Scalar& max_num_neighbors,
+						     const Tensor& box_vectors,
+						     bool checkErrors) {
 
     TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions");
     TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0");
@@ -82,24 +83,26 @@ static tuple<Tensor, Tensor, Tensor> forward(const Tensor& positions,
         distances = distances.index({mask});
 
         const int num_pad = num_atoms * max_num_neighbors_ - distances.size(0);
-        TORCH_CHECK(num_pad >= 0,
-            "The maximum number of pairs has been exceed! Increase \"max_num_neighbors\"");
-
+        if (!checkErrors) {
+            TORCH_CHECK(num_pad >= 0,
+                "The maximum number of pairs has been exceed! Increase \"max_num_neighbors\"");
+        }
         if (num_pad > 0) {
             neighbors = hstack({neighbors, full({2, num_pad}, -1, neighbors.options())});
             deltas = vstack({deltas, full({num_pad, 3}, NAN, deltas.options())});
             distances = hstack({distances, full({num_pad}, NAN, distances.options())});
         }
     }
-
-    return {neighbors, deltas, distances};
+    Tensor num_pairs_found = torch::empty(1, indices.options().dtype(kInt32));
+    num_pairs_found[0] = distances.size(0);
+    return {neighbors, deltas, distances, num_pairs_found};
 }
 
 TORCH_LIBRARY_IMPL(neighbors, CPU, m) {
   m.impl("getNeighborPairs",
 	   [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors,
-	      const Tensor& box_vectors, const bool &checkErrors){
-	   //The checkErrors flag is ignored, this function always checks for errors synchronously
-	   return forward(positions, cutoff, max_num_neighbors, box_vectors);
+	      const Tensor& box_vectors, const bool &checkErrors, const bool &syncExceptions){
+	       //The syncExceptions flag is ignored, this function always throws synchronously
+	       return forward(positions, cutoff, max_num_neighbors, box_vectors, checkErrors);
 	 });
 }
diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 5a229fb3..639db5f5 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -106,37 +106,35 @@ template <typename scalar_t> __global__ void backward_kernel(
     atomicAdd(&grad_positions[i_atom][i_comp], grad);
 }
 
-namespace detail{
-  static std::exception_ptr tooManyNeighborsException = nullptr;
-  // Checks the  too many  neighbors flag and  stores an  exception if
-  // necessary to detail::tooManyNeighborsException.  This function is
-  // intended to be launched via cudaLaunchHostFunc.
-  //data  is a  void pointer  to a  std::tuple<int,bool>, storing  the
-  // maximum number  of neighbors and  whether to throw  an uncatchable
-  // exception here (false) or store it for later (true).
-  void CUDART_CB checkTooManyNeighbors(void *data){
+static std::exception_ptr tooManyNeighborsException = nullptr;
+// Checks the  too many  neighbors flag and  stores an  exception if
+// necessary to detail::tooManyNeighborsException.  This function is
+// intended to be launched via cudaLaunchHostFunc.
+// data  is a  void pointer  to a  std::tuple<int,bool,bool>, storing  the
+// maximum number  of neighbors and  whether to throw  an uncatchable
+// exception here or store it for later.
+static void CUDART_CB checkTooManyNeighbors(void* data) {
     int max_num_neighbors;
     bool checkErrors;
-    std::tie(max_num_neighbors, checkErrors) = *static_cast<std::tuple<int, bool>*>(data);
+    bool syncExceptions;
+    std::tie(max_num_neighbors, checkErrors, syncExceptions) = *static_cast<std::tuple<int, bool, bool>*>(data);
     // An exception thrown  in a stream callback is  not catchable (it
     // runs in another thread), so we store it in an exception_ptr for
     // it  to be  processed sometime  later in  the main  thread.  For
     // performance   reasons,    the   exception   is    thrown   here
     // asynchronously if the checkErrors flag is set to false
-    try{
-      const int tooMan = tooManyNeighborsErrorFlag;
-      TORCH_CHECK(tooMan == 0,
-		  "Some particle has too many neighbors, found " +
-		  std::to_string(-tooMan) + " but max is " +
-		  std::to_string(max_num_neighbors));
-    }
-    catch(...){
-      if(not checkErrors)
-	throw;
-      else
-	tooManyNeighborsException = std::current_exception();
+    if (!checkErrors) {
+        try {
+            const int tooMan = tooManyNeighborsErrorFlag;
+            TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " but max is " + std::to_string(max_num_neighbors));
+        }
+        catch (...) {
+            if (not syncExceptions)
+                throw;
+            else
+                tooManyNeighborsException = std::current_exception();
+        }
     }
-  }
 }
 
 class Autograd : public Function<Autograd> {
@@ -146,7 +144,8 @@ public:
                                const Scalar& cutoff,
                                const Scalar& max_num_neighbors,
                                const Tensor& box_vectors,
-			       bool checkErrors) {
+			       bool checkErrors,
+			       bool syncExceptions) {
 
         TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions");
         TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0");
@@ -200,25 +199,30 @@ public:
                 get_accessor<scalar_t, 1>(distances),
                 get_accessor<scalar_t, 2>(box_vectors));
         });
-	//Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs
-	cudaHostFn_t h_fn = detail::checkTooManyNeighbors;
-	std::tuple<int, bool> h_fn_data = {max_num_neighbors_, checkErrors};
-	cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data);
-	//Errors are thrown as exceptions  asynchronously and in a way
-	//compatible with CUDA graphs.   However, this way of throwing
-	//an exception  makes it  uncatchable, crashing the  code.  If
-	//the   checkErrors  flag   is   set  to   true  an   explicit
-	//synchronization barrier  here forces to throw  the exception
-	//from the main thread, making  it catchable at the expense of
-	//a performance penalty each time the function is called.
-	if(checkErrors){
-	  cudaStreamSynchronize(stream);
-	  if(detail::tooManyNeighborsException)
-	    std::rethrow_exception(detail::tooManyNeighborsException);
-	}
-	ctx->save_for_backward({neighbors, deltas, distances});
+        // Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs
+        cudaHostFn_t h_fn = checkTooManyNeighbors;
+        std::tuple<int, bool, bool> h_fn_data = {max_num_neighbors_, checkErrors, syncExceptions};
+        cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data);
+        // With chekErrors=false and syncExceptions=false the state of
+        // the tooManyErrorsFlag is checked  and exceptions are thrown
+        // asynchronously and  in a  way compatible with  CUDA graphs.
+        // However,  this  way  of  throwing  an  exception  makes  it
+        // uncatchable, crashing  the code.
+	//If  checkErrors=false  the syncExceptions=true  an  explicit
+        // synchronization barrier here forces  to throw the exception
+        // from the main thread, making it catchable at the expense of
+        // a performance penalty each time the function is called.
+	//Otherwise, if  checkErrors=true, no exception is  thrown and
+	//the user is  responsible to check if the number  of pairs is
+	//too high
+        if (!checkErrors && syncExceptions) {
+            cudaStreamSynchronize(stream);
+            if (tooManyNeighborsException)
+                std::rethrow_exception(tooManyNeighborsException);
+        }
+        ctx->save_for_backward({neighbors, deltas, distances});
         ctx->saved_data["num_atoms"] = num_atoms;
-        return {neighbors, deltas, distances};
+        return {neighbors, deltas, distances, i_curr_pair};
     }
 
     static tensor_list backward(AutogradContext* ctx, tensor_list grad_inputs) {
@@ -249,16 +253,16 @@ public:
                 get_accessor<scalar_t, 2>(grad_positions));
         });
 
-        return {grad_positions, Tensor(), Tensor(), Tensor(), Tensor()};
+        return {grad_positions, Tensor(), Tensor(), Tensor(), Tensor(), Tensor()};
       }
 };
 
 TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) {
   m.impl("getNeighborPairs",
 	 [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors,
-	    const Tensor& box_vectors, const bool &checkErrors){
-	   const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors,
-						       box_vectors, checkErrors);
-	   return make_tuple(results[0], results[1], results[2]);
+	    const Tensor& box_vectors, const bool &checkErrors, const bool &syncExceptions){
+	     const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors,
+							 box_vectors, checkErrors, syncExceptions);
+	     return make_tuple(results[0], results[1], results[2], results[3]);
 	 });
 }
diff --git a/src/pytorch/neighbors/neighbors.cpp b/src/pytorch/neighbors/neighbors.cpp
index 4614160d..d5588608 100644
--- a/src/pytorch/neighbors/neighbors.cpp
+++ b/src/pytorch/neighbors/neighbors.cpp
@@ -1,5 +1,5 @@
 #include <torch/extension.h>
 
 TORCH_LIBRARY(neighbors, m) {
-    m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors, bool checkErrors) -> (Tensor neighbors, Tensor deltas, Tensor distances)");
+    m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors, bool checkErrors, bool syncExceptions) -> (Tensor neighbors, Tensor deltas, Tensor distances, Tensor num_pairs)");
 }

From 477e9cd5f1e31c187cbfbc6b1a7b44e5de7d013a Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Mon, 6 Mar 2023 16:34:14 +0100
Subject: [PATCH 05/41] Make getNeighborPairs CUDA-graph compatible, add test
 for it

---
 src/pytorch/neighbors/TestNeighbors.py        | 35 +++++++++++++++++++
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 25 ++++++++-----
 2 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index 6c3ef655..2a345601 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -156,6 +156,41 @@ def test_too_many_neighbors(device, dtype):
     assert number_found_pairs == 6
 
 
+def test_is_cuda_graph_compatible():
+    device = 'cuda'
+    dtype = pt.float32
+    num_atoms = 100
+    # Generate random positions
+    positions = 10 * pt.randn((num_atoms, 3), device=device, dtype=dtype)
+    cutoff = 5
+    # Get neighbor pairs
+    ref_neighbors = np.vstack(np.tril_indices(num_atoms, -1))
+    ref_positions = positions.cpu().numpy()
+    ref_deltas = ref_positions[ref_neighbors[0]] - ref_positions[ref_neighbors[1]]
+    ref_distances = np.linalg.norm(ref_deltas, axis=1)
+
+    # Filter the neighbor pairs
+    mask = ref_distances > cutoff
+    ref_neighbors[:, mask] = -1
+    ref_deltas[mask, :] = np.nan
+    ref_distances[mask] = np.nan
+
+    # Find the number of neighbors
+    num_neighbors = np.count_nonzero(np.logical_not(np.isnan(ref_distances)))
+
+    graph = pt.cuda.CUDAGraph()
+    s = pt.cuda.Stream()
+    s.wait_stream(pt.cuda.current_stream())
+    with pt.cuda.stream(s):
+        for _ in range(3):
+            neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1)
+    pt.cuda.synchronize()
+
+    with pt.cuda.graph(graph):
+        neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1)
+
+    graph.replay()
+    pt.cuda.synchronize()
 
 
 @pytest.mark.parametrize('device', ['cpu', 'cuda'])
diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 639db5f5..7e2feac3 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -28,7 +28,7 @@ template <typename scalar_t> __device__ __forceinline__ scalar_t sqrt_(scalar_t
 template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); };
 template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); };
 
-__device__ __managed__ int32_t tooManyNeighborsErrorFlag; //Error flag for forward_kernel
+__device__ __managed__ int32_t tooManyNeighborsErrorFlag; // Error flag for forward_kernel
 
 template <typename scalar_t> __global__ void forward_kernel(
     const int32_t num_all_pairs,
@@ -137,6 +137,12 @@ static void CUDART_CB checkTooManyNeighbors(void* data) {
     }
 }
 
+static bool isStreamCapturing(cudaStream_t st) {
+    cudaStreamCaptureStatus graphStatus;
+    cudaStreamIsCapturing(st, &graphStatus);
+    return graphStatus == cudaStreamCaptureStatusActive;
+}
+
 class Autograd : public Function<Autograd> {
 public:
     static tensor_list forward(AutogradContext* ctx,
@@ -146,7 +152,14 @@ public:
                                const Tensor& box_vectors,
 			       bool checkErrors,
 			       bool syncExceptions) {
-
+        const auto stream = getCurrentCUDAStream(positions.get_device());
+        bool isCUDAGraphCapturing = isStreamCapturing(stream);
+        // Advice CUDA on expected usage of the error flag
+        if (!isCUDAGraphCapturing) {
+            cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
+            cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), cudaMemAdviseSetReadMostly, 0);
+        }
+        const CUDAStreamGuard guard(stream);
         TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions");
         TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0");
         TORCH_CHECK(positions.size(1) == 3, "Expected the 2nd dimension size of \"positions\" to be 3");
@@ -170,20 +183,14 @@ public:
 
         const int num_threads = 128;
         const int num_blocks = max((num_all_pairs + num_threads - 1) / num_threads, 1);
-        const auto stream = getCurrentCUDAStream(positions.get_device());
 
         const TensorOptions options = positions.options();
         const Tensor i_curr_pair = zeros(1, options.dtype(kInt32));
         const Tensor neighbors = full({2, num_pairs}, -1, options.dtype(kInt32));
         const Tensor deltas = full({num_pairs, 3}, NAN, options);
         const Tensor distances = full(num_pairs, NAN, options);
-	//Advice CUDA on expected usage of the error flag
-	cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int),
-		      cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
-	cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int),
-		      cudaMemAdviseSetReadMostly, 0);
+
 	tooManyNeighborsErrorFlag = 0;
-	const CUDAStreamGuard guard(stream);
         AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() {
 	  const scalar_t cutoff_ = cutoff.to<scalar_t>();
 	  TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive");

From 822c6916c50f7d2007f5c57d731a7e247c4a694d Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Mon, 6 Mar 2023 16:36:41 +0100
Subject: [PATCH 06/41] Remove incorrect comment

---
 src/pytorch/neighbors/TestNeighbors.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index 2a345601..834cbf81 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -132,9 +132,6 @@ def test_neighbor_grads(dtype, num_atoms, grad):
     else:
         assert pt.allclose(positions_cpu.grad, positions_cuda.grad.cpu(), atol=1e-8, rtol=1e-5)
 
-# The following test is only run on the CPU.  Running it on the GPU triggers a
-# CUDA assertion, which causes all tests run after it to fail.
-
 @pytest.mark.parametrize('device', ['cpu', 'cuda'])
 @pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
 def test_too_many_neighbors(device, dtype):

From e46fe2d398a98670eaddb4e4e427098fb9a91125 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Mon, 6 Mar 2023 16:43:38 +0100
Subject: [PATCH 07/41] Change not by !

---
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 7e2feac3..04bdb474 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -129,7 +129,7 @@ static void CUDART_CB checkTooManyNeighbors(void* data) {
             TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " but max is " + std::to_string(max_num_neighbors));
         }
         catch (...) {
-            if (not syncExceptions)
+            if (!syncExceptions)
                 throw;
             else
                 tooManyNeighborsException = std::current_exception();

From e80cd5ebcc7a959cab03e20ad316a0e1b6cfe744 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Tue, 7 Mar 2023 18:46:56 +0100
Subject: [PATCH 08/41] Move all torch.ops.load calls to the __init__.py
 scripts

---
 src/pytorch/BatchedNN.py          | 2 --
 src/pytorch/CFConv.py             | 6 +-----
 src/pytorch/CFConvNeighbors.py    | 5 +----
 src/pytorch/SymmetryFunctions.py  | 6 +-----
 src/pytorch/__init__.py           | 7 ++++++-
 src/pytorch/neighbors/__init__.py | 7 ++++++-
 6 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/src/pytorch/BatchedNN.py b/src/pytorch/BatchedNN.py
index 7902be42..79f51ac0 100644
--- a/src/pytorch/BatchedNN.py
+++ b/src/pytorch/BatchedNN.py
@@ -21,14 +21,12 @@
 # SOFTWARE.
 #
 
-import os
 import torch
 from torch import nn
 from torch import Tensor
 from torch.nn import functional as F
 from typing import List, NamedTuple, Tuple, Union
 
-torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
 batchedLinear = torch.ops.NNPOpsBatchedNN.BatchedLinear
 
 
diff --git a/src/pytorch/CFConv.py b/src/pytorch/CFConv.py
index 7229cf06..ab40c246 100644
--- a/src/pytorch/CFConv.py
+++ b/src/pytorch/CFConv.py
@@ -21,15 +21,11 @@
 # SOFTWARE.
 #
 
-import os.path
 import torch
 from torch import Tensor
 
 from NNPOps.CFConvNeighbors import CFConvNeighbors
 
-torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
-torch.classes.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
-
 class CFConv(torch.nn.Module):
     """
     Optimized continious-filter convolution layer (CFConv)
@@ -84,4 +80,4 @@ def __init__(self, gaussianWidth: float, activation: str,
 
     def forward(self, neighbors: CFConvNeighbors, positions: Tensor, input: Tensor) -> Tensor:
 
-        return CFConv.operation(self.holder, neighbors.holder, positions, input)
\ No newline at end of file
+        return CFConv.operation(self.holder, neighbors.holder, positions, input)
diff --git a/src/pytorch/CFConvNeighbors.py b/src/pytorch/CFConvNeighbors.py
index e5ca8a9b..144f1b53 100644
--- a/src/pytorch/CFConvNeighbors.py
+++ b/src/pytorch/CFConvNeighbors.py
@@ -21,12 +21,9 @@
 # SOFTWARE.
 #
 
-import os.path
 import torch
 from torch import Tensor
 
-torch.classes.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
-
 class CFConvNeighbors(torch.nn.Module):
     """
     Optimized nearest-neighbor implementation for the continious-filter convolution (CFConf)
@@ -45,4 +42,4 @@ def __init__(self, cutoff: float) -> None:
     @torch.jit.export
     def build(self, positions: Tensor) -> None:
 
-        self.holder.build(positions)
\ No newline at end of file
+        self.holder.build(positions)
diff --git a/src/pytorch/SymmetryFunctions.py b/src/pytorch/SymmetryFunctions.py
index b54aaa9c..99dd78ed 100644
--- a/src/pytorch/SymmetryFunctions.py
+++ b/src/pytorch/SymmetryFunctions.py
@@ -21,14 +21,10 @@
 # SOFTWARE.
 #
 
-import os.path
 from typing import List, Optional, Tuple
 import torch
 from torch import Tensor
 
-torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
-torch.classes.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
-
 Holder = torch.classes.NNPOpsANISymmetryFunctions.Holder
 operation = torch.ops.NNPOpsANISymmetryFunctions.operation
 
@@ -124,4 +120,4 @@ def forward(self, species_positions: Tuple[Tensor, Tensor],
         radial, angular = operation(self.holder, positions[0], cell)
         features = torch.cat((radial, angular), dim=1).unsqueeze(0)
 
-        return species, features
\ No newline at end of file
+        return species, features
diff --git a/src/pytorch/__init__.py b/src/pytorch/__init__.py
index d60de89b..b6042421 100644
--- a/src/pytorch/__init__.py
+++ b/src/pytorch/__init__.py
@@ -1,5 +1,10 @@
 '''
 High-performance PyTorch operations for neural network potentials
 '''
+import os.path
+import site
+import torch
+torch.ops.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so"))
+torch.classes.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so"))
 
-from NNPOps.OptimizedTorchANI import OptimizedTorchANI
\ No newline at end of file
+from NNPOps.OptimizedTorchANI import OptimizedTorchANI
diff --git a/src/pytorch/neighbors/__init__.py b/src/pytorch/neighbors/__init__.py
index 4869f31d..a81e6a2c 100644
--- a/src/pytorch/neighbors/__init__.py
+++ b/src/pytorch/neighbors/__init__.py
@@ -1,5 +1,10 @@
 '''
 Neighbor operations
 '''
+import site
+import os
+import torch
 
-from NNPOps.neighbors.getNeighborPairs import getNeighborPairs
\ No newline at end of file
+torch.ops.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so"))
+
+from NNPOps.neighbors.getNeighborPairs import getNeighborPairs

From 2a7cd3a8968b14c2a5c9f91c35f0a9b0e1cb96b2 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Thu, 9 Mar 2023 09:32:03 +0100
Subject: [PATCH 09/41] Change how the location of libNNPOpsPyTorch.so is found
 at __init__ scripts

---
 src/pytorch/__init__.py           | 6 +++---
 src/pytorch/neighbors/__init__.py | 5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/pytorch/__init__.py b/src/pytorch/__init__.py
index b6042421..d178744c 100644
--- a/src/pytorch/__init__.py
+++ b/src/pytorch/__init__.py
@@ -2,9 +2,9 @@
 High-performance PyTorch operations for neural network potentials
 '''
 import os.path
-import site
 import torch
-torch.ops.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so"))
-torch.classes.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so"))
+
+torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
+torch.classes.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
 
 from NNPOps.OptimizedTorchANI import OptimizedTorchANI
diff --git a/src/pytorch/neighbors/__init__.py b/src/pytorch/neighbors/__init__.py
index a81e6a2c..0bf85d3f 100644
--- a/src/pytorch/neighbors/__init__.py
+++ b/src/pytorch/neighbors/__init__.py
@@ -1,10 +1,9 @@
 '''
 Neighbor operations
 '''
-import site
-import os
+import os.path
 import torch
 
-torch.ops.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so"))
+torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so'))
 
 from NNPOps.neighbors.getNeighborPairs import getNeighborPairs

From e4df3cf07c2c5a49a6e8865cc8af4bcb134fc192 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Thu, 9 Mar 2023 10:19:58 +0100
Subject: [PATCH 10/41] Remove spurious lines in CMakeLists.txt

---
 CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 23badd95..d748c974 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,8 +77,6 @@ install(FILES src/pytorch/__init__.py
               src/pytorch/OptimizedTorchANI.py
               src/pytorch/SpeciesConverter.py
               src/pytorch/SymmetryFunctions.py
-              src/pytorch/neighbors/__init__.py
-              src/pytorch/neighbors/getNeighborPairs.py
         DESTINATION ${Python3_SITEARCH}/${NAME})
 install(FILES src/pytorch/neighbors/__init__.py
               src/pytorch/neighbors/getNeighborPairs.py

From d6eb763775ba172e9fa2a13fb96668c14bc84f0f Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Thu, 9 Mar 2023 10:28:32 +0100
Subject: [PATCH 11/41] Update again how libNNPOpsPyTorch.so is found in
 __init__.py

---
 src/pytorch/__init__.py           | 12 ++++++++++--
 src/pytorch/neighbors/__init__.py | 12 +++++++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/pytorch/__init__.py b/src/pytorch/__init__.py
index d178744c..7a62ad6a 100644
--- a/src/pytorch/__init__.py
+++ b/src/pytorch/__init__.py
@@ -3,8 +3,16 @@
 '''
 import os.path
 import torch
+import site
+
+# look for NNPOps/libNNPOpsPyTorch.so in all the paths returned by site.getsitepackages()
+for path in site.getsitepackages():
+    if os.path.exists(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')):
+        torch.ops.load_library(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so'))
+        break
+else:
+    # if we didn't find it, look for NNPOps/libNNPOpsPyTorch.so in the same directory as this file
+    torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so'))
 
-torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
-torch.classes.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
 
 from NNPOps.OptimizedTorchANI import OptimizedTorchANI
diff --git a/src/pytorch/neighbors/__init__.py b/src/pytorch/neighbors/__init__.py
index 0bf85d3f..c0dd2ed5 100644
--- a/src/pytorch/neighbors/__init__.py
+++ b/src/pytorch/neighbors/__init__.py
@@ -3,7 +3,17 @@
 '''
 import os.path
 import torch
+import site
+
+# look for NNPOps/libNNPOpsPyTorch.so in all the paths returned by site.getsitepackages()
+for path in site.getsitepackages():
+    if os.path.exists(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')):
+        torch.ops.load_library(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so'))
+        break
+else:
+    # if we didn't find it, look for NNPOps/libNNPOpsPyTorch.so in the same directory as this file
+    torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so'))
+
 
-torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so'))
 
 from NNPOps.neighbors.getNeighborPairs import getNeighborPairs

From ca821c3fb0eb554a6c68e6516c8b605c883a0053 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Thu, 9 Mar 2023 10:31:48 +0100
Subject: [PATCH 12/41] Remove redundant torch load

---
 src/pytorch/neighbors/__init__.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/pytorch/neighbors/__init__.py b/src/pytorch/neighbors/__init__.py
index c0dd2ed5..6e3d5f60 100644
--- a/src/pytorch/neighbors/__init__.py
+++ b/src/pytorch/neighbors/__init__.py
@@ -1,19 +1,6 @@
 '''
 Neighbor operations
 '''
-import os.path
 import torch
-import site
-
-# look for NNPOps/libNNPOpsPyTorch.so in all the paths returned by site.getsitepackages()
-for path in site.getsitepackages():
-    if os.path.exists(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')):
-        torch.ops.load_library(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so'))
-        break
-else:
-    # if we didn't find it, look for NNPOps/libNNPOpsPyTorch.so in the same directory as this file
-    torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so'))
-
-
 
 from NNPOps.neighbors.getNeighborPairs import getNeighborPairs

From 676f83be7f968c5cc2e6fd20b3879caa04a0ad93 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Thu, 9 Mar 2023 12:38:04 +0100
Subject: [PATCH 13/41] Skip CUDA graph test if no GPU is available

---
 src/pytorch/neighbors/TestNeighbors.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index e8bcbfbc..92e13eda 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -94,7 +94,7 @@ def test_neighbor_values(device, dtype, num_atoms, cutoff, all_pairs):
 @pytest.mark.parametrize('num_atoms', [1, 2, 3, 4, 5, 10, 100, 1000])
 @pytest.mark.parametrize('grad', ['deltas', 'distances', 'combined'])
 def test_neighbor_grads(device, dtype, num_atoms, grad):
-    
+
     if not pt.cuda.is_available() and device == 'cuda':
         pytest.skip('No GPU')
 
@@ -115,7 +115,7 @@ def test_neighbor_grads(device, dtype, num_atoms, grad):
     positions.requires_grad_(True)
     print(positions)
     neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff)
-    
+
     assert pt.all(neighbors > -1)
     assert pt.all(neighbors == ref_neighbors)
     assert pt.allclose(deltas, ref_deltas)
@@ -133,7 +133,7 @@ def test_neighbor_grads(device, dtype, num_atoms, grad):
         (deltas.sum() + distances.sum()).backward()
     else:
         raise ValueError('grad')
-    
+
     if dtype == pt.float32:
         assert pt.allclose(ref_positions.grad, positions.grad, atol=1e-3, rtol=1e-3)
     else:
@@ -162,6 +162,8 @@ def test_too_many_neighbors(device, dtype):
 
 
 def test_is_cuda_graph_compatible():
+    if not pt.cuda.is_available():
+        pytest.skip('No GPU')
     device = 'cuda'
     dtype = pt.float32
     num_atoms = 100

From d05656b6fddd002750aecb80a227c5df734511fe Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Tue, 14 Mar 2023 12:47:16 +0100
Subject: [PATCH 14/41] Remove incorrect path in __init__

---
 src/pytorch/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch/__init__.py b/src/pytorch/__init__.py
index 7a62ad6a..56119c13 100644
--- a/src/pytorch/__init__.py
+++ b/src/pytorch/__init__.py
@@ -12,7 +12,7 @@
         break
 else:
     # if we didn't find it, look for NNPOps/libNNPOpsPyTorch.so in the same directory as this file
-    torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so'))
+    torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
 
 
 from NNPOps.OptimizedTorchANI import OptimizedTorchANI

From 4fb4b2e6d22bdb8c15b2c59cbaa117576a2b6393 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 15 Mar 2023 10:21:10 +0100
Subject: [PATCH 15/41] Use relative path to load NNPOps library in __init__.py

---
 src/pytorch/__init__.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/src/pytorch/__init__.py b/src/pytorch/__init__.py
index 56119c13..645d4399 100644
--- a/src/pytorch/__init__.py
+++ b/src/pytorch/__init__.py
@@ -3,16 +3,8 @@
 '''
 import os.path
 import torch
-import site
 
-# look for NNPOps/libNNPOpsPyTorch.so in all the paths returned by site.getsitepackages()
-for path in site.getsitepackages():
-    if os.path.exists(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')):
-        torch.ops.load_library(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so'))
-        break
-else:
-    # if we didn't find it, look for NNPOps/libNNPOpsPyTorch.so in the same directory as this file
-    torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
+torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so'))
 
 
 from NNPOps.OptimizedTorchANI import OptimizedTorchANI

From bf5658017db3459d8d4ae9f3dc1f02bc5ada71a6 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 15 Mar 2023 10:40:35 +0100
Subject: [PATCH 16/41] Copy test scripts to build directory, run them there

---
 CMakeLists.txt | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d748c974..6c414374 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,16 +56,26 @@ foreach(TEST_PATH ${TEST_PATHS})
     add_test(${TEST_NAME} ${TEST_NAME})
 endforeach()
 
-# Tests of PyTorch wrappers
-add_test(TestBatchedNN         pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestBatchedNN.py)
-add_test(TestCFConv            pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestCFConv.py)
-add_test(TestCFConvNeighbors   pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestCFConvNeighbors.py)
-add_test(TestEnergyShifter     pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestEnergyShifter.py)
-add_test(TestOptimizedTorchANI pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestOptimizedTorchANI.py)
-add_test(TestSpeciesConverter  pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSpeciesConverter.py)
-add_test(TestSymmetryFunctions pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSymmetryFunctions.py)
-add_test(TestNeighbors         pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/TestNeighbors.py)
-add_test(TestGetNeighborPairs  pytest -v --doctest-modules ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/getNeighborPairs.py)
+# Move test scripts to a test folder in the build directory, create test folder if necessary
+add_custom_command(TARGET ${LIBRARY} POST_BUILD	COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/test)
+add_custom_command(TARGET ${LIBRARY} POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy
+  ${CMAKE_SOURCE_DIR}/src/pytorch/Test*.py
+  ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/Test*.py
+  ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/getNeighborPairs.py
+  ${CMAKE_BINARY_DIR}/test)
+add_custom_command(TARGET ${LIBRARY} POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy_directory
+  ${CMAKE_SOURCE_DIR}/src/pytorch/molecules
+  ${CMAKE_BINARY_DIR}/test/molecules)
+
+# Add tests for all scripts in the test directory
+file(GLOB_RECURSE PYTHON_TEST_PATHS ${CMAKE_BINARY_DIR}/test/Test*.py)
+foreach(TEST_PATH ${PYTHON_TEST_PATHS})
+		cmake_path(GET TEST_PATH STEM TEST_NAME)
+		add_test(${TEST_NAME} pytest -v ${CMAKE_BINARY_DIR}/test/${TEST_NAME}.py)
+endforeach()
+add_test(TestGetNeighborPairs  pytest -v --doctest-modules ${CMAKE_BINARY_DIR}/test/getNeighborPairs.py)
 
 # Installation
 install(TARGETS ${LIBRARY} DESTINATION ${Python3_SITEARCH}/${NAME})

From 947f4d874abdf0eead0ff55506cb9707945e6bd6 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 15 Mar 2023 10:42:30 +0100
Subject: [PATCH 17/41] Remove unnecessary import

---
 src/pytorch/neighbors/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pytorch/neighbors/__init__.py b/src/pytorch/neighbors/__init__.py
index 6e3d5f60..f6679583 100644
--- a/src/pytorch/neighbors/__init__.py
+++ b/src/pytorch/neighbors/__init__.py
@@ -1,6 +1,5 @@
 '''
 Neighbor operations
 '''
-import torch
 
 from NNPOps.neighbors.getNeighborPairs import getNeighborPairs

From ae82f90aa481aa148bb052f8192bfbe80c3f4b42 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 17 Mar 2023 18:42:42 +0100
Subject: [PATCH 18/41] Some fixes for CUDA graph support in getNEighborPairs

---
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 65 +++++++++----------
 1 file changed, 32 insertions(+), 33 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 04bdb474..d1494545 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -115,25 +115,22 @@ static std::exception_ptr tooManyNeighborsException = nullptr;
 // exception here or store it for later.
 static void CUDART_CB checkTooManyNeighbors(void* data) {
     int max_num_neighbors;
-    bool checkErrors;
     bool syncExceptions;
-    std::tie(max_num_neighbors, checkErrors, syncExceptions) = *static_cast<std::tuple<int, bool, bool>*>(data);
+    std::tie(max_num_neighbors, syncExceptions) = *static_cast<std::tuple<int, bool>*>(data);
     // An exception thrown  in a stream callback is  not catchable (it
     // runs in another thread), so we store it in an exception_ptr for
     // it  to be  processed sometime  later in  the main  thread.  For
     // performance   reasons,    the   exception   is    thrown   here
     // asynchronously if the checkErrors flag is set to false
-    if (!checkErrors) {
-        try {
-            const int tooMan = tooManyNeighborsErrorFlag;
-            TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " but max is " + std::to_string(max_num_neighbors));
-        }
-        catch (...) {
-            if (!syncExceptions)
-                throw;
-            else
-                tooManyNeighborsException = std::current_exception();
-        }
+    try {
+      const int tooMan = tooManyNeighborsErrorFlag;
+      TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " total pairs but max per particle is " + std::to_string(max_num_neighbors));
+    }
+    catch (...) {
+      if (!syncExceptions)
+	throw;
+      else
+	tooManyNeighborsException = std::current_exception();
     }
 }
 
@@ -164,8 +161,7 @@ public:
         TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0");
         TORCH_CHECK(positions.size(1) == 3, "Expected the 2nd dimension size of \"positions\" to be 3");
         TORCH_CHECK(positions.is_contiguous(), "Expected \"positions\" to be contiguous");
-
-        const int max_num_neighbors_ = max_num_neighbors.to<int>();
+	int max_num_neighbors_ = max_num_neighbors.to<int>();
         TORCH_CHECK(max_num_neighbors_ > 0 || max_num_neighbors_ == -1,
             "Expected \"max_num_neighbors\" to be positive or equal to -1");
 
@@ -207,26 +203,29 @@ public:
                 get_accessor<scalar_t, 2>(box_vectors));
         });
         // Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs
-        cudaHostFn_t h_fn = checkTooManyNeighbors;
-        std::tuple<int, bool, bool> h_fn_data = {max_num_neighbors_, checkErrors, syncExceptions};
-        cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data);
-        // With chekErrors=false and syncExceptions=false the state of
-        // the tooManyErrorsFlag is checked  and exceptions are thrown
-        // asynchronously and  in a  way compatible with  CUDA graphs.
-        // However,  this  way  of  throwing  an  exception  makes  it
-        // uncatchable, crashing  the code.
-	//If  checkErrors=false  the syncExceptions=true  an  explicit
-        // synchronization barrier here forces  to throw the exception
-        // from the main thread, making it catchable at the expense of
-        // a performance penalty each time the function is called.
-	//Otherwise, if  checkErrors=true, no exception is  thrown and
-	//the user is  responsible to check if the number  of pairs is
-	//too high
-        if (!checkErrors && syncExceptions) {
+	if(!checkErrors){
+	  static constexpr cudaHostFn_t h_fn = checkTooManyNeighbors;
+	  static std::tuple<int, bool> h_fn_data;
+	  h_fn_data = {max_num_neighbors_, syncExceptions};
+	  cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data);
+	  // With chekErrors=false and syncExceptions=false the state of
+	  // the tooManyErrorsFlag is checked  and exceptions are thrown
+	  // asynchronously and  in a  way compatible with  CUDA graphs.
+	  // However,  this  way  of  throwing  an  exception  makes  it
+	  // uncatchable, crashing  the code.
+	  //If  checkErrors=false  the syncExceptions=true  an  explicit
+	  // synchronization barrier here forces  to throw the exception
+	  // from the main thread, making it catchable at the expense of
+	  // a performance penalty each time the function is called.
+	  //Otherwise, if  checkErrors=true, no exception is  thrown and
+	  //the user is  responsible to check if the number  of pairs is
+	  //too high
+	  if (syncExceptions) {
             cudaStreamSynchronize(stream);
             if (tooManyNeighborsException)
-                std::rethrow_exception(tooManyNeighborsException);
-        }
+	      std::rethrow_exception(tooManyNeighborsException);
+	  }
+	}
         ctx->save_for_backward({neighbors, deltas, distances});
         ctx->saved_data["num_atoms"] = num_atoms;
         return {neighbors, deltas, distances, i_curr_pair};

From 46256842954cc014709ee5903e0a75435e7036f6 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 22 Mar 2023 16:29:13 +0100
Subject: [PATCH 19/41] Reverse logic for check_errors in getNeighborPairs.py

---
 src/pytorch/neighbors/getNeighborPairs.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 052741d6..d06e7aec 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -50,16 +50,16 @@ def getNeighborPairs(
         where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`.
         If this is omitted, periodic boundary conditions are not applied.
     check_errors: bool, optional
-        If set to True the function does not throw due to a number of pairs larger than the maximum.
-        If set to False, an exception will be thrown in that case.
+        If set to False the function does not raise due to a number of pairs larger than the maximum.
+        If set to True, an exception will be raised in that case.
         Defaults to False.
     sync_exceptions: bool, optional
         If  set to  True the  function will  synchronize to  check for
-        errors  and  throw  an  exception  in  the  caller  thread  if
+        errors  and  raise  an  exception  in  the  caller  thread  if
         necessary.
-        If set  to False it  is possible  that an exception  thrown by
+        If set  to False it  is possible  that an exception  raised by
         this function cannot bbe catched and result in a crash.
-        This flag is ignored if check_errors is True.
+        This flag is ignored if check_errors is False.
         This flag must be False for the getNeighborPairs operation to be CUDA graph compatible.
         Defaults to False.
     Returns
@@ -81,14 +81,14 @@ def getNeighborPairs(
         the distance is set to `NaN`.
 
     number_found_pairs: `Optional[torch.Tensor]`
-        Present  if check_errors=True.  Contains the  total number  of
+        Present  if check_errors=False.  Contains the  total number  of
         pairs    found,    which    might   exceed    the    requested
         max_num_neighbors,  leaving  the  rest  of the  output  in  an
         undefined state.
 
     Exceptions
     ----------
-    If `max_num_neighbors > 0` and too small, `RuntimeError` is raised unless check_errors=True.
+    If `max_num_neighbors > 0` and too small, `RuntimeError` is raised unless check_errors=False.
 
     Note
     ----
@@ -151,7 +151,7 @@ def getNeighborPairs(
     if box_vectors is None:
         box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype)
     neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions)
-    if check_errors is True:
+    if check_errors is False:
         return neighbors, deltas, distances, number_found_pairs
     else:
         return neighbors, deltas, distances

From d711a3ca7e3889661d93eff001d46fcc9167a48b Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 22 Mar 2023 16:37:25 +0100
Subject: [PATCH 20/41] Reverse check_errors flag in the rest of the
 getNeighborPair-related files

---
 src/pytorch/neighbors/TestNeighbors.py        |  8 +++----
 src/pytorch/neighbors/getNeighborPairs.py     | 23 +++++++++++--------
 src/pytorch/neighbors/getNeighborPairsCPU.cpp |  2 +-
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 10 ++++----
 4 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index 92e13eda..49e01b72 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -148,16 +148,16 @@ def test_too_many_neighbors(device, dtype):
     # 4 points result into 6 pairs, but there is a storage just for 4.
     positions = pt.zeros((4, 3,), device=device, dtype=dtype)
     with pytest.raises(RuntimeError):
-        # checkErrors = False will throw due to exceeding neighbours
+        # checkErrors = True will throw due to exceeding neighbours
         # syncExceptions = True makes  this exception catchable at the
         # expense of performance (even when no error ocurred)
-        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False, sync_exceptions=True)
+        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True, sync_exceptions=True)
         pt.cuda.synchronize()
 
-    # checkErrors = True will never throw due to exceeding neighbours,
+    # checkErrors = False will never throw due to exceeding neighbours,
     # but  will return  the number  of pairs  found.
     # syncExceptions is ignored in this case
-    neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True)
+    neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False)
     assert number_found_pairs == 6
 
 
diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index d06e7aec..60913a04 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -3,13 +3,14 @@
 
 
 def getNeighborPairs(
-        positions: Tensor,
-        cutoff: float,
-        max_num_neighbors: int = -1,
-        box_vectors: Optional[Tensor] = None,
-        check_errors: Optional[bool] = False,
-        sync_exceptions: Optional[bool] = False) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]:
-    '''Returns indices and distances of atom pairs within a given cutoff distance.
+    positions: Tensor,
+    cutoff: float,
+    max_num_neighbors: int = -1,
+    box_vectors: Optional[Tensor] = None,
+    check_errors: Optional[bool] = False,
+    sync_exceptions: Optional[bool] = False,
+) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]:
+    """Returns indices and distances of atom pairs within a given cutoff distance.
 
     If `max_num_neighbors == -1` (default), all the atom pairs are returned,
     i.e. `num_pairs = num_atoms * (num_atoms + 1) / 2`. This is intended for
@@ -58,7 +59,7 @@ def getNeighborPairs(
         errors  and  raise  an  exception  in  the  caller  thread  if
         necessary.
         If set  to False it  is possible  that an exception  raised by
-        this function cannot bbe catched and result in a crash.
+        this function cannot be catched and result in a crash.
         This flag is ignored if check_errors is False.
         This flag must be False for the getNeighborPairs operation to be CUDA graph compatible.
         Defaults to False.
@@ -146,11 +147,13 @@ def getNeighborPairs(
              [nan, nan, nan]]),
      tensor([1., 1., nan, nan, nan, nan]))
 
-    '''
+    """
 
     if box_vectors is None:
         box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype)
-    neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions)
+    neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(
+        positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions
+    )
     if check_errors is False:
         return neighbors, deltas, distances, number_found_pairs
     else:
diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
index e35fbb96..f4de2d29 100644
--- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp
+++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
@@ -84,7 +84,7 @@ static tuple<Tensor, Tensor, Tensor, Tensor> forward(const Tensor& positions,
         distances = distances.index({mask});
 
         const int num_pad = num_atoms * max_num_neighbors_ - distances.size(0);
-        if (!checkErrors) {
+        if (checkErrors) {
             TORCH_CHECK(num_pad >= 0,
                 "The maximum number of pairs has been exceed! Increase \"max_num_neighbors\"");
         }
diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index d1494545..b734f3b2 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -110,7 +110,7 @@ static std::exception_ptr tooManyNeighborsException = nullptr;
 // Checks the  too many  neighbors flag and  stores an  exception if
 // necessary to detail::tooManyNeighborsException.  This function is
 // intended to be launched via cudaLaunchHostFunc.
-// data  is a  void pointer  to a  std::tuple<int,bool,bool>, storing  the
+// data  is a  void pointer  to a  std::tuple<int,bool>, storing  the
 // maximum number  of neighbors and  whether to throw  an uncatchable
 // exception here or store it for later.
 static void CUDART_CB checkTooManyNeighbors(void* data) {
@@ -121,7 +121,7 @@ static void CUDART_CB checkTooManyNeighbors(void* data) {
     // runs in another thread), so we store it in an exception_ptr for
     // it  to be  processed sometime  later in  the main  thread.  For
     // performance   reasons,    the   exception   is    thrown   here
-    // asynchronously if the checkErrors flag is set to false
+    // asynchronously if the checkErrors flag is set to true
     try {
       const int tooMan = tooManyNeighborsErrorFlag;
       TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " total pairs but max per particle is " + std::to_string(max_num_neighbors));
@@ -203,7 +203,7 @@ public:
                 get_accessor<scalar_t, 2>(box_vectors));
         });
         // Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs
-	if(!checkErrors){
+	if(checkErrors){
 	  static constexpr cudaHostFn_t h_fn = checkTooManyNeighbors;
 	  static std::tuple<int, bool> h_fn_data;
 	  h_fn_data = {max_num_neighbors_, syncExceptions};
@@ -213,11 +213,11 @@ public:
 	  // asynchronously and  in a  way compatible with  CUDA graphs.
 	  // However,  this  way  of  throwing  an  exception  makes  it
 	  // uncatchable, crashing  the code.
-	  //If  checkErrors=false  the syncExceptions=true  an  explicit
+	  //If  checkErrors=true  the syncExceptions=true  an  explicit
 	  // synchronization barrier here forces  to throw the exception
 	  // from the main thread, making it catchable at the expense of
 	  // a performance penalty each time the function is called.
-	  //Otherwise, if  checkErrors=true, no exception is  thrown and
+	  //Otherwise, if  checkErrors=false, no exception is  thrown and
 	  //the user is  responsible to check if the number  of pairs is
 	  //too high
 	  if (syncExceptions) {

From 400ceede3c8b25ca1a04573d87f6bd0d918deda7 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 22 Mar 2023 16:39:27 +0100
Subject: [PATCH 21/41] Clarify documentation on the error raised by
 getNeighborPairs

---
 src/pytorch/neighbors/getNeighborPairs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 60913a04..e5e28114 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -52,7 +52,7 @@ def getNeighborPairs(
         If this is omitted, periodic boundary conditions are not applied.
     check_errors: bool, optional
         If set to False the function does not raise due to a number of pairs larger than the maximum.
-        If set to True, an exception will be raised in that case.
+        If set to True, a RuntimeError will be raised in that case.
         Defaults to False.
     sync_exceptions: bool, optional
         If  set to  True the  function will  synchronize to  check for

From c36243b36955fa13e4c2f0f1dc552bfa11a60d66 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 22 Mar 2023 16:42:59 +0100
Subject: [PATCH 22/41] Always return the number of found pairs in
 getNeighborPairs

---
 src/pytorch/neighbors/getNeighborPairs.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index e5e28114..544d8124 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -9,7 +9,7 @@ def getNeighborPairs(
     box_vectors: Optional[Tensor] = None,
     check_errors: Optional[bool] = False,
     sync_exceptions: Optional[bool] = False,
-) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]:
+) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
     """Returns indices and distances of atom pairs within a given cutoff distance.
 
     If `max_num_neighbors == -1` (default), all the atom pairs are returned,
@@ -81,11 +81,10 @@ def getNeighborPairs(
         If an atom pair is separated by a larger distance than the cutoff,
         the distance is set to `NaN`.
 
-    number_found_pairs: `Optional[torch.Tensor]`
-        Present  if check_errors=False.  Contains the  total number  of
-        pairs    found,    which    might   exceed    the    requested
-        max_num_neighbors,  leaving  the  rest  of the  output  in  an
-        undefined state.
+    number_found_pairs: `torch.Tensor`
+        Contains the total  number of pairs found,  which might exceed
+        the  requested  max_num_neighbors,  leaving the  rest  of  the
+        output in an undefined state.
 
     Exceptions
     ----------
@@ -154,7 +153,4 @@ def getNeighborPairs(
     neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(
         positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions
     )
-    if check_errors is False:
-        return neighbors, deltas, distances, number_found_pairs
-    else:
-        return neighbors, deltas, distances
+    return neighbors, deltas, distances, number_found_pairs

From 8da1c5d77bbe0c95036dcb8382d29ee4aefd62b0 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 22 Mar 2023 17:00:26 +0100
Subject: [PATCH 23/41] Revert "Always return the number of found pairs in
 getNeighborPairs"

This reverts commit c36243b36955fa13e4c2f0f1dc552bfa11a60d66.
---
 src/pytorch/neighbors/getNeighborPairs.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 544d8124..e5e28114 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -9,7 +9,7 @@ def getNeighborPairs(
     box_vectors: Optional[Tensor] = None,
     check_errors: Optional[bool] = False,
     sync_exceptions: Optional[bool] = False,
-) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]:
     """Returns indices and distances of atom pairs within a given cutoff distance.
 
     If `max_num_neighbors == -1` (default), all the atom pairs are returned,
@@ -81,10 +81,11 @@ def getNeighborPairs(
         If an atom pair is separated by a larger distance than the cutoff,
         the distance is set to `NaN`.
 
-    number_found_pairs: `torch.Tensor`
-        Contains the total  number of pairs found,  which might exceed
-        the  requested  max_num_neighbors,  leaving the  rest  of  the
-        output in an undefined state.
+    number_found_pairs: `Optional[torch.Tensor]`
+        Present  if check_errors=False.  Contains the  total number  of
+        pairs    found,    which    might   exceed    the    requested
+        max_num_neighbors,  leaving  the  rest  of the  output  in  an
+        undefined state.
 
     Exceptions
     ----------
@@ -153,4 +154,7 @@ def getNeighborPairs(
     neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(
         positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions
     )
-    return neighbors, deltas, distances, number_found_pairs
+    if check_errors is False:
+        return neighbors, deltas, distances, number_found_pairs
+    else:
+        return neighbors, deltas, distances

From 829ee5ba2c2787205cc1d259f8b095b99ebe3d26 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 22 Mar 2023 17:00:59 +0100
Subject: [PATCH 24/41] Fix check_error interpretation in getNeighborPairs.py

---
 src/pytorch/neighbors/getNeighborPairs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index e5e28114..6720c9c7 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -154,7 +154,7 @@ def getNeighborPairs(
     neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(
         positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions
     )
-    if check_errors is False:
+    if check_errors is True:
         return neighbors, deltas, distances, number_found_pairs
     else:
         return neighbors, deltas, distances

From 73c3e5859d5c7e7d579a02de5fc2b92cf6ed603a Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Thu, 23 Mar 2023 11:56:16 +0100
Subject: [PATCH 25/41] Add return number of pairs functionality again

This reverts commit 8da1c5d77bbe0c95036dcb8382d29ee4aefd62b0.
---
 src/pytorch/neighbors/getNeighborPairs.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 6720c9c7..544d8124 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -9,7 +9,7 @@ def getNeighborPairs(
     box_vectors: Optional[Tensor] = None,
     check_errors: Optional[bool] = False,
     sync_exceptions: Optional[bool] = False,
-) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]:
+) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
     """Returns indices and distances of atom pairs within a given cutoff distance.
 
     If `max_num_neighbors == -1` (default), all the atom pairs are returned,
@@ -81,11 +81,10 @@ def getNeighborPairs(
         If an atom pair is separated by a larger distance than the cutoff,
         the distance is set to `NaN`.
 
-    number_found_pairs: `Optional[torch.Tensor]`
-        Present  if check_errors=False.  Contains the  total number  of
-        pairs    found,    which    might   exceed    the    requested
-        max_num_neighbors,  leaving  the  rest  of the  output  in  an
-        undefined state.
+    number_found_pairs: `torch.Tensor`
+        Contains the total  number of pairs found,  which might exceed
+        the  requested  max_num_neighbors,  leaving the  rest  of  the
+        output in an undefined state.
 
     Exceptions
     ----------
@@ -154,7 +153,4 @@ def getNeighborPairs(
     neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(
         positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions
     )
-    if check_errors is True:
-        return neighbors, deltas, distances, number_found_pairs
-    else:
-        return neighbors, deltas, distances
+    return neighbors, deltas, distances, number_found_pairs

From c2210f38484c96f27151298ab1ea798f664d509d Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Thu, 23 Mar 2023 12:35:39 +0100
Subject: [PATCH 26/41] Update tests with new getNeighborPairs interface

---
 src/pytorch/neighbors/TestNeighbors.py    | 12 ++++-----
 src/pytorch/neighbors/getNeighborPairs.py | 32 ++++++++++-------------
 2 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index 10cd43d2..932af259 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -59,7 +59,7 @@ def test_neighbor_values(device, dtype, num_atoms, cutoff, all_pairs):
     max_num_neighbors = -1 if all_pairs else max(int(np.ceil(num_neighbors / num_atoms)), 1)
 
     # Compute results
-    neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors)
+    neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors)
 
     # Check device
     assert neighbors.device == positions.device
@@ -114,7 +114,7 @@ def test_neighbor_grads(device, dtype, num_atoms, grad):
     # Compute values using NNPOps
     positions.requires_grad_(True)
     print(positions)
-    neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff)
+    neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff)
 
     assert pt.all(neighbors > -1)
     assert pt.all(neighbors == ref_neighbors)
@@ -190,11 +190,11 @@ def test_is_cuda_graph_compatible():
     s.wait_stream(pt.cuda.current_stream())
     with pt.cuda.stream(s):
         for _ in range(3):
-            neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1)
+            neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1)
     pt.cuda.synchronize()
 
     with pt.cuda.graph(graph):
-        neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1)
+        neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1)
 
     graph.replay()
     pt.cuda.synchronize()
@@ -234,7 +234,7 @@ def test_periodic_neighbors(device, dtype):
     max_num_neighbors = max(int(np.ceil(num_neighbors / num_atoms)), 1)
 
     # Compute results
-    neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors, box_vectors=box_vectors)
+    neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors, box_vectors=box_vectors)
 
     # Check device
     assert neighbors.device == positions.device
@@ -272,7 +272,7 @@ class ForceModule(pt.nn.Module):
 
         def forward(self, positions):
 
-            neighbors, deltas, distances = getNeighborPairs(positions, cutoff=1.0)
+            neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=1.0)
             mask = pt.isnan(distances)
             distances = distances[~mask]
             return pt.sum(distances**2)
diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 544d8124..e30c8dfa 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -114,7 +114,7 @@ def getNeighborPairs(
      tensor([[1., 0., 0.],
              [2., 0., 0.],
              [1., 0., 0.]]),
-     tensor([1., 2., 1.]))
+     tensor([1., 2., 1.]), tensor([3], dtype=torch.int32))
 
     >>> getNeighborPairs(positions, cutoff=1.5) # doctest: +NORMALIZE_WHITESPACE
     (tensor([[ 1, -1,  2],
@@ -122,29 +122,25 @@ def getNeighborPairs(
      tensor([[1., 0., 0.],
              [nan, nan, nan],
              [1., 0., 0.]]),
-     tensor([1., nan, 1.]))
+     tensor([1., nan, 1.]), tensor([3], dtype=torch.int32))
 
     >>> getNeighborPairs(positions, cutoff=3.0, max_num_neighbors=2) # doctest: +NORMALIZE_WHITESPACE
     (tensor([[ 1,  2,  2, -1, -1, -1],
-             [ 0,  0,  1, -1, -1, -1]], dtype=torch.int32),
-     tensor([[1., 0., 0.],
-             [2., 0., 0.],
-             [1., 0., 0.],
-             [nan, nan, nan],
-             [nan, nan, nan],
-             [nan, nan, nan]]),
-     tensor([1., 2., 1., nan, nan, nan]))
+            [ 0,  0,  1, -1, -1, -1]], dtype=torch.int32), tensor([[1., 0., 0.],
+            [2., 0., 0.],
+            [1., 0., 0.],
+            [nan, nan, nan],
+            [nan, nan, nan],
+            [nan, nan, nan]]), tensor([1., 2., 1., nan, nan, nan]), tensor([6], dtype=torch.int32))
 
     >>> getNeighborPairs(positions, cutoff=1.5, max_num_neighbors=2) # doctest: +NORMALIZE_WHITESPACE
     (tensor([[ 1,  2, -1, -1, -1, -1],
-             [ 0,  1, -1, -1, -1, -1]], dtype=torch.int32),
-     tensor([[1., 0., 0.],
-             [1., 0., 0.],
-             [nan, nan, nan],
-             [nan, nan, nan],
-             [nan, nan, nan],
-             [nan, nan, nan]]),
-     tensor([1., 1., nan, nan, nan, nan]))
+             [ 0,  1, -1, -1, -1, -1]], dtype=torch.int32), tensor([[1., 0., 0.],
+            [1., 0., 0.],
+            [nan, nan, nan],
+            [nan, nan, nan],
+            [nan, nan, nan],
+            [nan, nan, nan]]), tensor([1., 1., nan, nan, nan, nan]), tensor([6], dtype=torch.int32))
 
     """
 

From fba2b46650769c121b3a86ad61ae39ea35798b62 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Thu, 23 Mar 2023 12:35:52 +0100
Subject: [PATCH 27/41] Fix type decorator preventing jit.script from working
 on getNeighborPairs

---
 src/pytorch/neighbors/getNeighborPairs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index e30c8dfa..2a86e432 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -7,8 +7,8 @@ def getNeighborPairs(
     cutoff: float,
     max_num_neighbors: int = -1,
     box_vectors: Optional[Tensor] = None,
-    check_errors: Optional[bool] = False,
-    sync_exceptions: Optional[bool] = False,
+    check_errors: bool = False,
+    sync_exceptions: bool = False,
 ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
     """Returns indices and distances of atom pairs within a given cutoff distance.
 

From 562d5221310b896b2ac7d8427cc8d0c3e925358e Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 29 Mar 2023 15:03:16 +0200
Subject: [PATCH 28/41] Remove sync_exceptions flag, simplifying the behavior
 and relation with CUDA graphs. If check_errors=False (the default)
 getNeighborPairs does not check for errors and is compatible with graphs. If
 check_errors=True, the function raises if necessary but it is incompatible
 with graphs

---
 src/pytorch/neighbors/TestNeighbors.py        | 10 +--
 src/pytorch/neighbors/getNeighborPairs.py     | 18 +---
 src/pytorch/neighbors/getNeighborPairsCPU.cpp |  2 +-
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 87 +++----------------
 src/pytorch/neighbors/neighbors.cpp           |  2 +-
 5 files changed, 23 insertions(+), 96 deletions(-)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index 932af259..dfe57a17 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -148,15 +148,11 @@ def test_too_many_neighbors(device, dtype):
     # 4 points result into 6 pairs, but there is a storage just for 4.
     positions = pt.zeros((4, 3,), device=device, dtype=dtype)
     with pytest.raises(RuntimeError):
-        # checkErrors = True will throw due to exceeding neighbours
-        # syncExceptions = True makes  this exception catchable at the
-        # expense of performance (even when no error ocurred)
-        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True, sync_exceptions=True)
+        # checkErrors = True will raise due to exceeding neighbours
+        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True)
         pt.cuda.synchronize()
 
-    # checkErrors = False will never throw due to exceeding neighbours,
-    # but  will return  the number  of pairs  found.
-    # syncExceptions is ignored in this case
+    # checkErrors = False will never throw due to exceeding neighbours. In addition, the call will be compatible with CUDA graphs
     neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False)
     assert number_found_pairs == 6
 
diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 2a86e432..4048bf14 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -7,8 +7,7 @@ def getNeighborPairs(
     cutoff: float,
     max_num_neighbors: int = -1,
     box_vectors: Optional[Tensor] = None,
-    check_errors: bool = False,
-    sync_exceptions: bool = False,
+    check_errors: bool = False
 ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
     """Returns indices and distances of atom pairs within a given cutoff distance.
 
@@ -54,15 +53,6 @@ def getNeighborPairs(
         If set to False the function does not raise due to a number of pairs larger than the maximum.
         If set to True, a RuntimeError will be raised in that case.
         Defaults to False.
-    sync_exceptions: bool, optional
-        If  set to  True the  function will  synchronize to  check for
-        errors  and  raise  an  exception  in  the  caller  thread  if
-        necessary.
-        If set  to False it  is possible  that an exception  raised by
-        this function cannot be catched and result in a crash.
-        This flag is ignored if check_errors is False.
-        This flag must be False for the getNeighborPairs operation to be CUDA graph compatible.
-        Defaults to False.
     Returns
     -------
     neighbors: `torch.Tensor`
@@ -88,14 +78,14 @@ def getNeighborPairs(
 
     Exceptions
     ----------
-    If `max_num_neighbors > 0` and too small, `RuntimeError` is raised unless check_errors=False.
+    If `max_num_neighbors > 0` and too small, `RuntimeError` is raised if check_errors=True.
 
     Note
     ----
     The operation can be compatible with CUDA Grahps, i.e. the shapes of the output
     tensors are independed of the values of input tensors and no synchronizing operation is performed.
 
-    For this to be the case sync_exceptions must be False.
+    For this to be the case check_errors must be False
 
     The CUDA implementation returns the atom pairs in non-determinist order,
     if `max_num_neighbors > 0`.
@@ -147,6 +137,6 @@ def getNeighborPairs(
     if box_vectors is None:
         box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype)
     neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(
-        positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions
+        positions, cutoff, max_num_neighbors, box_vectors, check_errors
     )
     return neighbors, deltas, distances, number_found_pairs
diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
index f4de2d29..3c669534 100644
--- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp
+++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
@@ -102,7 +102,7 @@ static tuple<Tensor, Tensor, Tensor, Tensor> forward(const Tensor& positions,
 TORCH_LIBRARY_IMPL(neighbors, CPU, m) {
   m.impl("getNeighborPairs",
 	   [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors,
-	      const Tensor& box_vectors, const bool &checkErrors, const bool &syncExceptions){
+	      const Tensor& box_vectors, const bool &checkErrors){
 	       //The syncExceptions flag is ignored, this function always throws synchronously
 	       return forward(positions, cutoff, max_num_neighbors, box_vectors, checkErrors);
 	 });
diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index b734f3b2..8ee51b2b 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -28,8 +28,6 @@ template <typename scalar_t> __device__ __forceinline__ scalar_t sqrt_(scalar_t
 template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); };
 template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); };
 
-__device__ __managed__ int32_t tooManyNeighborsErrorFlag; // Error flag for forward_kernel
-
 template <typename scalar_t> __global__ void forward_kernel(
     const int32_t num_all_pairs,
     const Accessor<scalar_t, 2> positions,
@@ -68,19 +66,15 @@ template <typename scalar_t> __global__ void forward_kernel(
     if (distance2 > cutoff2) return;
 
     const int32_t i_pair = store_all_pairs ? index : atomicAdd(&i_curr_pair[0], 1);
-    //If the maximum number of neighbours is surpassed encode the
-    //number of pairs found in a flag and exit
-    if(i_pair >= neighbors.size(1)){
-      atomicMin(&tooManyNeighborsErrorFlag, -i_pair);
-      return;
+    //We handle too many neighbors outside of the kernel
+    if(i_pair < neighbors.size(1)){
+      neighbors[0][i_pair] = row;
+      neighbors[1][i_pair] = column;
+      deltas[i_pair][0] = delta_x;
+      deltas[i_pair][1] = delta_y;
+      deltas[i_pair][2] = delta_z;
+      distances[i_pair] = sqrt_(distance2);
     }
-
-    neighbors[0][i_pair] = row;
-    neighbors[1][i_pair] = column;
-    deltas[i_pair][0] = delta_x;
-    deltas[i_pair][1] = delta_y;
-    deltas[i_pair][2] = delta_z;
-    distances[i_pair] = sqrt_(distance2);
 }
 
 template <typename scalar_t> __global__ void backward_kernel(
@@ -106,33 +100,6 @@ template <typename scalar_t> __global__ void backward_kernel(
     atomicAdd(&grad_positions[i_atom][i_comp], grad);
 }
 
-static std::exception_ptr tooManyNeighborsException = nullptr;
-// Checks the  too many  neighbors flag and  stores an  exception if
-// necessary to detail::tooManyNeighborsException.  This function is
-// intended to be launched via cudaLaunchHostFunc.
-// data  is a  void pointer  to a  std::tuple<int,bool>, storing  the
-// maximum number  of neighbors and  whether to throw  an uncatchable
-// exception here or store it for later.
-static void CUDART_CB checkTooManyNeighbors(void* data) {
-    int max_num_neighbors;
-    bool syncExceptions;
-    std::tie(max_num_neighbors, syncExceptions) = *static_cast<std::tuple<int, bool>*>(data);
-    // An exception thrown  in a stream callback is  not catchable (it
-    // runs in another thread), so we store it in an exception_ptr for
-    // it  to be  processed sometime  later in  the main  thread.  For
-    // performance   reasons,    the   exception   is    thrown   here
-    // asynchronously if the checkErrors flag is set to true
-    try {
-      const int tooMan = tooManyNeighborsErrorFlag;
-      TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " total pairs but max per particle is " + std::to_string(max_num_neighbors));
-    }
-    catch (...) {
-      if (!syncExceptions)
-	throw;
-      else
-	tooManyNeighborsException = std::current_exception();
-    }
-}
 
 static bool isStreamCapturing(cudaStream_t st) {
     cudaStreamCaptureStatus graphStatus;
@@ -147,15 +114,9 @@ public:
                                const Scalar& cutoff,
                                const Scalar& max_num_neighbors,
                                const Tensor& box_vectors,
-			       bool checkErrors,
-			       bool syncExceptions) {
+			       bool checkErrors) {
         const auto stream = getCurrentCUDAStream(positions.get_device());
         bool isCUDAGraphCapturing = isStreamCapturing(stream);
-        // Advice CUDA on expected usage of the error flag
-        if (!isCUDAGraphCapturing) {
-            cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
-            cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), cudaMemAdviseSetReadMostly, 0);
-        }
         const CUDAStreamGuard guard(stream);
         TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions");
         TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0");
@@ -186,7 +147,6 @@ public:
         const Tensor deltas = full({num_pairs, 3}, NAN, options);
         const Tensor distances = full(num_pairs, NAN, options);
 
-	tooManyNeighborsErrorFlag = 0;
         AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() {
 	  const scalar_t cutoff_ = cutoff.to<scalar_t>();
 	  TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive");
@@ -202,29 +162,10 @@ public:
                 get_accessor<scalar_t, 1>(distances),
                 get_accessor<scalar_t, 2>(box_vectors));
         });
-        // Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs
+        // Synchronize and check the number of pairs found. Note that this is incompatible with CUDA graphs
 	if(checkErrors){
-	  static constexpr cudaHostFn_t h_fn = checkTooManyNeighbors;
-	  static std::tuple<int, bool> h_fn_data;
-	  h_fn_data = {max_num_neighbors_, syncExceptions};
-	  cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data);
-	  // With chekErrors=false and syncExceptions=false the state of
-	  // the tooManyErrorsFlag is checked  and exceptions are thrown
-	  // asynchronously and  in a  way compatible with  CUDA graphs.
-	  // However,  this  way  of  throwing  an  exception  makes  it
-	  // uncatchable, crashing  the code.
-	  //If  checkErrors=true  the syncExceptions=true  an  explicit
-	  // synchronization barrier here forces  to throw the exception
-	  // from the main thread, making it catchable at the expense of
-	  // a performance penalty each time the function is called.
-	  //Otherwise, if  checkErrors=false, no exception is  thrown and
-	  //the user is  responsible to check if the number  of pairs is
-	  //too high
-	  if (syncExceptions) {
-            cudaStreamSynchronize(stream);
-            if (tooManyNeighborsException)
-	      std::rethrow_exception(tooManyNeighborsException);
-	  }
+	  int num_pairs = i_curr_pair.item<int32_t>();
+	  TORCH_CHECK(num_pairs < max_num_neighbors_, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_neighbors_), " but found " + std::to_string(num_pairs));
 	}
         ctx->save_for_backward({neighbors, deltas, distances});
         ctx->saved_data["num_atoms"] = num_atoms;
@@ -266,9 +207,9 @@ public:
 TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) {
   m.impl("getNeighborPairs",
 	 [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors,
-	    const Tensor& box_vectors, const bool &checkErrors, const bool &syncExceptions){
+	    const Tensor& box_vectors, const bool &checkErrors){
 	     const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors,
-							 box_vectors, checkErrors, syncExceptions);
+							 box_vectors, checkErrors);
 	     return make_tuple(results[0], results[1], results[2], results[3]);
 	 });
 }
diff --git a/src/pytorch/neighbors/neighbors.cpp b/src/pytorch/neighbors/neighbors.cpp
index d5588608..e5911907 100644
--- a/src/pytorch/neighbors/neighbors.cpp
+++ b/src/pytorch/neighbors/neighbors.cpp
@@ -1,5 +1,5 @@
 #include <torch/extension.h>
 
 TORCH_LIBRARY(neighbors, m) {
-    m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors, bool checkErrors, bool syncExceptions) -> (Tensor neighbors, Tensor deltas, Tensor distances, Tensor num_pairs)");
+    m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors, bool checkErrors) -> (Tensor neighbors, Tensor deltas, Tensor distances, Tensor num_pairs)");
 }

From 751ee1295963a4ff43ea03bce585b3495cbf3b6e Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Wed, 29 Mar 2023 15:55:09 +0200
Subject: [PATCH 29/41] Remove unused function

---
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 8ee51b2b..5195b4ec 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -100,13 +100,6 @@ template <typename scalar_t> __global__ void backward_kernel(
     atomicAdd(&grad_positions[i_atom][i_comp], grad);
 }
 
-
-static bool isStreamCapturing(cudaStream_t st) {
-    cudaStreamCaptureStatus graphStatus;
-    cudaStreamIsCapturing(st, &graphStatus);
-    return graphStatus == cudaStreamCaptureStatusActive;
-}
-
 class Autograd : public Function<Autograd> {
 public:
     static tensor_list forward(AutogradContext* ctx,
@@ -116,7 +109,6 @@ public:
                                const Tensor& box_vectors,
 			       bool checkErrors) {
         const auto stream = getCurrentCUDAStream(positions.get_device());
-        bool isCUDAGraphCapturing = isStreamCapturing(stream);
         const CUDAStreamGuard guard(stream);
         TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions");
         TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0");

From ad8bbaf0bfd20a3fb60cc8c4f02b1da8bd3b046a Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 31 Mar 2023 09:58:09 +0200
Subject: [PATCH 30/41] Remove unnecessary synchronization in test

---
 src/pytorch/neighbors/TestNeighbors.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index dfe57a17..09aaec6c 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -150,7 +150,6 @@ def test_too_many_neighbors(device, dtype):
     with pytest.raises(RuntimeError):
         # checkErrors = True will raise due to exceeding neighbours
         getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True)
-        pt.cuda.synchronize()
 
     # checkErrors = False will never throw due to exceeding neighbours. In addition, the call will be compatible with CUDA graphs
     neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False)

From 659333171a6d6bd888a43e1f9f95d71455afbaac Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 31 Mar 2023 10:00:46 +0200
Subject: [PATCH 31/41] Clarify documentation of check_errors

---
 src/pytorch/neighbors/getNeighborPairs.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 4048bf14..2209d174 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -50,9 +50,13 @@ def getNeighborPairs(
         where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`.
         If this is omitted, periodic boundary conditions are not applied.
     check_errors: bool, optional
-        If set to False the function does not raise due to a number of pairs larger than the maximum.
-        If set to True, a RuntimeError will be raised in that case.
-        Defaults to False.
+        If True, a RuntimeError is raised if more than max_num_neighbors pairs are found.
+        The error checking requires synchronization, which adds cost and makes this function
+        incompatible with CUDA graphs. If this argument is False, no error checking is performed.
+        This makes it faster and compatible with CUDA graphs, but it is your responsibility
+        to check the return value for number_found_pairs to make sure that no neighbors were missed.
+        Default: False
+
     Returns
     -------
     neighbors: `torch.Tensor`

From 75608cfdd5adefa2ceae1711f9583add70c50738 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 31 Mar 2023 10:05:43 +0200
Subject: [PATCH 32/41] Clarify documentation of number_found_pairs

---
 src/pytorch/neighbors/getNeighborPairs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 2209d174..24458728 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -76,9 +76,9 @@ def getNeighborPairs(
         the distance is set to `NaN`.
 
     number_found_pairs: `torch.Tensor`
-        Contains the total  number of pairs found,  which might exceed
-        the  requested  max_num_neighbors,  leaving the  rest  of  the
-        output in an undefined state.
+        Contains the total number of pairs found in an unspecified order,
+        which might exceed the  requested  max_num_neighbors. In this case, the first number_found_pairs
+        pairs in the output are valid pairs, but the remaining pairs are omitted.
 
     Exceptions
     ----------

From 4c624e5610d81c422d500c59b54d365e98290d73 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 31 Mar 2023 10:07:04 +0200
Subject: [PATCH 33/41] Clarify documentation of CUDA graph functionality

---
 src/pytorch/neighbors/getNeighborPairs.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 24458728..90fc6cdf 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -86,10 +86,10 @@ def getNeighborPairs(
 
     Note
     ----
-    The operation can be compatible with CUDA Grahps, i.e. the shapes of the output
-    tensors are independed of the values of input tensors and no synchronizing operation is performed.
-
-    For this to be the case check_errors must be False
+    The operation  can be compatible  with CUDA Graphs: the  shapes of
+    the output tensors are independent of the values of input tensors,
+    and  no  synchronization  is  performed.
+    For  this  to  be  true, check_errors must be False.
 
     The CUDA implementation returns the atom pairs in non-determinist order,
     if `max_num_neighbors > 0`.

From 355860f2e04034223a8d091bb5083678818a8a54 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 31 Mar 2023 10:07:30 +0200
Subject: [PATCH 34/41] Remove obsolete comment

---
 src/pytorch/neighbors/getNeighborPairsCPU.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
index 3c669534..44db772b 100644
--- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp
+++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
@@ -103,7 +103,6 @@ TORCH_LIBRARY_IMPL(neighbors, CPU, m) {
   m.impl("getNeighborPairs",
 	   [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors,
 	      const Tensor& box_vectors, const bool &checkErrors){
-	       //The syncExceptions flag is ignored, this function always throws synchronously
 	       return forward(positions, cutoff, max_num_neighbors, box_vectors, checkErrors);
 	 });
 }

From 5ccc98f35a2254104ae701e61519444645d72449 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 31 Mar 2023 10:12:04 +0200
Subject: [PATCH 35/41] Fix formatting

---
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 5195b4ec..d00645a3 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -67,13 +67,13 @@ template <typename scalar_t> __global__ void forward_kernel(
 
     const int32_t i_pair = store_all_pairs ? index : atomicAdd(&i_curr_pair[0], 1);
     //We handle too many neighbors outside of the kernel
-    if(i_pair < neighbors.size(1)){
-      neighbors[0][i_pair] = row;
-      neighbors[1][i_pair] = column;
-      deltas[i_pair][0] = delta_x;
-      deltas[i_pair][1] = delta_y;
-      deltas[i_pair][2] = delta_z;
-      distances[i_pair] = sqrt_(distance2);
+    if (i_pair < neighbors.size(1)) {
+        neighbors[0][i_pair] = row;
+        neighbors[1][i_pair] = column;
+        deltas[i_pair][0] = delta_x;
+        deltas[i_pair][1] = delta_y;
+        deltas[i_pair][2] = delta_z;
+        distances[i_pair] = sqrt_(distance2);
     }
 }
 

From bc78d15ab6071f4789bd53b01662b96a5b57aeea Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 31 Mar 2023 10:12:45 +0200
Subject: [PATCH 36/41] Fix formatting

---
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index d00645a3..a4e5dd3a 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -155,10 +155,10 @@ public:
                 get_accessor<scalar_t, 2>(box_vectors));
         });
         // Synchronize and check the number of pairs found. Note that this is incompatible with CUDA graphs
-	if(checkErrors){
-	  int num_pairs = i_curr_pair.item<int32_t>();
-	  TORCH_CHECK(num_pairs < max_num_neighbors_, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_neighbors_), " but found " + std::to_string(num_pairs));
-	}
+        if (checkErrors) {
+            int num_pairs = i_curr_pair.item<int32_t>();
+            TORCH_CHECK(num_pairs < max_num_neighbors_, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_neighbors_), " but found " + std::to_string(num_pairs));
+        }
         ctx->save_for_backward({neighbors, deltas, distances});
         ctx->saved_data["num_atoms"] = num_atoms;
         return {neighbors, deltas, distances, i_curr_pair};

From e1a965ae3c95aef31358b3df723f92cd6648d8d9 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 31 Mar 2023 11:14:05 +0200
Subject: [PATCH 37/41] Update documentation

---
 src/pytorch/neighbors/getNeighborPairs.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 90fc6cdf..2886b5a7 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -77,8 +77,9 @@ def getNeighborPairs(
 
     number_found_pairs: `torch.Tensor`
         Contains the total number of pairs found in an unspecified order,
-        which might exceed the  requested  max_num_neighbors. In this case, the first number_found_pairs
-        pairs in the output are valid pairs, but the remaining pairs are omitted.
+        which might exceed the  requested  max_num_neighbors*num_atoms.
+        In this case, the first number_found_pairs pairs in the output are
+        valid pairs, but the remaining pairs are omitted.
 
     Exceptions
     ----------

From 130b13bed8579a098df2b1da2700e2a3d43f6e2c Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 31 Mar 2023 11:14:45 +0200
Subject: [PATCH 38/41] Change the (misleading) num_pairs variable name to
 max_num_pairs. Enforce that the found number of pairs is less than num_pairs

---
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index a4e5dd3a..3b3d8073 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -128,16 +128,16 @@ public:
         const bool store_all_pairs = max_num_neighbors_ == -1;
         const int num_atoms = positions.size(0);
         const int num_all_pairs = num_atoms * (num_atoms - 1) / 2;
-        const int num_pairs = store_all_pairs ? num_all_pairs : num_atoms * max_num_neighbors_;
+        const int max_num_pairs = store_all_pairs ? num_all_pairs : (num_atoms * max_num_neighbors_);
 
         const int num_threads = 128;
         const int num_blocks = max((num_all_pairs + num_threads - 1) / num_threads, 1);
 
         const TensorOptions options = positions.options();
         const Tensor i_curr_pair = zeros(1, options.dtype(kInt32));
-        const Tensor neighbors = full({2, num_pairs}, -1, options.dtype(kInt32));
-        const Tensor deltas = full({num_pairs, 3}, NAN, options);
-        const Tensor distances = full(num_pairs, NAN, options);
+        const Tensor neighbors = full({2, max_num_pairs}, -1, options.dtype(kInt32));
+        const Tensor deltas = full({max_num_pairs, 3}, NAN, options);
+        const Tensor distances = full(max_num_pairs, NAN, options);
 
         AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() {
 	  const scalar_t cutoff_ = cutoff.to<scalar_t>();
@@ -156,8 +156,8 @@ public:
         });
         // Synchronize and check the number of pairs found. Note that this is incompatible with CUDA graphs
         if (checkErrors) {
-            int num_pairs = i_curr_pair.item<int32_t>();
-            TORCH_CHECK(num_pairs < max_num_neighbors_, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_neighbors_), " but found " + std::to_string(num_pairs));
+            int num_found_pairs = i_curr_pair.item<int32_t>();
+            TORCH_CHECK(num_found_pairs < max_num_pairs, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_pairs), " but found " + std::to_string(num_found_pairs));
         }
         ctx->save_for_backward({neighbors, deltas, distances});
         ctx->saved_data["num_atoms"] = num_atoms;

From 2d8d02b3dadb50c222540fa0d84c6959fe98c5e6 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Fri, 31 Mar 2023 11:15:40 +0200
Subject: [PATCH 39/41] Add test that checks if the max_num_neighbors per
 particle is enforced. Right now this does not pass, since the function allows
 that an atom has more neighbors than max_num_neighbors as long as
 num_found_pairs<num_atoms*max_num_neighbors

---
 src/pytorch/neighbors/TestNeighbors.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index 09aaec6c..c0a0f615 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -155,6 +155,19 @@ def test_too_many_neighbors(device, dtype):
     neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False)
     assert number_found_pairs == 6
 
+@pytest.mark.parametrize('device', ['cpu', 'cuda'])
+@pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
+def test_max_neighbors_means_per_particle(device, dtype):
+    if not pt.cuda.is_available() and device == 'cuda':
+        pytest.skip('No GPU')
+    # 4 points result into 6 pairs.
+    positions = pt.zeros((4, 3,), device=device, dtype=dtype)
+    with pytest.raises(RuntimeError):
+        # checkErrors = True should raise due to exceeding neighbours
+        # As of now this will not raise, since 6<num_atoms*max_num_neighbors=8
+        getNeighborPairs(positions, cutoff=1, max_num_neighbors=2, check_errors=True)
+    #This should not fail, there are 4 neighbors per particle
+    getNeighborPairs(positions, cutoff=1, max_num_neighbors=4, check_errors=True)
 
 def test_is_cuda_graph_compatible():
     if not pt.cuda.is_available():

From 90a584e2190f398d5d6b920bd633f75ea476a0b9 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Tue, 4 Apr 2023 11:05:22 +0200
Subject: [PATCH 40/41] Change the meaning and name from max_num_neighbors
 (maximum number of neighbors per particle) to max_num_pairs (maximum number
 of total pairs).

---
 src/pytorch/neighbors/TestNeighbors.py        | 28 +++++++--------
 src/pytorch/neighbors/getNeighborPairs.py     | 36 +++++++++----------
 src/pytorch/neighbors/getNeighborPairsCPU.cpp | 18 +++++-----
 src/pytorch/neighbors/getNeighborPairsCUDA.cu | 23 ++++++------
 4 files changed, 51 insertions(+), 54 deletions(-)

diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
index c0a0f615..7d830812 100644
--- a/src/pytorch/neighbors/TestNeighbors.py
+++ b/src/pytorch/neighbors/TestNeighbors.py
@@ -56,10 +56,10 @@ def test_neighbor_values(device, dtype, num_atoms, cutoff, all_pairs):
 
     # Find the number of neighbors
     num_neighbors = np.count_nonzero(np.logical_not(np.isnan(ref_distances)))
-    max_num_neighbors = -1 if all_pairs else max(int(np.ceil(num_neighbors / num_atoms)), 1)
+    max_num_pairs = -1 if all_pairs else max(int(num_neighbors), 1)
 
     # Compute results
-    neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors)
+    neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_pairs=max_num_pairs)
 
     # Check device
     assert neighbors.device == positions.device
@@ -83,7 +83,7 @@ def test_neighbor_values(device, dtype, num_atoms, cutoff, all_pairs):
         neighbors, deltas, distances = sort_neighbors(neighbors, deltas, distances)
 
         # Resize the reference
-        ref_neighbors, ref_deltas, ref_distances = resize_neighbors(ref_neighbors, ref_deltas, ref_distances, num_atoms * max_num_neighbors)
+        ref_neighbors, ref_deltas, ref_distances = resize_neighbors(ref_neighbors, ref_deltas, ref_distances, max_num_pairs)
 
     assert np.all(ref_neighbors == neighbors)
     assert np.allclose(ref_deltas, deltas, equal_nan=True)
@@ -149,25 +149,23 @@ def test_too_many_neighbors(device, dtype):
     positions = pt.zeros((4, 3,), device=device, dtype=dtype)
     with pytest.raises(RuntimeError):
         # checkErrors = True will raise due to exceeding neighbours
-        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True)
+        getNeighborPairs(positions, cutoff=1, max_num_pairs=1, check_errors=True)
 
     # checkErrors = False will never throw due to exceeding neighbours. In addition, the call will be compatible with CUDA graphs
-    neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False)
+    neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_pairs=1, check_errors=False)
     assert number_found_pairs == 6
 
 @pytest.mark.parametrize('device', ['cpu', 'cuda'])
 @pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
-def test_max_neighbors_means_per_particle(device, dtype):
+def test_max_pairs_means_total(device, dtype):
     if not pt.cuda.is_available() and device == 'cuda':
         pytest.skip('No GPU')
     # 4 points result into 6 pairs.
     positions = pt.zeros((4, 3,), device=device, dtype=dtype)
     with pytest.raises(RuntimeError):
         # checkErrors = True should raise due to exceeding neighbours
-        # As of now this will not raise, since 6<num_atoms*max_num_neighbors=8
-        getNeighborPairs(positions, cutoff=1, max_num_neighbors=2, check_errors=True)
-    #This should not fail, there are 4 neighbors per particle
-    getNeighborPairs(positions, cutoff=1, max_num_neighbors=4, check_errors=True)
+        getNeighborPairs(positions, cutoff=1, max_num_pairs=5, check_errors=True)
+    getNeighborPairs(positions, cutoff=1, max_num_pairs=6, check_errors=True)
 
 def test_is_cuda_graph_compatible():
     if not pt.cuda.is_available():
@@ -198,11 +196,11 @@ def test_is_cuda_graph_compatible():
     s.wait_stream(pt.cuda.current_stream())
     with pt.cuda.stream(s):
         for _ in range(3):
-            neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1)
+            neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_pairs=num_neighbors+1)
     pt.cuda.synchronize()
 
     with pt.cuda.graph(graph):
-        neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1)
+        neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_pairs=num_neighbors+1)
 
     graph.replay()
     pt.cuda.synchronize()
@@ -239,10 +237,10 @@ def test_periodic_neighbors(device, dtype):
 
     # Find the number of neighbors
     num_neighbors = np.count_nonzero(np.logical_not(np.isnan(ref_distances)))
-    max_num_neighbors = max(int(np.ceil(num_neighbors / num_atoms)), 1)
+    max_num_pairs = max(int(num_neighbors), 1)
 
     # Compute results
-    neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors, box_vectors=box_vectors)
+    neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_pairs=max_num_pairs, box_vectors=box_vectors)
 
     # Check device
     assert neighbors.device == positions.device
@@ -265,7 +263,7 @@ def test_periodic_neighbors(device, dtype):
     neighbors, deltas, distances = sort_neighbors(neighbors, deltas, distances)
 
     # Resize the reference
-    ref_neighbors, ref_deltas, ref_distances = resize_neighbors(ref_neighbors, ref_deltas, ref_distances, num_atoms * max_num_neighbors)
+    ref_neighbors, ref_deltas, ref_distances = resize_neighbors(ref_neighbors, ref_deltas, ref_distances, max_num_pairs)
 
     assert np.all(ref_neighbors == neighbors)
     assert np.allclose(ref_deltas, deltas, equal_nan=True)
diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 2886b5a7..2538b50d 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -5,21 +5,20 @@
 def getNeighborPairs(
     positions: Tensor,
     cutoff: float,
-    max_num_neighbors: int = -1,
+    max_num_pairs: int = -1,
     box_vectors: Optional[Tensor] = None,
     check_errors: bool = False
 ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
     """Returns indices and distances of atom pairs within a given cutoff distance.
 
-    If `max_num_neighbors == -1` (default), all the atom pairs are returned,
+    If `max_num_pairs == -1` (default), all the atom pairs are returned,
     i.e. `num_pairs = num_atoms * (num_atoms + 1) / 2`. This is intended for
     the small molecules, where almost all the atoms are within the cutoff
     distance of each other.
 
-    If `max_num_neighbors > 0`, a fixed number of the atom pair are returned,
-    i.e. `num_pairs = num_atoms * max_num_neighbors`. This is indeded for large
-    molecule, where most of the atoms are beyond the cutoff distance of each
-    other.
+    If  `max_num_pairs >  0`, a  fixed number  of the  atom pairs  are
+    returned.  This is  indeded for large molecule, where  most of the
+    atoms are beyond the cutoff distance of each other.
 
     This function optionally supports periodic boundary conditions with
     arbitrary triclinic boxes.  The box vectors `a`, `b`, and `c` must satisfy
@@ -42,15 +41,15 @@ def getNeighborPairs(
         data type has to be`torch.float32` or `torch.float64`.
     cutoff: float
         Maximum distance between atom pairs.
-    max_num_neighbors: int, optional
-        Maximum number of neighbors per atom. If set to `-1` (default),
+    max_num_pairs: int, optional
+        Maximum number of pairs (total number of neighbors). If set to `-1` (default),
         all possible combinations of atom pairs are included.
     box_vectors: `torch.Tensor`, optional
         The vectors defining the periodic box.  This must have shape `(3, 3)`,
         where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`.
         If this is omitted, periodic boundary conditions are not applied.
     check_errors: bool, optional
-        If True, a RuntimeError is raised if more than max_num_neighbors pairs are found.
+        If True, a RuntimeError is raised if more than max_num_pairs pairs are found.
         The error checking requires synchronization, which adds cost and makes this function
         incompatible with CUDA graphs. If this argument is False, no error checking is performed.
         This makes it faster and compatible with CUDA graphs, but it is your responsibility
@@ -76,14 +75,15 @@ def getNeighborPairs(
         the distance is set to `NaN`.
 
     number_found_pairs: `torch.Tensor`
-        Contains the total number of pairs found in an unspecified order,
-        which might exceed the  requested  max_num_neighbors*num_atoms.
-        In this case, the first number_found_pairs pairs in the output are
-        valid pairs, but the remaining pairs are omitted.
+        Contains the  total number  of pairs found.  Be aware  that if
+        check_errors   is   False,   this   might   be   larger   than
+        max_num_pairs. In  that case,  the output  tensors contain
+        only a subset of the pairs that were found, and the others are
+        omitted. Which pairs get omitted may vary between invocations.
 
     Exceptions
     ----------
-    If `max_num_neighbors > 0` and too small, `RuntimeError` is raised if check_errors=True.
+    If `max_num_pairs > 0` and too small, `RuntimeError` is raised if check_errors=True.
 
     Note
     ----
@@ -93,7 +93,7 @@ def getNeighborPairs(
     For  this  to  be  true, check_errors must be False.
 
     The CUDA implementation returns the atom pairs in non-determinist order,
-    if `max_num_neighbors > 0`.
+    if `max_num_pairs > 0`.
 
 
     Examples
@@ -119,7 +119,7 @@ def getNeighborPairs(
              [1., 0., 0.]]),
      tensor([1., nan, 1.]), tensor([3], dtype=torch.int32))
 
-    >>> getNeighborPairs(positions, cutoff=3.0, max_num_neighbors=2) # doctest: +NORMALIZE_WHITESPACE
+    >>> getNeighborPairs(positions, cutoff=3.0, max_num_pairs=6) # doctest: +NORMALIZE_WHITESPACE
     (tensor([[ 1,  2,  2, -1, -1, -1],
             [ 0,  0,  1, -1, -1, -1]], dtype=torch.int32), tensor([[1., 0., 0.],
             [2., 0., 0.],
@@ -128,7 +128,7 @@ def getNeighborPairs(
             [nan, nan, nan],
             [nan, nan, nan]]), tensor([1., 2., 1., nan, nan, nan]), tensor([6], dtype=torch.int32))
 
-    >>> getNeighborPairs(positions, cutoff=1.5, max_num_neighbors=2) # doctest: +NORMALIZE_WHITESPACE
+    >>> getNeighborPairs(positions, cutoff=1.5, max_num_pairs=6) # doctest: +NORMALIZE_WHITESPACE
     (tensor([[ 1,  2, -1, -1, -1, -1],
              [ 0,  1, -1, -1, -1, -1]], dtype=torch.int32), tensor([[1., 0., 0.],
             [1., 0., 0.],
@@ -142,6 +142,6 @@ def getNeighborPairs(
     if box_vectors is None:
         box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype)
     neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(
-        positions, cutoff, max_num_neighbors, box_vectors, check_errors
+        positions, cutoff, max_num_pairs, box_vectors, check_errors
     )
     return neighbors, deltas, distances, number_found_pairs
diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
index 44db772b..d63e24b2 100644
--- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp
+++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
@@ -18,7 +18,7 @@ using torch::round;
 
 static tuple<Tensor, Tensor, Tensor, Tensor> forward(const Tensor& positions,
 						     const Scalar& cutoff,
-						     const Scalar& max_num_neighbors,
+						     const Scalar& max_num_pairs,
 						     const Tensor& box_vectors,
 						     bool checkErrors) {
 
@@ -48,9 +48,9 @@ static tuple<Tensor, Tensor, Tensor, Tensor> forward(const Tensor& positions,
         TORCH_CHECK(v[1][1] >= 2*v[2][1], "Invalid box vectors: box_vectors[1][1] < 2*box_vectors[2][1]");
     }
 
-    const int max_num_neighbors_ = max_num_neighbors.to<int>();
-    TORCH_CHECK(max_num_neighbors_ > 0 || max_num_neighbors_ == -1,
-        "Expected \"max_num_neighbors\" to be positive or equal to -1");
+    const int max_num_pairs_ = max_num_pairs.to<int>();
+    TORCH_CHECK(max_num_pairs_ > 0 || max_num_pairs_ == -1,
+        "Expected \"max_num_pairs\" to be positive or equal to -1");
 
     const int num_atoms = positions.size(0);
     const int num_pairs = num_atoms * (num_atoms - 1) / 2;
@@ -69,7 +69,7 @@ static tuple<Tensor, Tensor, Tensor, Tensor> forward(const Tensor& positions,
     }
     Tensor distances = frobenius_norm(deltas, 1);
 
-    if (max_num_neighbors_ == -1) {
+    if (max_num_pairs_ == -1) {
         const Tensor mask = distances > cutoff;
         neighbors.index_put_({Slice(), mask}, -1);
         deltas = deltas.clone(); // Break an autograd loop
@@ -83,10 +83,10 @@ static tuple<Tensor, Tensor, Tensor, Tensor> forward(const Tensor& positions,
         deltas = deltas.index({mask, Slice()});
         distances = distances.index({mask});
 
-        const int num_pad = num_atoms * max_num_neighbors_ - distances.size(0);
+        const int num_pad = max_num_pairs_ - distances.size(0);
         if (checkErrors) {
             TORCH_CHECK(num_pad >= 0,
-                "The maximum number of pairs has been exceed! Increase \"max_num_neighbors\"");
+                "The maximum number of pairs has been exceed! Increase \"max_num_pairs\"");
         }
         if (num_pad > 0) {
             neighbors = hstack({neighbors, full({2, num_pad}, -1, neighbors.options())});
@@ -101,8 +101,8 @@ static tuple<Tensor, Tensor, Tensor, Tensor> forward(const Tensor& positions,
 
 TORCH_LIBRARY_IMPL(neighbors, CPU, m) {
   m.impl("getNeighborPairs",
-	   [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors,
+	   [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_pairs,
 	      const Tensor& box_vectors, const bool &checkErrors){
-	       return forward(positions, cutoff, max_num_neighbors, box_vectors, checkErrors);
+	       return forward(positions, cutoff, max_num_pairs, box_vectors, checkErrors);
 	 });
 }
diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
index 3b3d8073..23540af6 100644
--- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu
+++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu
@@ -105,7 +105,7 @@ public:
     static tensor_list forward(AutogradContext* ctx,
                                const Tensor& positions,
                                const Scalar& cutoff,
-                               const Scalar& max_num_neighbors,
+                               const Scalar& max_num_pairs,
                                const Tensor& box_vectors,
 			       bool checkErrors) {
         const auto stream = getCurrentCUDAStream(positions.get_device());
@@ -114,9 +114,8 @@ public:
         TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0");
         TORCH_CHECK(positions.size(1) == 3, "Expected the 2nd dimension size of \"positions\" to be 3");
         TORCH_CHECK(positions.is_contiguous(), "Expected \"positions\" to be contiguous");
-	int max_num_neighbors_ = max_num_neighbors.to<int>();
-        TORCH_CHECK(max_num_neighbors_ > 0 || max_num_neighbors_ == -1,
-            "Expected \"max_num_neighbors\" to be positive or equal to -1");
+        TORCH_CHECK(max_num_pairs.toInt() > 0 || max_num_pairs.toInt() == -1,
+            "Expected \"max_num_pairs\" to be positive or equal to -1");
 
         const bool use_periodic = (box_vectors.size(0) != 0);
         if (use_periodic) {
@@ -125,19 +124,19 @@ public:
         }
 
         // Decide the algorithm
-        const bool store_all_pairs = max_num_neighbors_ == -1;
+        const bool store_all_pairs = max_num_pairs.toInt() == -1;
         const int num_atoms = positions.size(0);
         const int num_all_pairs = num_atoms * (num_atoms - 1) / 2;
-        const int max_num_pairs = store_all_pairs ? num_all_pairs : (num_atoms * max_num_neighbors_);
+	const int max_num_pairs_ = store_all_pairs ? num_all_pairs : (max_num_pairs.toInt());
 
         const int num_threads = 128;
         const int num_blocks = max((num_all_pairs + num_threads - 1) / num_threads, 1);
 
         const TensorOptions options = positions.options();
         const Tensor i_curr_pair = zeros(1, options.dtype(kInt32));
-        const Tensor neighbors = full({2, max_num_pairs}, -1, options.dtype(kInt32));
-        const Tensor deltas = full({max_num_pairs, 3}, NAN, options);
-        const Tensor distances = full(max_num_pairs, NAN, options);
+        const Tensor neighbors = full({2, max_num_pairs_}, -1, options.dtype(kInt32));
+        const Tensor deltas = full({max_num_pairs_, 3}, NAN, options);
+        const Tensor distances = full(max_num_pairs_, NAN, options);
 
         AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() {
 	  const scalar_t cutoff_ = cutoff.to<scalar_t>();
@@ -157,7 +156,7 @@ public:
         // Synchronize and check the number of pairs found. Note that this is incompatible with CUDA graphs
         if (checkErrors) {
             int num_found_pairs = i_curr_pair.item<int32_t>();
-            TORCH_CHECK(num_found_pairs < max_num_pairs, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_pairs), " but found " + std::to_string(num_found_pairs));
+            TORCH_CHECK(num_found_pairs <= max_num_pairs_, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_pairs_), " but found " + std::to_string(num_found_pairs));
         }
         ctx->save_for_backward({neighbors, deltas, distances});
         ctx->saved_data["num_atoms"] = num_atoms;
@@ -198,9 +197,9 @@ public:
 
 TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) {
   m.impl("getNeighborPairs",
-	 [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors,
+	 [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_pairs,
 	    const Tensor& box_vectors, const bool &checkErrors){
-	     const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors,
+	     const tensor_list results = Autograd::apply(positions, cutoff, max_num_pairs,
 							 box_vectors, checkErrors);
 	     return make_tuple(results[0], results[1], results[2], results[3]);
 	 });

From c97a6f22d8dd596da1258e01409a905fdc525082 Mon Sep 17 00:00:00 2001
From: RaulPPealez <raulppelaez@gmail.com>
Date: Tue, 11 Apr 2023 13:49:53 +0200
Subject: [PATCH 41/41] Fix typo in comment

---
 src/pytorch/neighbors/getNeighborPairs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
index 2538b50d..12a4b03c 100644
--- a/src/pytorch/neighbors/getNeighborPairs.py
+++ b/src/pytorch/neighbors/getNeighborPairs.py
@@ -17,7 +17,7 @@ def getNeighborPairs(
     distance of each other.
 
     If  `max_num_pairs >  0`, a  fixed number  of the  atom pairs  are
-    returned.  This is  indeded for large molecule, where  most of the
+    returned.  This is  intended for large molecule, where  most of the
     atoms are beyond the cutoff distance of each other.
 
     This function optionally supports periodic boundary conditions with