openmm · raimis · Apr 14, 2023 · Jan 16, 2023 · Jan 19, 2023 · Jan 19, 2023
diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py
@@ -59,7 +59,7 @@ def test_neighbor_values(device, dtype, num_atoms, cutoff, all_pairs):
     max_num_neighbors = -1 if all_pairs else max(int(np.ceil(num_neighbors / num_atoms)), 1)
 
     # Compute results
-    neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors)
+    neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors)
 
     # Check device
     assert neighbors.device == positions.device
@@ -94,7 +94,7 @@ def test_neighbor_values(device, dtype, num_atoms, cutoff, all_pairs):
 @pytest.mark.parametrize('num_atoms', [1, 2, 3, 4, 5, 10, 100, 1000])
 @pytest.mark.parametrize('grad', ['deltas', 'distances', 'combined'])
 def test_neighbor_grads(device, dtype, num_atoms, grad):
-    
+
     if not pt.cuda.is_available() and device == 'cuda':
         pytest.skip('No GPU')
 
@@ -114,8 +114,8 @@ def test_neighbor_grads(device, dtype, num_atoms, grad):
     # Compute values using NNPOps
     positions.requires_grad_(True)
     print(positions)
-    neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff)
-    
+    neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff)
+
     assert pt.all(neighbors > -1)
     assert pt.all(neighbors == ref_neighbors)
     assert pt.allclose(deltas, ref_deltas)
@@ -133,29 +133,69 @@ def test_neighbor_grads(device, dtype, num_atoms, grad):
         (deltas.sum() + distances.sum()).backward()
     else:
         raise ValueError('grad')
-    
+
     if dtype == pt.float32:
         assert pt.allclose(ref_positions.grad, positions.grad, atol=1e-3, rtol=1e-3)
     else:
         assert pt.allclose(ref_positions.grad, positions.grad, atol=1e-8, rtol=1e-5)
 
 
-# The following test is only run on the CPU.  Running it on the GPU triggers a
-# CUDA assertion, which causes all tests run after it to fail.
-
-@pytest.mark.parametrize('device', ['cpu'])
+@pytest.mark.parametrize('device', ['cpu', 'cuda'])
 @pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
 def test_too_many_neighbors(device, dtype):
-
     if not pt.cuda.is_available() and device == 'cuda':
         pytest.skip('No GPU')
-
     # 4 points result into 6 pairs, but there is a storage just for 4.
+    positions = pt.zeros((4, 3,), device=device, dtype=dtype)
     with pytest.raises(RuntimeError):
-        positions = pt.zeros((4, 3,), device=device, dtype=dtype)
-        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1)
+        # check_errors=True raises because the number of pairs found exceeds the available storage
+        getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True)
         pt.cuda.synchronize()
 
+    # check_errors=False never raises when the number of pairs exceeds the storage. In addition, the call is compatible with CUDA graphs
+    neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False)
+    assert number_found_pairs == 6
+
+
+def test_is_cuda_graph_compatible():
+    if not pt.cuda.is_available():
+        pytest.skip('No GPU')
+    device = 'cuda'
+    dtype = pt.float32
+    num_atoms = 100
+    # Generate random positions on the GPU (the test is skipped above when CUDA is unavailable)
+    positions = 10 * pt.randn((num_atoms, 3), device=device, dtype=dtype)
+    cutoff = 5
+    # Build a NumPy reference: every pair (i, j) with i > j, its delta vector and its distance
+    ref_neighbors = np.vstack(np.tril_indices(num_atoms, -1))
+    ref_positions = positions.cpu().numpy()
+    ref_deltas = ref_positions[ref_neighbors[0]] - ref_positions[ref_neighbors[1]]
+    ref_distances = np.linalg.norm(ref_deltas, axis=1)
+
+    # Mask out the pairs beyond the cutoff (only the resulting pair count is used below)
+    mask = ref_distances > cutoff
+    ref_neighbors[:, mask] = -1
+    ref_deltas[mask, :] = np.nan
+    ref_distances[mask] = np.nan
+
+    # Count the reference pairs that are within the cutoff
+    num_neighbors = np.count_nonzero(np.logical_not(np.isnan(ref_distances)))
+    # Run the operation a few times on a side stream before capturing it
+    graph = pt.cuda.CUDAGraph()
+    s = pt.cuda.Stream()
+    s.wait_stream(pt.cuda.current_stream())
+    with pt.cuda.stream(s):
+        for _ in range(3):
+            neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1)
+    pt.cuda.synchronize()
+    # Capture a single call into the CUDA graph (check_errors is not passed)
+    with pt.cuda.graph(graph):
+        neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1)
+    # Replay the captured graph and wait for it to finish; there are no assertions, the test passes if nothing raises
+    graph.replay()
+    pt.cuda.synchronize()
+
+
 @pytest.mark.parametrize('device', ['cpu', 'cuda'])
 @pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
 def test_periodic_neighbors(device, dtype):
@@ -190,7 +230,7 @@ def test_periodic_neighbors(device, dtype):
     max_num_neighbors = max(int(np.ceil(num_neighbors / num_atoms)), 1)
 
     # Compute results
-    neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors, box_vectors=box_vectors)
+    neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors, box_vectors=box_vectors)
 
     # Check device
     assert neighbors.device == positions.device
@@ -228,7 +268,7 @@ class ForceModule(pt.nn.Module):
 
         def forward(self, positions):
 
-            neighbors, deltas, distances = getNeighborPairs(positions, cutoff=1.0)
+            neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=1.0)
             mask = pt.isnan(distances)
             distances = distances[~mask]
             return pt.sum(distances**2)

diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py
@@ -2,9 +2,14 @@
 from typing import Optional, Tuple
 
 
-def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = -1, box_vectors: Optional[Tensor] = None) -> Tuple[Tensor, Tensor, Tensor]:
-    '''
-    Returns indices and distances of atom pairs within a given cutoff distance.
+def getNeighborPairs(
+    positions: Tensor,
+    cutoff: float,
+    max_num_neighbors: int = -1,
+    box_vectors: Optional[Tensor] = None,
+    check_errors: bool = False
+) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+    """Returns indices and distances of atom pairs within a given cutoff distance.
 
     If `max_num_neighbors == -1` (default), all the atom pairs are returned,
     i.e. `num_pairs = num_atoms * (num_atoms + 1) / 2`. This is intended for
@@ -44,7 +49,10 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int =
         The vectors defining the periodic box.  This must have shape `(3, 3)`,
         where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`.
         If this is omitted, periodic boundary conditions are not applied.
-
+    check_errors: bool, optional
+        If set to False, the function does not raise when the number of pairs found is larger than the maximum.
+        If set to True, a `RuntimeError` is raised in that case.
+        Defaults to False.
     Returns
     -------
     neighbors: `torch.Tensor`
@@ -63,18 +71,26 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int =
         If an atom pair is separated by a larger distance than the cutoff,
         the distance is set to `NaN`.
 
+    number_found_pairs: `torch.Tensor`
+        Contains the total number of pairs found, which might exceed
+        the maximum allowed by `max_num_neighbors`; in that case the
+        rest of the output is left in an undefined state.
+
     Exceptions
     ----------
-    If `max_num_neighbors > 0` and too small, `RuntimeError` is raised.
+    If `max_num_neighbors > 0` and too small, `RuntimeError` is raised when `check_errors=True`.
 
     Note
     ----
-    The operation is compatible with CUDA Grahps, i.e. the shapes of the output
-    tensors are independed of the values of input tensors.
+    The operation can be compatible with CUDA Graphs, i.e. the shapes of the output
+    tensors are independent of the values of input tensors and no synchronizing operation is performed.
+
+    For this to be the case, `check_errors` must be False.
 
     The CUDA implementation returns the atom pairs in non-determinist order,
     if `max_num_neighbors > 0`.
 
+
     Examples
     --------
     >>> import torch as pt
@@ -88,39 +104,39 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int =
      tensor([[1., 0., 0.],
              [2., 0., 0.],
              [1., 0., 0.]]),
-     tensor([1., 2., 1.]))
+     tensor([1., 2., 1.]), tensor([3], dtype=torch.int32))
 
     >>> getNeighborPairs(positions, cutoff=1.5) # doctest: +NORMALIZE_WHITESPACE
     (tensor([[ 1, -1,  2],
              [ 0, -1,  1]], dtype=torch.int32),
      tensor([[1., 0., 0.],
              [nan, nan, nan],
              [1., 0., 0.]]),
-     tensor([1., nan, 1.]))
+     tensor([1., nan, 1.]), tensor([3], dtype=torch.int32))
 
     >>> getNeighborPairs(positions, cutoff=3.0, max_num_neighbors=2) # doctest: +NORMALIZE_WHITESPACE
     (tensor([[ 1,  2,  2, -1, -1, -1],
-             [ 0,  0,  1, -1, -1, -1]], dtype=torch.int32),
-     tensor([[1., 0., 0.],
-             [2., 0., 0.],
-             [1., 0., 0.],
-             [nan, nan, nan],
-             [nan, nan, nan],
-             [nan, nan, nan]]),
-     tensor([1., 2., 1., nan, nan, nan]))
+            [ 0,  0,  1, -1, -1, -1]], dtype=torch.int32), tensor([[1., 0., 0.],
+            [2., 0., 0.],
+            [1., 0., 0.],
+            [nan, nan, nan],
+            [nan, nan, nan],
+            [nan, nan, nan]]), tensor([1., 2., 1., nan, nan, nan]), tensor([6], dtype=torch.int32))
 
     >>> getNeighborPairs(positions, cutoff=1.5, max_num_neighbors=2) # doctest: +NORMALIZE_WHITESPACE
     (tensor([[ 1,  2, -1, -1, -1, -1],
-             [ 0,  1, -1, -1, -1, -1]], dtype=torch.int32),
-     tensor([[1., 0., 0.],
-             [1., 0., 0.],
-             [nan, nan, nan],
-             [nan, nan, nan],
-             [nan, nan, nan],
-             [nan, nan, nan]]),
-     tensor([1., 1., nan, nan, nan, nan]))
-    '''
+             [ 0,  1, -1, -1, -1, -1]], dtype=torch.int32), tensor([[1., 0., 0.],
+            [1., 0., 0.],
+            [nan, nan, nan],
+            [nan, nan, nan],
+            [nan, nan, nan],
+            [nan, nan, nan]]), tensor([1., 1., nan, nan, nan, nan]), tensor([6], dtype=torch.int32))
+
+    """
 
     if box_vectors is None:
         box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype)
-    return ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors)
+    neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(
+        positions, cutoff, max_num_neighbors, box_vectors, check_errors
+    )
+    return neighbors, deltas, distances, number_found_pairs
diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp
@@ -16,10 +16,11 @@ using torch::Tensor;
 using torch::outer;
 using torch::round;
 
-static tuple<Tensor, Tensor, Tensor> forward(const Tensor& positions,
-                                             const Scalar& cutoff,
-                                             const Scalar& max_num_neighbors,
-                                             const Tensor& box_vectors) {
+static tuple<Tensor, Tensor, Tensor, Tensor> forward(const Tensor& positions,
+						     const Scalar& cutoff,
+						     const Scalar& max_num_neighbors,
+						     const Tensor& box_vectors,
+						     bool checkErrors) {
 
     TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions");
     TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0");
@@ -83,19 +84,26 @@ static tuple<Tensor, Tensor, Tensor> forward(const Tensor& positions,
         distances = distances.index({mask});
 
         const int num_pad = num_atoms * max_num_neighbors_ - distances.size(0);
-        TORCH_CHECK(num_pad >= 0,
-            "The maximum number of pairs has been exceed! Increase \"max_num_neighbors\"");
-
+        if (checkErrors) {
+            TORCH_CHECK(num_pad >= 0,
+                "The maximum number of pairs has been exceed! Increase \"max_num_neighbors\"");
+        }
         if (num_pad > 0) {
             neighbors = hstack({neighbors, full({2, num_pad}, -1, neighbors.options())});
             deltas = vstack({deltas, full({num_pad, 3}, NAN, deltas.options())});
             distances = hstack({distances, full({num_pad}, NAN, distances.options())});
         }
     }
-
-    return {neighbors, deltas, distances};
+    Tensor num_pairs_found = torch::empty(1, indices.options().dtype(kInt32));
+    num_pairs_found[0] = distances.size(0);
+    return {neighbors, deltas, distances, num_pairs_found};
 }
 
 TORCH_LIBRARY_IMPL(neighbors, CPU, m) {
-    m.impl("getNeighborPairs", &forward);
-}
+  m.impl("getNeighborPairs",
+	   [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors,
+	      const Tensor& box_vectors, const bool &checkErrors){
+	       //checkErrors is forwarded unchanged; on the CPU a failed check throws synchronously
+	       return forward(positions, cutoff, max_num_neighbors, box_vectors, checkErrors);
+	 });
+}