From 657e7486132b9b49a3187c73fd36a048bb038bea Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Mon, 16 Jan 2023 17:18:46 +0100 Subject: [PATCH 01/41] Add error checking to CUDA version of getNeighborPairs --- src/pytorch/neighbors/TestNeighbors.py | 2 +- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 28 +++++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index 2b1365ab..7ec159ae 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -135,7 +135,7 @@ def test_neighbor_grads(dtype, num_atoms, grad): # The following test is only run on the CPU. Running it on the GPU triggers a # CUDA assertion, which causes all tests run after it to fail. -@pytest.mark.parametrize('device', ['cpu']) +@pytest.mark.parametrize('device', ['cpu', 'cuda']) @pytest.mark.parametrize('dtype', [pt.float32, pt.float64]) def test_too_many_neighbors(device, dtype): diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 2d820a4a..63ea9454 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -26,6 +26,8 @@ template __device__ __forceinline__ scalar_t sqrt_(scalar_t template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); }; template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); }; +__device__ __managed__ int32_t tooManyNeighborsErrorFlag; //Error flag for forward_kernel + template __global__ void forward_kernel( const int32_t num_all_pairs, const Accessor positions, @@ -64,7 +66,12 @@ template __global__ void forward_kernel( if (distance2 > cutoff2) return; const int32_t i_pair = store_all_pairs ? index : atomicAdd(&i_curr_pair[0], 1); - assert(i_pair < neighbors.size(1)); + //If the maximum number of neighbours is surpassed encode the + //number of pairs found in a flag and exit + if(i_pair >= neighbors.size(1)){ + atomicMin(&tooManyNeighborsErrorFlag, -i_pair); + return; + } neighbors[0][i_pair] = row; neighbors[1][i_pair] = column; @@ -135,11 +142,19 @@ public: const Tensor neighbors = full({2, num_pairs}, -1, options.dtype(kInt32)); const Tensor deltas = full({num_pairs, 3}, NAN, options); const Tensor distances = full(num_pairs, NAN, options); - + //Advice CUDA on expected usage of the error flag + //cudaStreamAttachMemAsync(stream, &tooManyNeighborsErrorFlag); + cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), + cudaMemAdviseSetAccessedBy, cudaCpuDeviceId); + cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), + cudaMemAdviseSetReadMostly, 0); AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() { const CUDAStreamGuard guard(stream); + tooManyNeighborsErrorFlag = 0; const scalar_t cutoff_ = cutoff.to(); TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive"); + cudaEvent_t event; + cudaEventCreateWithFlags(&event, cudaEventDisableTiming | cudaEventBlockingSync); forward_kernel<<>>( num_all_pairs, get_accessor(positions), @@ -151,6 +166,13 @@ public: get_accessor(deltas), get_accessor(distances), get_accessor(box_vectors)); + cudaEventRecord(event, stream); + cudaEventSynchronize(event); + //Check the error flag + TORCH_CHECK(tooManyNeighborsErrorFlag == 0, "Some particle has too many neighbours, found " + + std::to_string(-tooManyNeighborsErrorFlag) + " but max is " + + std::to_string(max_num_neighbors.toInt())); + cudaEventDestroy(event); }); ctx->save_for_backward({neighbors, deltas, distances}); @@ -197,4 +219,4 @@ TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) { const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors, box_vectors); return make_tuple(results[0], results[1], results[2]); }); -} \ No newline at end of file +} From 53395568cb96684a05027ad69cf0098172a1bf10 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 19 Jan 2023 14:55:09 +0100 Subject: [PATCH 02/41] Add a new bool optional parameter to getNeighborPairs, setting it to true will force the function to synchronize and throw an exception if some error was found, so it can be catched. The default will throw the error asynchronously, which will crash the program. In both cases a meaningful message is printed. --- src/pytorch/neighbors/TestNeighbors.py | 4 +- src/pytorch/neighbors/getNeighborPairs.py | 13 ++- src/pytorch/neighbors/getNeighborPairsCPU.cpp | 9 +- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 93 ++++++++++++++----- src/pytorch/neighbors/neighbors.cpp | 4 +- 5 files changed, 91 insertions(+), 32 deletions(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index 7ec159ae..2391f648 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -145,9 +145,11 @@ def test_too_many_neighbors(device, dtype): # 4 points result into 6 pairs, but there is a storage just for 4. with pytest.raises(RuntimeError): positions = pt.zeros((4, 3,), device=device, dtype=dtype) - getNeighborPairs(positions, cutoff=1, max_num_neighbors=1) + # Omitting check_errors makes the exception non-catchable + getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True) pt.cuda.synchronize() + @pytest.mark.parametrize('device', ['cpu', 'cuda']) @pytest.mark.parametrize('dtype', [pt.float32, pt.float64]) def test_periodic_neighbors(device, dtype): diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 55d20d48..5b8a0f29 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple -def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = -1, box_vectors: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: +def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = -1, box_vectors: Optional[Tensor] = None, check_errors: Optional[bool] = False) -> Tuple[Tensor, Tensor]: ''' Returns indices and distances of atom pairs within a given cutoff distance. @@ -44,6 +44,12 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = The vectors defining the periodic box. This must have shape `(3, 3)`, where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`. If this is omitted, periodic boundary conditions are not applied. + check_errors: bool, optional + If set to True the function is guaranteed to throw if an error + is encountered, synchronizing if necessary. + If set to False, an error arising in this function might crash + the code at some point after calling it. + Defaults to False. Returns ------- @@ -75,6 +81,8 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = The CUDA implementation returns the atom pairs in non-determinist order, if `max_num_neighbors > 0`. + The check_errors argument is forced to False if CUDA graphs are used. + Examples -------- >>> import torch as pt @@ -119,8 +127,9 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = [nan, nan, nan], [nan, nan, nan]]), tensor([1., 1., nan, nan, nan, nan])) + ''' if box_vectors is None: box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype) - return ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors) \ No newline at end of file + return ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors, check_errors) diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp index 19dfa7d8..5d6e04e2 100644 --- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp +++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp @@ -96,5 +96,10 @@ static tuple forward(const Tensor& positions, } TORCH_LIBRARY_IMPL(neighbors, CPU, m) { - m.impl("getNeighborPairs", &forward); -} \ No newline at end of file + m.impl("getNeighborPairs", + [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, + const Tensor& box_vectors, const bool &checkErrors){ + //The checkErrors flag is ignored, this function always checks for errors synchronously + return forward(positions, cutoff, max_num_neighbors, box_vectors); + }); +} diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 63ea9454..49cb7e2d 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include #include @@ -104,13 +106,47 @@ template __global__ void backward_kernel( atomicAdd(&grad_positions[i_atom][i_comp], grad); } +namespace detail{ + static std::exception_ptr tooManyNeighborsException = nullptr; + // Checks the too many neighbors flag and stores an exception if + // necessary to detail::tooManyNeighborsException. This function is + // intended to be launched via cudaLaunchHostFunc. + //data is a void pointer to a std::tuple, storing the + // maximum number of neighbors and whether to throw an uncatchable + // exception here (false) or store it for later (true). + void CUDART_CB checkTooManyNeighbors(void *data){ + int max_num_neighbors; + bool checkErrors; + std::tie(max_num_neighbors, checkErrors) = *static_cast*>(data); + // An exception thrown in a stream callback is not catchable (it + // runs in another thread), so we store it in an exception_ptr for + // it to be processed sometime later in the main thread. For + // performance reasons, the exception is thrown here + // asynchronously if the checkErrors flag is set to false + try{ + const int tooMan = tooManyNeighborsErrorFlag; + TORCH_CHECK(tooMan == 0, + "Some particle has too many neighbors, found " + + std::to_string(-tooMan) + " but max is " + + std::to_string(max_num_neighbors)); + } + catch(...){ + if(not checkErrors) + throw; + else + tooManyNeighborsException = std::current_exception(); + } + } +} + class Autograd : public Function { public: static tensor_list forward(AutogradContext* ctx, const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, - const Tensor& box_vectors) { + const Tensor& box_vectors, + bool checkErrors) { TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions"); TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0"); @@ -143,18 +179,15 @@ public: const Tensor deltas = full({num_pairs, 3}, NAN, options); const Tensor distances = full(num_pairs, NAN, options); //Advice CUDA on expected usage of the error flag - //cudaStreamAttachMemAsync(stream, &tooManyNeighborsErrorFlag); cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), - cudaMemAdviseSetAccessedBy, cudaCpuDeviceId); + cudaMemAdviseSetAccessedBy, cudaCpuDeviceId); cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), cudaMemAdviseSetReadMostly, 0); + tooManyNeighborsErrorFlag = 0; + const CUDAStreamGuard guard(stream); AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() { - const CUDAStreamGuard guard(stream); - tooManyNeighborsErrorFlag = 0; - const scalar_t cutoff_ = cutoff.to(); - TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive"); - cudaEvent_t event; - cudaEventCreateWithFlags(&event, cudaEventDisableTiming | cudaEventBlockingSync); + const scalar_t cutoff_ = cutoff.to(); + TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive"); forward_kernel<<>>( num_all_pairs, get_accessor(positions), @@ -166,18 +199,26 @@ public: get_accessor(deltas), get_accessor(distances), get_accessor(box_vectors)); - cudaEventRecord(event, stream); - cudaEventSynchronize(event); - //Check the error flag - TORCH_CHECK(tooManyNeighborsErrorFlag == 0, "Some particle has too many neighbours, found " + - std::to_string(-tooManyNeighborsErrorFlag) + " but max is " + - std::to_string(max_num_neighbors.toInt())); - cudaEventDestroy(event); }); - - ctx->save_for_backward({neighbors, deltas, distances}); + //Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs + cudaHostFn_t h_fn = detail::checkTooManyNeighbors; + static std::tuple h_fn_data; + h_fn_data = {max_num_neighbors_, checkErrors}; + cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data); + //Errors are thrown as exceptions asynchronously and in a way + //compatible with CUDA graphs. However, this way of throwing + //an exception makes it uncatchable, crashing the code. If + //the checkErrors flag is set to true an explicit + //synchronization barrier here forces to throw the exception + //from the main thread, making it catchable at the expense of + //a performance penalty each time the function is called. + if(checkErrors){ + cudaStreamSynchronize(stream); + if(detail::tooManyNeighborsException) + std::rethrow_exception(detail::tooManyNeighborsException); + } + ctx->save_for_backward({neighbors, deltas, distances}); ctx->saved_data["num_atoms"] = num_atoms; - return {neighbors, deltas, distances}; } @@ -209,14 +250,16 @@ public: get_accessor(grad_positions)); }); - return {grad_positions, Tensor(), Tensor(), Tensor()}; + return {grad_positions, Tensor(), Tensor(), Tensor(), Tensor()}; } }; TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) { - m.impl("getNeighborPairs", - [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, const Tensor& box_vectors){ - const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors, box_vectors); - return make_tuple(results[0], results[1], results[2]); - }); + m.impl("getNeighborPairs", + [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, + const Tensor& box_vectors, const bool &checkErrors){ + const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors, + box_vectors, checkErrors); + return make_tuple(results[0], results[1], results[2]); + }); } diff --git a/src/pytorch/neighbors/neighbors.cpp b/src/pytorch/neighbors/neighbors.cpp index d8dd5c5b..4614160d 100644 --- a/src/pytorch/neighbors/neighbors.cpp +++ b/src/pytorch/neighbors/neighbors.cpp @@ -1,5 +1,5 @@ #include TORCH_LIBRARY(neighbors, m) { - m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors) -> (Tensor neighbors, Tensor deltas, Tensor distances)"); -} \ No newline at end of file + m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors, bool checkErrors) -> (Tensor neighbors, Tensor deltas, Tensor distances)"); +} From 8c1395269ff6bf8535959a3b8fd7257da162ece1 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 19 Jan 2023 15:08:41 +0100 Subject: [PATCH 03/41] Remove unnecessarily static variable --- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 49cb7e2d..5a229fb3 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -202,8 +202,7 @@ public: }); //Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs cudaHostFn_t h_fn = detail::checkTooManyNeighbors; - static std::tuple h_fn_data; - h_fn_data = {max_num_neighbors_, checkErrors}; + std::tuple h_fn_data = {max_num_neighbors_, checkErrors}; cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data); //Errors are thrown as exceptions asynchronously and in a way //compatible with CUDA graphs. However, this way of throwing From 928f12356683d58d8145176e195ec915fcefc75e Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 3 Mar 2023 12:33:22 +0100 Subject: [PATCH 04/41] Change the error handling of getNeighborPairs. - Add a new optional flag, sync_exceptions on top of the current check_errors. - Three behaviors are possible: 1. Default (both false). Operation is CUDA-graph compatible and an uncatchable exception is thrown in case of number of pairs being too high. 2. check_errors=True. Operation is CUDA-graph compatible. No exception is thrown and the number of found pairs is returned, which can be higher than max_number_pairs. 3. check_errors=False and sync_exceptions=True. Operation is NOT CUDA-graph compatible. The operation synchronizes to check for errors and throws a catchable exception if necessary. --- src/pytorch/neighbors/TestNeighbors.py | 18 +++- src/pytorch/neighbors/getNeighborPairs.py | 48 ++++++--- src/pytorch/neighbors/getNeighborPairsCPU.cpp | 27 ++--- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 98 ++++++++++--------- src/pytorch/neighbors/neighbors.cpp | 2 +- 5 files changed, 115 insertions(+), 78 deletions(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index 2391f648..6c3ef655 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -138,17 +138,25 @@ def test_neighbor_grads(dtype, num_atoms, grad): @pytest.mark.parametrize('device', ['cpu', 'cuda']) @pytest.mark.parametrize('dtype', [pt.float32, pt.float64]) def test_too_many_neighbors(device, dtype): - if not pt.cuda.is_available() and device == 'cuda': pytest.skip('No GPU') - # 4 points result into 6 pairs, but there is a storage just for 4. + positions = pt.zeros((4, 3,), device=device, dtype=dtype) with pytest.raises(RuntimeError): - positions = pt.zeros((4, 3,), device=device, dtype=dtype) - # Omitting check_errors makes the exception non-catchable - getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True) + # checkErrors = False will throw due to exceeding neighbours + # syncExceptions = True makes this exception catchable at the + # expense of performance (even when no error ocurred) + getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False, sync_exceptions=True) pt.cuda.synchronize() + # checkErrors = True will never throw due to exceeding neighbours, + # but will return the number of pairs found. + # syncExceptions is ignored in this case + neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True) + assert number_found_pairs == 6 + + + @pytest.mark.parametrize('device', ['cpu', 'cuda']) @pytest.mark.parametrize('dtype', [pt.float32, pt.float64]) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 5b8a0f29..052741d6 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -2,9 +2,14 @@ from typing import Optional, Tuple -def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = -1, box_vectors: Optional[Tensor] = None, check_errors: Optional[bool] = False) -> Tuple[Tensor, Tensor]: - ''' - Returns indices and distances of atom pairs within a given cutoff distance. +def getNeighborPairs( + positions: Tensor, + cutoff: float, + max_num_neighbors: int = -1, + box_vectors: Optional[Tensor] = None, + check_errors: Optional[bool] = False, + sync_exceptions: Optional[bool] = False) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]: + '''Returns indices and distances of atom pairs within a given cutoff distance. If `max_num_neighbors == -1` (default), all the atom pairs are returned, i.e. `num_pairs = num_atoms * (num_atoms + 1) / 2`. This is intended for @@ -45,12 +50,18 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`. If this is omitted, periodic boundary conditions are not applied. check_errors: bool, optional - If set to True the function is guaranteed to throw if an error - is encountered, synchronizing if necessary. - If set to False, an error arising in this function might crash - the code at some point after calling it. + If set to True the function does not throw due to a number of pairs larger than the maximum. + If set to False, an exception will be thrown in that case. + Defaults to False. + sync_exceptions: bool, optional + If set to True the function will synchronize to check for + errors and throw an exception in the caller thread if + necessary. + If set to False it is possible that an exception thrown by + this function cannot bbe catched and result in a crash. + This flag is ignored if check_errors is True. + This flag must be False for the getNeighborPairs operation to be CUDA graph compatible. Defaults to False. - Returns ------- neighbors: `torch.Tensor` @@ -69,19 +80,26 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = If an atom pair is separated by a larger distance than the cutoff, the distance is set to `NaN`. + number_found_pairs: `Optional[torch.Tensor]` + Present if check_errors=True. Contains the total number of + pairs found, which might exceed the requested + max_num_neighbors, leaving the rest of the output in an + undefined state. + Exceptions ---------- - If `max_num_neighbors > 0` and too small, `RuntimeError` is raised. + If `max_num_neighbors > 0` and too small, `RuntimeError` is raised unless check_errors=True. Note ---- - The operation is compatible with CUDA Grahps, i.e. the shapes of the output - tensors are independed of the values of input tensors. + The operation can be compatible with CUDA Grahps, i.e. the shapes of the output + tensors are independed of the values of input tensors and no synchronizing operation is performed. + + For this to be the case sync_exceptions must be False. The CUDA implementation returns the atom pairs in non-determinist order, if `max_num_neighbors > 0`. - The check_errors argument is forced to False if CUDA graphs are used. Examples -------- @@ -132,4 +150,8 @@ def getNeighborPairs(positions: Tensor, cutoff: float, max_num_neighbors: int = if box_vectors is None: box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype) - return ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors, check_errors) + neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions) + if check_errors is True: + return neighbors, deltas, distances, number_found_pairs + else: + return neighbors, deltas, distances diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp index 5d6e04e2..c5bfb0c5 100644 --- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp +++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp @@ -16,10 +16,11 @@ using torch::Tensor; using torch::outer; using torch::round; -static tuple forward(const Tensor& positions, - const Scalar& cutoff, - const Scalar& max_num_neighbors, - const Tensor& box_vectors) { +static tuple forward(const Tensor& positions, + const Scalar& cutoff, + const Scalar& max_num_neighbors, + const Tensor& box_vectors, + bool checkErrors) { TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions"); TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0"); @@ -82,24 +83,26 @@ static tuple forward(const Tensor& positions, distances = distances.index({mask}); const int num_pad = num_atoms * max_num_neighbors_ - distances.size(0); - TORCH_CHECK(num_pad >= 0, - "The maximum number of pairs has been exceed! Increase \"max_num_neighbors\""); - + if (!checkErrors) { + TORCH_CHECK(num_pad >= 0, + "The maximum number of pairs has been exceed! Increase \"max_num_neighbors\""); + } if (num_pad > 0) { neighbors = hstack({neighbors, full({2, num_pad}, -1, neighbors.options())}); deltas = vstack({deltas, full({num_pad, 3}, NAN, deltas.options())}); distances = hstack({distances, full({num_pad}, NAN, distances.options())}); } } - - return {neighbors, deltas, distances}; + Tensor num_pairs_found = torch::empty(1, indices.options().dtype(kInt32)); + num_pairs_found[0] = distances.size(0); + return {neighbors, deltas, distances, num_pairs_found}; } TORCH_LIBRARY_IMPL(neighbors, CPU, m) { m.impl("getNeighborPairs", [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, - const Tensor& box_vectors, const bool &checkErrors){ - //The checkErrors flag is ignored, this function always checks for errors synchronously - return forward(positions, cutoff, max_num_neighbors, box_vectors); + const Tensor& box_vectors, const bool &checkErrors, const bool &syncExceptions){ + //The syncExceptions flag is ignored, this function always throws synchronously + return forward(positions, cutoff, max_num_neighbors, box_vectors, checkErrors); }); } diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 5a229fb3..639db5f5 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -106,37 +106,35 @@ template __global__ void backward_kernel( atomicAdd(&grad_positions[i_atom][i_comp], grad); } -namespace detail{ - static std::exception_ptr tooManyNeighborsException = nullptr; - // Checks the too many neighbors flag and stores an exception if - // necessary to detail::tooManyNeighborsException. This function is - // intended to be launched via cudaLaunchHostFunc. - //data is a void pointer to a std::tuple, storing the - // maximum number of neighbors and whether to throw an uncatchable - // exception here (false) or store it for later (true). - void CUDART_CB checkTooManyNeighbors(void *data){ +static std::exception_ptr tooManyNeighborsException = nullptr; +// Checks the too many neighbors flag and stores an exception if +// necessary to detail::tooManyNeighborsException. This function is +// intended to be launched via cudaLaunchHostFunc. +// data is a void pointer to a std::tuple, storing the +// maximum number of neighbors and whether to throw an uncatchable +// exception here or store it for later. +static void CUDART_CB checkTooManyNeighbors(void* data) { int max_num_neighbors; bool checkErrors; - std::tie(max_num_neighbors, checkErrors) = *static_cast*>(data); + bool syncExceptions; + std::tie(max_num_neighbors, checkErrors, syncExceptions) = *static_cast*>(data); // An exception thrown in a stream callback is not catchable (it // runs in another thread), so we store it in an exception_ptr for // it to be processed sometime later in the main thread. For // performance reasons, the exception is thrown here // asynchronously if the checkErrors flag is set to false - try{ - const int tooMan = tooManyNeighborsErrorFlag; - TORCH_CHECK(tooMan == 0, - "Some particle has too many neighbors, found " + - std::to_string(-tooMan) + " but max is " + - std::to_string(max_num_neighbors)); - } - catch(...){ - if(not checkErrors) - throw; - else - tooManyNeighborsException = std::current_exception(); + if (!checkErrors) { + try { + const int tooMan = tooManyNeighborsErrorFlag; + TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " but max is " + std::to_string(max_num_neighbors)); + } + catch (...) { + if (not syncExceptions) + throw; + else + tooManyNeighborsException = std::current_exception(); + } } - } } class Autograd : public Function { @@ -146,7 +144,8 @@ public: const Scalar& cutoff, const Scalar& max_num_neighbors, const Tensor& box_vectors, - bool checkErrors) { + bool checkErrors, + bool syncExceptions) { TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions"); TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0"); @@ -200,25 +199,30 @@ public: get_accessor(distances), get_accessor(box_vectors)); }); - //Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs - cudaHostFn_t h_fn = detail::checkTooManyNeighbors; - std::tuple h_fn_data = {max_num_neighbors_, checkErrors}; - cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data); - //Errors are thrown as exceptions asynchronously and in a way - //compatible with CUDA graphs. However, this way of throwing - //an exception makes it uncatchable, crashing the code. If - //the checkErrors flag is set to true an explicit - //synchronization barrier here forces to throw the exception - //from the main thread, making it catchable at the expense of - //a performance penalty each time the function is called. - if(checkErrors){ - cudaStreamSynchronize(stream); - if(detail::tooManyNeighborsException) - std::rethrow_exception(detail::tooManyNeighborsException); - } - ctx->save_for_backward({neighbors, deltas, distances}); + // Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs + cudaHostFn_t h_fn = checkTooManyNeighbors; + std::tuple h_fn_data = {max_num_neighbors_, checkErrors, syncExceptions}; + cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data); + // With chekErrors=false and syncExceptions=false the state of + // the tooManyErrorsFlag is checked and exceptions are thrown + // asynchronously and in a way compatible with CUDA graphs. + // However, this way of throwing an exception makes it + // uncatchable, crashing the code. + //If checkErrors=false the syncExceptions=true an explicit + // synchronization barrier here forces to throw the exception + // from the main thread, making it catchable at the expense of + // a performance penalty each time the function is called. + //Otherwise, if checkErrors=true, no exception is thrown and + //the user is responsible to check if the number of pairs is + //too high + if (!checkErrors && syncExceptions) { + cudaStreamSynchronize(stream); + if (tooManyNeighborsException) + std::rethrow_exception(tooManyNeighborsException); + } + ctx->save_for_backward({neighbors, deltas, distances}); ctx->saved_data["num_atoms"] = num_atoms; - return {neighbors, deltas, distances}; + return {neighbors, deltas, distances, i_curr_pair}; } static tensor_list backward(AutogradContext* ctx, tensor_list grad_inputs) { @@ -249,16 +253,16 @@ public: get_accessor(grad_positions)); }); - return {grad_positions, Tensor(), Tensor(), Tensor(), Tensor()}; + return {grad_positions, Tensor(), Tensor(), Tensor(), Tensor(), Tensor()}; } }; TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) { m.impl("getNeighborPairs", [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, - const Tensor& box_vectors, const bool &checkErrors){ - const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors, - box_vectors, checkErrors); - return make_tuple(results[0], results[1], results[2]); + const Tensor& box_vectors, const bool &checkErrors, const bool &syncExceptions){ + const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors, + box_vectors, checkErrors, syncExceptions); + return make_tuple(results[0], results[1], results[2], results[3]); }); } diff --git a/src/pytorch/neighbors/neighbors.cpp b/src/pytorch/neighbors/neighbors.cpp index 4614160d..d5588608 100644 --- a/src/pytorch/neighbors/neighbors.cpp +++ b/src/pytorch/neighbors/neighbors.cpp @@ -1,5 +1,5 @@ #include TORCH_LIBRARY(neighbors, m) { - m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors, bool checkErrors) -> (Tensor neighbors, Tensor deltas, Tensor distances)"); + m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors, bool checkErrors, bool syncExceptions) -> (Tensor neighbors, Tensor deltas, Tensor distances, Tensor num_pairs)"); } From 477e9cd5f1e31c187cbfbc6b1a7b44e5de7d013a Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Mon, 6 Mar 2023 16:34:14 +0100 Subject: [PATCH 05/41] Make getNeighborPairs CUDA-graph compatible, add test for it --- src/pytorch/neighbors/TestNeighbors.py | 35 +++++++++++++++++++ src/pytorch/neighbors/getNeighborPairsCUDA.cu | 25 ++++++++----- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index 6c3ef655..2a345601 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -156,6 +156,41 @@ def test_too_many_neighbors(device, dtype): assert number_found_pairs == 6 +def test_is_cuda_graph_compatible(): + device = 'cuda' + dtype = pt.float32 + num_atoms = 100 + # Generate random positions + positions = 10 * pt.randn((num_atoms, 3), device=device, dtype=dtype) + cutoff = 5 + # Get neighbor pairs + ref_neighbors = np.vstack(np.tril_indices(num_atoms, -1)) + ref_positions = positions.cpu().numpy() + ref_deltas = ref_positions[ref_neighbors[0]] - ref_positions[ref_neighbors[1]] + ref_distances = np.linalg.norm(ref_deltas, axis=1) + + # Filter the neighbor pairs + mask = ref_distances > cutoff + ref_neighbors[:, mask] = -1 + ref_deltas[mask, :] = np.nan + ref_distances[mask] = np.nan + + # Find the number of neighbors + num_neighbors = np.count_nonzero(np.logical_not(np.isnan(ref_distances))) + + graph = pt.cuda.CUDAGraph() + s = pt.cuda.Stream() + s.wait_stream(pt.cuda.current_stream()) + with pt.cuda.stream(s): + for _ in range(3): + neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1) + pt.cuda.synchronize() + + with pt.cuda.graph(graph): + neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1) + + graph.replay() + pt.cuda.synchronize() @pytest.mark.parametrize('device', ['cpu', 'cuda']) diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 639db5f5..7e2feac3 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -28,7 +28,7 @@ template __device__ __forceinline__ scalar_t sqrt_(scalar_t template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); }; template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); }; -__device__ __managed__ int32_t tooManyNeighborsErrorFlag; //Error flag for forward_kernel +__device__ __managed__ int32_t tooManyNeighborsErrorFlag; // Error flag for forward_kernel template __global__ void forward_kernel( const int32_t num_all_pairs, @@ -137,6 +137,12 @@ static void CUDART_CB checkTooManyNeighbors(void* data) { } } +static bool isStreamCapturing(cudaStream_t st) { + cudaStreamCaptureStatus graphStatus; + cudaStreamIsCapturing(st, &graphStatus); + return graphStatus == cudaStreamCaptureStatusActive; +} + class Autograd : public Function { public: static tensor_list forward(AutogradContext* ctx, @@ -146,7 +152,14 @@ public: const Tensor& box_vectors, bool checkErrors, bool syncExceptions) { - + const auto stream = getCurrentCUDAStream(positions.get_device()); + bool isCUDAGraphCapturing = isStreamCapturing(stream); + // Advice CUDA on expected usage of the error flag + if (!isCUDAGraphCapturing) { + cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId); + cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), cudaMemAdviseSetReadMostly, 0); + } + const CUDAStreamGuard guard(stream); TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions"); TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0"); TORCH_CHECK(positions.size(1) == 3, "Expected the 2nd dimension size of \"positions\" to be 3"); @@ -170,20 +183,14 @@ public: const int num_threads = 128; const int num_blocks = max((num_all_pairs + num_threads - 1) / num_threads, 1); - const auto stream = getCurrentCUDAStream(positions.get_device()); const TensorOptions options = positions.options(); const Tensor i_curr_pair = zeros(1, options.dtype(kInt32)); const Tensor neighbors = full({2, num_pairs}, -1, options.dtype(kInt32)); const Tensor deltas = full({num_pairs, 3}, NAN, options); const Tensor distances = full(num_pairs, NAN, options); - //Advice CUDA on expected usage of the error flag - cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), - cudaMemAdviseSetAccessedBy, cudaCpuDeviceId); - cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), - cudaMemAdviseSetReadMostly, 0); + tooManyNeighborsErrorFlag = 0; - const CUDAStreamGuard guard(stream); AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() { const scalar_t cutoff_ = cutoff.to(); TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive"); From 822c6916c50f7d2007f5c57d731a7e247c4a694d Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Mon, 6 Mar 2023 16:36:41 +0100 Subject: [PATCH 06/41] Remove incorrect comment --- src/pytorch/neighbors/TestNeighbors.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index 2a345601..834cbf81 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -132,9 +132,6 @@ def test_neighbor_grads(dtype, num_atoms, grad): else: assert pt.allclose(positions_cpu.grad, positions_cuda.grad.cpu(), atol=1e-8, rtol=1e-5) -# The following test is only run on the CPU. Running it on the GPU triggers a -# CUDA assertion, which causes all tests run after it to fail. - @pytest.mark.parametrize('device', ['cpu', 'cuda']) @pytest.mark.parametrize('dtype', [pt.float32, pt.float64]) def test_too_many_neighbors(device, dtype): From e46fe2d398a98670eaddb4e4e427098fb9a91125 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Mon, 6 Mar 2023 16:43:38 +0100 Subject: [PATCH 07/41] Change not by ! --- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 7e2feac3..04bdb474 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -129,7 +129,7 @@ static void CUDART_CB checkTooManyNeighbors(void* data) { TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " but max is " + std::to_string(max_num_neighbors)); } catch (...) { - if (not syncExceptions) + if (!syncExceptions) throw; else tooManyNeighborsException = std::current_exception(); From e80cd5ebcc7a959cab03e20ad316a0e1b6cfe744 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Tue, 7 Mar 2023 18:46:56 +0100 Subject: [PATCH 08/41] Move all torch.ops.load calls to the __init__.py scripts --- src/pytorch/BatchedNN.py | 2 -- src/pytorch/CFConv.py | 6 +----- src/pytorch/CFConvNeighbors.py | 5 +---- src/pytorch/SymmetryFunctions.py | 6 +----- src/pytorch/__init__.py | 7 ++++++- src/pytorch/neighbors/__init__.py | 7 ++++++- 6 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/pytorch/BatchedNN.py b/src/pytorch/BatchedNN.py index 7902be42..79f51ac0 100644 --- a/src/pytorch/BatchedNN.py +++ b/src/pytorch/BatchedNN.py @@ -21,14 +21,12 @@ # SOFTWARE. # -import os import torch from torch import nn from torch import Tensor from torch.nn import functional as F from typing import List, NamedTuple, Tuple, Union -torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) batchedLinear = torch.ops.NNPOpsBatchedNN.BatchedLinear diff --git a/src/pytorch/CFConv.py b/src/pytorch/CFConv.py index 7229cf06..ab40c246 100644 --- a/src/pytorch/CFConv.py +++ b/src/pytorch/CFConv.py @@ -21,15 +21,11 @@ # SOFTWARE. # -import os.path import torch from torch import Tensor from NNPOps.CFConvNeighbors import CFConvNeighbors -torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) -torch.classes.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) - class CFConv(torch.nn.Module): """ Optimized continious-filter convolution layer (CFConv) @@ -84,4 +80,4 @@ def __init__(self, gaussianWidth: float, activation: str, def forward(self, neighbors: CFConvNeighbors, positions: Tensor, input: Tensor) -> Tensor: - return CFConv.operation(self.holder, neighbors.holder, positions, input) \ No newline at end of file + return CFConv.operation(self.holder, neighbors.holder, positions, input) diff --git a/src/pytorch/CFConvNeighbors.py b/src/pytorch/CFConvNeighbors.py index e5ca8a9b..144f1b53 100644 --- a/src/pytorch/CFConvNeighbors.py +++ b/src/pytorch/CFConvNeighbors.py @@ -21,12 +21,9 @@ # SOFTWARE. # -import os.path import torch from torch import Tensor -torch.classes.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) - class CFConvNeighbors(torch.nn.Module): """ Optimized nearest-neighbor implementation for the continious-filter convolution (CFConf) @@ -45,4 +42,4 @@ def __init__(self, cutoff: float) -> None: @torch.jit.export def build(self, positions: Tensor) -> None: - self.holder.build(positions) \ No newline at end of file + self.holder.build(positions) diff --git a/src/pytorch/SymmetryFunctions.py b/src/pytorch/SymmetryFunctions.py index b54aaa9c..99dd78ed 100644 --- a/src/pytorch/SymmetryFunctions.py +++ b/src/pytorch/SymmetryFunctions.py @@ -21,14 +21,10 @@ # SOFTWARE. # -import os.path from typing import List, Optional, Tuple import torch from torch import Tensor -torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) -torch.classes.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) - Holder = torch.classes.NNPOpsANISymmetryFunctions.Holder operation = torch.ops.NNPOpsANISymmetryFunctions.operation @@ -124,4 +120,4 @@ def forward(self, species_positions: Tuple[Tensor, Tensor], radial, angular = operation(self.holder, positions[0], cell) features = torch.cat((radial, angular), dim=1).unsqueeze(0) - return species, features \ No newline at end of file + return species, features diff --git a/src/pytorch/__init__.py b/src/pytorch/__init__.py index d60de89b..b6042421 100644 --- a/src/pytorch/__init__.py +++ b/src/pytorch/__init__.py @@ -1,5 +1,10 @@ ''' High-performance PyTorch operations for neural network potentials ''' +import os.path +import site +import torch +torch.ops.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so")) +torch.classes.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so")) -from NNPOps.OptimizedTorchANI import OptimizedTorchANI \ No newline at end of file +from NNPOps.OptimizedTorchANI import OptimizedTorchANI diff --git a/src/pytorch/neighbors/__init__.py b/src/pytorch/neighbors/__init__.py index 4869f31d..a81e6a2c 100644 --- a/src/pytorch/neighbors/__init__.py +++ b/src/pytorch/neighbors/__init__.py @@ -1,5 +1,10 @@ ''' Neighbor operations ''' +import site +import os +import torch -from NNPOps.neighbors.getNeighborPairs import getNeighborPairs \ No newline at end of file +torch.ops.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so")) + +from NNPOps.neighbors.getNeighborPairs import getNeighborPairs From 2a7cd3a8968b14c2a5c9f91c35f0a9b0e1cb96b2 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 9 Mar 2023 09:32:03 +0100 Subject: [PATCH 09/41] Change how the location of libNNPOpsPyTorch.so is found at __init__ scripts --- src/pytorch/__init__.py | 6 +++--- src/pytorch/neighbors/__init__.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/pytorch/__init__.py b/src/pytorch/__init__.py index b6042421..d178744c 100644 --- a/src/pytorch/__init__.py +++ b/src/pytorch/__init__.py @@ -2,9 +2,9 @@ High-performance PyTorch operations for neural network potentials ''' import os.path -import site import torch -torch.ops.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so")) -torch.classes.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so")) + +torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) +torch.classes.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) from NNPOps.OptimizedTorchANI import OptimizedTorchANI diff --git a/src/pytorch/neighbors/__init__.py b/src/pytorch/neighbors/__init__.py index a81e6a2c..0bf85d3f 100644 --- a/src/pytorch/neighbors/__init__.py +++ b/src/pytorch/neighbors/__init__.py @@ -1,10 +1,9 @@ ''' Neighbor operations ''' -import site -import os +import os.path import torch -torch.ops.load_library(os.path.join(site.getsitepackages()[-1],"NNPOps", "libNNPOpsPyTorch.so")) +torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so')) from NNPOps.neighbors.getNeighborPairs import getNeighborPairs From e4df3cf07c2c5a49a6e8865cc8af4bcb134fc192 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 9 Mar 2023 10:19:58 +0100 Subject: [PATCH 10/41] Remove spurious lines in CMakeLists.txt --- CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 23badd95..d748c974 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,8 +77,6 @@ install(FILES src/pytorch/__init__.py src/pytorch/OptimizedTorchANI.py src/pytorch/SpeciesConverter.py src/pytorch/SymmetryFunctions.py - src/pytorch/neighbors/__init__.py - src/pytorch/neighbors/getNeighborPairs.py DESTINATION ${Python3_SITEARCH}/${NAME}) install(FILES src/pytorch/neighbors/__init__.py src/pytorch/neighbors/getNeighborPairs.py From d6eb763775ba172e9fa2a13fb96668c14bc84f0f Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 9 Mar 2023 10:28:32 +0100 Subject: [PATCH 11/41] Update again how libNNPOpsPyTorch.so is found in __init__.py --- src/pytorch/__init__.py | 12 ++++++++++-- src/pytorch/neighbors/__init__.py | 12 +++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/pytorch/__init__.py b/src/pytorch/__init__.py index d178744c..7a62ad6a 100644 --- a/src/pytorch/__init__.py +++ b/src/pytorch/__init__.py @@ -3,8 +3,16 @@ ''' import os.path import torch +import site + +# look for NNPOps/libNNPOpsPyTorch.so in all the paths returned by site.getsitepackages() +for path in site.getsitepackages(): + if os.path.exists(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')): + torch.ops.load_library(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')) + break +else: + # if we didn't find it, look for NNPOps/libNNPOpsPyTorch.so in the same directory as this file + torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so')) -torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) -torch.classes.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) from NNPOps.OptimizedTorchANI import OptimizedTorchANI diff --git a/src/pytorch/neighbors/__init__.py b/src/pytorch/neighbors/__init__.py index 0bf85d3f..c0dd2ed5 100644 --- a/src/pytorch/neighbors/__init__.py +++ b/src/pytorch/neighbors/__init__.py @@ -3,7 +3,17 @@ ''' import os.path import torch +import site + +# look for NNPOps/libNNPOpsPyTorch.so in all the paths returned by site.getsitepackages() +for path in site.getsitepackages(): + if os.path.exists(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')): + torch.ops.load_library(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')) + break +else: + # if we didn't find it, look for NNPOps/libNNPOpsPyTorch.so in the same directory as this file + torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so')) + -torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so')) from NNPOps.neighbors.getNeighborPairs import getNeighborPairs From ca821c3fb0eb554a6c68e6516c8b605c883a0053 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 9 Mar 2023 10:31:48 +0100 Subject: [PATCH 12/41] Remove redundant torch load --- src/pytorch/neighbors/__init__.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/pytorch/neighbors/__init__.py b/src/pytorch/neighbors/__init__.py index c0dd2ed5..6e3d5f60 100644 --- a/src/pytorch/neighbors/__init__.py +++ b/src/pytorch/neighbors/__init__.py @@ -1,19 +1,6 @@ ''' Neighbor operations ''' -import os.path import torch -import site - -# look for NNPOps/libNNPOpsPyTorch.so in all the paths returned by site.getsitepackages() -for path in site.getsitepackages(): - if os.path.exists(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')): - torch.ops.load_library(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')) - break -else: - # if we didn't find it, look for NNPOps/libNNPOpsPyTorch.so in the same directory as this file - torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so')) - - from NNPOps.neighbors.getNeighborPairs import getNeighborPairs From 676f83be7f968c5cc2e6fd20b3879caa04a0ad93 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 9 Mar 2023 12:38:04 +0100 Subject: [PATCH 13/41] Skip CUDA graph test if no GPU is available --- src/pytorch/neighbors/TestNeighbors.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index e8bcbfbc..92e13eda 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -94,7 +94,7 @@ def test_neighbor_values(device, dtype, num_atoms, cutoff, all_pairs): @pytest.mark.parametrize('num_atoms', [1, 2, 3, 4, 5, 10, 100, 1000]) @pytest.mark.parametrize('grad', ['deltas', 'distances', 'combined']) def test_neighbor_grads(device, dtype, num_atoms, grad): - + if not pt.cuda.is_available() and device == 'cuda': pytest.skip('No GPU') @@ -115,7 +115,7 @@ def test_neighbor_grads(device, dtype, num_atoms, grad): positions.requires_grad_(True) print(positions) neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff) - + assert pt.all(neighbors > -1) assert pt.all(neighbors == ref_neighbors) assert pt.allclose(deltas, ref_deltas) @@ -133,7 +133,7 @@ def test_neighbor_grads(device, dtype, num_atoms, grad): (deltas.sum() + distances.sum()).backward() else: raise ValueError('grad') - + if dtype == pt.float32: assert pt.allclose(ref_positions.grad, positions.grad, atol=1e-3, rtol=1e-3) else: @@ -162,6 +162,8 @@ def test_too_many_neighbors(device, dtype): def test_is_cuda_graph_compatible(): + if not pt.cuda.is_available(): + pytest.skip('No GPU') device = 'cuda' dtype = pt.float32 num_atoms = 100 From d05656b6fddd002750aecb80a227c5df734511fe Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Tue, 14 Mar 2023 12:47:16 +0100 Subject: [PATCH 14/41] Remove incorrect path in __init__ --- src/pytorch/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch/__init__.py b/src/pytorch/__init__.py index 7a62ad6a..56119c13 100644 --- a/src/pytorch/__init__.py +++ b/src/pytorch/__init__.py @@ -12,7 +12,7 @@ break else: # if we didn't find it, look for NNPOps/libNNPOpsPyTorch.so in the same directory as this file - torch.ops.load_library(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'libNNPOpsPyTorch.so')) + torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) from NNPOps.OptimizedTorchANI import OptimizedTorchANI From 4fb4b2e6d22bdb8c15b2c59cbaa117576a2b6393 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 15 Mar 2023 10:21:10 +0100 Subject: [PATCH 15/41] Use relative path to load NNPOps library in __init__.py --- src/pytorch/__init__.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/pytorch/__init__.py b/src/pytorch/__init__.py index 56119c13..645d4399 100644 --- a/src/pytorch/__init__.py +++ b/src/pytorch/__init__.py @@ -3,16 +3,8 @@ ''' import os.path import torch -import site -# look for NNPOps/libNNPOpsPyTorch.so in all the paths returned by site.getsitepackages() -for path in site.getsitepackages(): - if os.path.exists(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')): - torch.ops.load_library(os.path.join(path, 'NNPOps/libNNPOpsPyTorch.so')) - break -else: - # if we didn't find it, look for NNPOps/libNNPOpsPyTorch.so in the same directory as this file - torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) +torch.ops.load_library(os.path.join(os.path.dirname(__file__), 'libNNPOpsPyTorch.so')) from NNPOps.OptimizedTorchANI import OptimizedTorchANI From bf5658017db3459d8d4ae9f3dc1f02bc5ada71a6 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 15 Mar 2023 10:40:35 +0100 Subject: [PATCH 16/41] Copy test scripts to build directory, run them there --- CMakeLists.txt | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d748c974..6c414374 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,16 +56,26 @@ foreach(TEST_PATH ${TEST_PATHS}) add_test(${TEST_NAME} ${TEST_NAME}) endforeach() -# Tests of PyTorch wrappers -add_test(TestBatchedNN pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestBatchedNN.py) -add_test(TestCFConv pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestCFConv.py) -add_test(TestCFConvNeighbors pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestCFConvNeighbors.py) -add_test(TestEnergyShifter pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestEnergyShifter.py) -add_test(TestOptimizedTorchANI pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestOptimizedTorchANI.py) -add_test(TestSpeciesConverter pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSpeciesConverter.py) -add_test(TestSymmetryFunctions pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/TestSymmetryFunctions.py) -add_test(TestNeighbors pytest -v ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/TestNeighbors.py) -add_test(TestGetNeighborPairs pytest -v --doctest-modules ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/getNeighborPairs.py) +# Move test scripts to a test folder in the build directory, create test folder if necessary +add_custom_command(TARGET ${LIBRARY} POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/test) +add_custom_command(TARGET ${LIBRARY} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_SOURCE_DIR}/src/pytorch/Test*.py + ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/Test*.py + ${CMAKE_SOURCE_DIR}/src/pytorch/neighbors/getNeighborPairs.py + ${CMAKE_BINARY_DIR}/test) +add_custom_command(TARGET ${LIBRARY} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_SOURCE_DIR}/src/pytorch/molecules + ${CMAKE_BINARY_DIR}/test/molecules) + +# Add tests for all scripts in the test directory +file(GLOB_RECURSE PYTHON_TEST_PATHS ${CMAKE_BINARY_DIR}/test/Test*.py) +foreach(TEST_PATH ${PYTHON_TEST_PATHS}) + cmake_path(GET TEST_PATH STEM TEST_NAME) + add_test(${TEST_NAME} pytest -v ${CMAKE_BINARY_DIR}/test/${TEST_NAME}.py) +endforeach() +add_test(TestGetNeighborPairs pytest -v --doctest-modules ${CMAKE_BINARY_DIR}/test/getNeighborPairs.py) # Installation install(TARGETS ${LIBRARY} DESTINATION ${Python3_SITEARCH}/${NAME}) From 947f4d874abdf0eead0ff55506cb9707945e6bd6 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 15 Mar 2023 10:42:30 +0100 Subject: [PATCH 17/41] Remove unnecessary import --- src/pytorch/neighbors/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pytorch/neighbors/__init__.py b/src/pytorch/neighbors/__init__.py index 6e3d5f60..f6679583 100644 --- a/src/pytorch/neighbors/__init__.py +++ b/src/pytorch/neighbors/__init__.py @@ -1,6 +1,5 @@ ''' Neighbor operations ''' -import torch from NNPOps.neighbors.getNeighborPairs import getNeighborPairs From ae82f90aa481aa148bb052f8192bfbe80c3f4b42 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 17 Mar 2023 18:42:42 +0100 Subject: [PATCH 18/41] Some fixes for CUDA graph support in getNEighborPairs --- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 65 +++++++++---------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 04bdb474..d1494545 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -115,25 +115,22 @@ static std::exception_ptr tooManyNeighborsException = nullptr; // exception here or store it for later. static void CUDART_CB checkTooManyNeighbors(void* data) { int max_num_neighbors; - bool checkErrors; bool syncExceptions; - std::tie(max_num_neighbors, checkErrors, syncExceptions) = *static_cast*>(data); + std::tie(max_num_neighbors, syncExceptions) = *static_cast*>(data); // An exception thrown in a stream callback is not catchable (it // runs in another thread), so we store it in an exception_ptr for // it to be processed sometime later in the main thread. For // performance reasons, the exception is thrown here // asynchronously if the checkErrors flag is set to false - if (!checkErrors) { - try { - const int tooMan = tooManyNeighborsErrorFlag; - TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " but max is " + std::to_string(max_num_neighbors)); - } - catch (...) { - if (!syncExceptions) - throw; - else - tooManyNeighborsException = std::current_exception(); - } + try { + const int tooMan = tooManyNeighborsErrorFlag; + TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " total pairs but max per particle is " + std::to_string(max_num_neighbors)); + } + catch (...) { + if (!syncExceptions) + throw; + else + tooManyNeighborsException = std::current_exception(); } } @@ -164,8 +161,7 @@ public: TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0"); TORCH_CHECK(positions.size(1) == 3, "Expected the 2nd dimension size of \"positions\" to be 3"); TORCH_CHECK(positions.is_contiguous(), "Expected \"positions\" to be contiguous"); - - const int max_num_neighbors_ = max_num_neighbors.to(); + int max_num_neighbors_ = max_num_neighbors.to(); TORCH_CHECK(max_num_neighbors_ > 0 || max_num_neighbors_ == -1, "Expected \"max_num_neighbors\" to be positive or equal to -1"); @@ -207,26 +203,29 @@ public: get_accessor(box_vectors)); }); // Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs - cudaHostFn_t h_fn = checkTooManyNeighbors; - std::tuple h_fn_data = {max_num_neighbors_, checkErrors, syncExceptions}; - cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data); - // With chekErrors=false and syncExceptions=false the state of - // the tooManyErrorsFlag is checked and exceptions are thrown - // asynchronously and in a way compatible with CUDA graphs. - // However, this way of throwing an exception makes it - // uncatchable, crashing the code. - //If checkErrors=false the syncExceptions=true an explicit - // synchronization barrier here forces to throw the exception - // from the main thread, making it catchable at the expense of - // a performance penalty each time the function is called. - //Otherwise, if checkErrors=true, no exception is thrown and - //the user is responsible to check if the number of pairs is - //too high - if (!checkErrors && syncExceptions) { + if(!checkErrors){ + static constexpr cudaHostFn_t h_fn = checkTooManyNeighbors; + static std::tuple h_fn_data; + h_fn_data = {max_num_neighbors_, syncExceptions}; + cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data); + // With chekErrors=false and syncExceptions=false the state of + // the tooManyErrorsFlag is checked and exceptions are thrown + // asynchronously and in a way compatible with CUDA graphs. + // However, this way of throwing an exception makes it + // uncatchable, crashing the code. + //If checkErrors=false the syncExceptions=true an explicit + // synchronization barrier here forces to throw the exception + // from the main thread, making it catchable at the expense of + // a performance penalty each time the function is called. + //Otherwise, if checkErrors=true, no exception is thrown and + //the user is responsible to check if the number of pairs is + //too high + if (syncExceptions) { cudaStreamSynchronize(stream); if (tooManyNeighborsException) - std::rethrow_exception(tooManyNeighborsException); - } + std::rethrow_exception(tooManyNeighborsException); + } + } ctx->save_for_backward({neighbors, deltas, distances}); ctx->saved_data["num_atoms"] = num_atoms; return {neighbors, deltas, distances, i_curr_pair}; From 46256842954cc014709ee5903e0a75435e7036f6 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 22 Mar 2023 16:29:13 +0100 Subject: [PATCH 19/41] Reverse logic for check_errors in getNeighborPairs.py --- src/pytorch/neighbors/getNeighborPairs.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 052741d6..d06e7aec 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -50,16 +50,16 @@ def getNeighborPairs( where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`. If this is omitted, periodic boundary conditions are not applied. check_errors: bool, optional - If set to True the function does not throw due to a number of pairs larger than the maximum. - If set to False, an exception will be thrown in that case. + If set to False the function does not raise due to a number of pairs larger than the maximum. + If set to True, an exception will be raised in that case. Defaults to False. sync_exceptions: bool, optional If set to True the function will synchronize to check for - errors and throw an exception in the caller thread if + errors and raise an exception in the caller thread if necessary. - If set to False it is possible that an exception thrown by + If set to False it is possible that an exception raised by this function cannot bbe catched and result in a crash. - This flag is ignored if check_errors is True. + This flag is ignored if check_errors is False. This flag must be False for the getNeighborPairs operation to be CUDA graph compatible. Defaults to False. Returns @@ -81,14 +81,14 @@ def getNeighborPairs( the distance is set to `NaN`. number_found_pairs: `Optional[torch.Tensor]` - Present if check_errors=True. Contains the total number of + Present if check_errors=False. Contains the total number of pairs found, which might exceed the requested max_num_neighbors, leaving the rest of the output in an undefined state. Exceptions ---------- - If `max_num_neighbors > 0` and too small, `RuntimeError` is raised unless check_errors=True. + If `max_num_neighbors > 0` and too small, `RuntimeError` is raised unless check_errors=False. Note ---- @@ -151,7 +151,7 @@ def getNeighborPairs( if box_vectors is None: box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype) neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions) - if check_errors is True: + if check_errors is False: return neighbors, deltas, distances, number_found_pairs else: return neighbors, deltas, distances From d711a3ca7e3889661d93eff001d46fcc9167a48b Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 22 Mar 2023 16:37:25 +0100 Subject: [PATCH 20/41] Reverse check_errors flag in the rest of the getNeighborPair-related files --- src/pytorch/neighbors/TestNeighbors.py | 8 +++---- src/pytorch/neighbors/getNeighborPairs.py | 23 +++++++++++-------- src/pytorch/neighbors/getNeighborPairsCPU.cpp | 2 +- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 10 ++++---- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index 92e13eda..49e01b72 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -148,16 +148,16 @@ def test_too_many_neighbors(device, dtype): # 4 points result into 6 pairs, but there is a storage just for 4. positions = pt.zeros((4, 3,), device=device, dtype=dtype) with pytest.raises(RuntimeError): - # checkErrors = False will throw due to exceeding neighbours + # checkErrors = True will throw due to exceeding neighbours # syncExceptions = True makes this exception catchable at the # expense of performance (even when no error ocurred) - getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False, sync_exceptions=True) + getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True, sync_exceptions=True) pt.cuda.synchronize() - # checkErrors = True will never throw due to exceeding neighbours, + # checkErrors = False will never throw due to exceeding neighbours, # but will return the number of pairs found. # syncExceptions is ignored in this case - neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True) + neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False) assert number_found_pairs == 6 diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index d06e7aec..60913a04 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -3,13 +3,14 @@ def getNeighborPairs( - positions: Tensor, - cutoff: float, - max_num_neighbors: int = -1, - box_vectors: Optional[Tensor] = None, - check_errors: Optional[bool] = False, - sync_exceptions: Optional[bool] = False) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]: - '''Returns indices and distances of atom pairs within a given cutoff distance. + positions: Tensor, + cutoff: float, + max_num_neighbors: int = -1, + box_vectors: Optional[Tensor] = None, + check_errors: Optional[bool] = False, + sync_exceptions: Optional[bool] = False, +) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]: + """Returns indices and distances of atom pairs within a given cutoff distance. If `max_num_neighbors == -1` (default), all the atom pairs are returned, i.e. `num_pairs = num_atoms * (num_atoms + 1) / 2`. This is intended for @@ -58,7 +59,7 @@ def getNeighborPairs( errors and raise an exception in the caller thread if necessary. If set to False it is possible that an exception raised by - this function cannot bbe catched and result in a crash. + this function cannot be catched and result in a crash. This flag is ignored if check_errors is False. This flag must be False for the getNeighborPairs operation to be CUDA graph compatible. Defaults to False. @@ -146,11 +147,13 @@ def getNeighborPairs( [nan, nan, nan]]), tensor([1., 1., nan, nan, nan, nan])) - ''' + """ if box_vectors is None: box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype) - neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs(positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions) + neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs( + positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions + ) if check_errors is False: return neighbors, deltas, distances, number_found_pairs else: diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp index e35fbb96..f4de2d29 100644 --- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp +++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp @@ -84,7 +84,7 @@ static tuple forward(const Tensor& positions, distances = distances.index({mask}); const int num_pad = num_atoms * max_num_neighbors_ - distances.size(0); - if (!checkErrors) { + if (checkErrors) { TORCH_CHECK(num_pad >= 0, "The maximum number of pairs has been exceed! Increase \"max_num_neighbors\""); } diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index d1494545..b734f3b2 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -110,7 +110,7 @@ static std::exception_ptr tooManyNeighborsException = nullptr; // Checks the too many neighbors flag and stores an exception if // necessary to detail::tooManyNeighborsException. This function is // intended to be launched via cudaLaunchHostFunc. -// data is a void pointer to a std::tuple, storing the +// data is a void pointer to a std::tuple, storing the // maximum number of neighbors and whether to throw an uncatchable // exception here or store it for later. static void CUDART_CB checkTooManyNeighbors(void* data) { @@ -121,7 +121,7 @@ static void CUDART_CB checkTooManyNeighbors(void* data) { // runs in another thread), so we store it in an exception_ptr for // it to be processed sometime later in the main thread. For // performance reasons, the exception is thrown here - // asynchronously if the checkErrors flag is set to false + // asynchronously if the checkErrors flag is set to true try { const int tooMan = tooManyNeighborsErrorFlag; TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " total pairs but max per particle is " + std::to_string(max_num_neighbors)); @@ -203,7 +203,7 @@ public: get_accessor(box_vectors)); }); // Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs - if(!checkErrors){ + if(checkErrors){ static constexpr cudaHostFn_t h_fn = checkTooManyNeighbors; static std::tuple h_fn_data; h_fn_data = {max_num_neighbors_, syncExceptions}; @@ -213,11 +213,11 @@ public: // asynchronously and in a way compatible with CUDA graphs. // However, this way of throwing an exception makes it // uncatchable, crashing the code. - //If checkErrors=false the syncExceptions=true an explicit + //If checkErrors=true the syncExceptions=true an explicit // synchronization barrier here forces to throw the exception // from the main thread, making it catchable at the expense of // a performance penalty each time the function is called. - //Otherwise, if checkErrors=true, no exception is thrown and + //Otherwise, if checkErrors=false, no exception is thrown and //the user is responsible to check if the number of pairs is //too high if (syncExceptions) { From 400ceede3c8b25ca1a04573d87f6bd0d918deda7 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 22 Mar 2023 16:39:27 +0100 Subject: [PATCH 21/41] Clarify documentation on the error raised by getNeighborPairs --- src/pytorch/neighbors/getNeighborPairs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 60913a04..e5e28114 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -52,7 +52,7 @@ def getNeighborPairs( If this is omitted, periodic boundary conditions are not applied. check_errors: bool, optional If set to False the function does not raise due to a number of pairs larger than the maximum. - If set to True, an exception will be raised in that case. + If set to True, a RuntimeError will be raised in that case. Defaults to False. sync_exceptions: bool, optional If set to True the function will synchronize to check for From c36243b36955fa13e4c2f0f1dc552bfa11a60d66 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 22 Mar 2023 16:42:59 +0100 Subject: [PATCH 22/41] Always return the number of found pairs in getNeighborPairs --- src/pytorch/neighbors/getNeighborPairs.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index e5e28114..544d8124 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -9,7 +9,7 @@ def getNeighborPairs( box_vectors: Optional[Tensor] = None, check_errors: Optional[bool] = False, sync_exceptions: Optional[bool] = False, -) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]: +) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Returns indices and distances of atom pairs within a given cutoff distance. If `max_num_neighbors == -1` (default), all the atom pairs are returned, @@ -81,11 +81,10 @@ def getNeighborPairs( If an atom pair is separated by a larger distance than the cutoff, the distance is set to `NaN`. - number_found_pairs: `Optional[torch.Tensor]` - Present if check_errors=False. Contains the total number of - pairs found, which might exceed the requested - max_num_neighbors, leaving the rest of the output in an - undefined state. + number_found_pairs: `torch.Tensor` + Contains the total number of pairs found, which might exceed + the requested max_num_neighbors, leaving the rest of the + output in an undefined state. Exceptions ---------- @@ -154,7 +153,4 @@ def getNeighborPairs( neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs( positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions ) - if check_errors is False: - return neighbors, deltas, distances, number_found_pairs - else: - return neighbors, deltas, distances + return neighbors, deltas, distances, number_found_pairs From 8da1c5d77bbe0c95036dcb8382d29ee4aefd62b0 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 22 Mar 2023 17:00:26 +0100 Subject: [PATCH 23/41] Revert "Always return the number of found pairs in getNeighborPairs" This reverts commit c36243b36955fa13e4c2f0f1dc552bfa11a60d66. --- src/pytorch/neighbors/getNeighborPairs.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 544d8124..e5e28114 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -9,7 +9,7 @@ def getNeighborPairs( box_vectors: Optional[Tensor] = None, check_errors: Optional[bool] = False, sync_exceptions: Optional[bool] = False, -) -> Tuple[Tensor, Tensor, Tensor, Tensor]: +) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]: """Returns indices and distances of atom pairs within a given cutoff distance. If `max_num_neighbors == -1` (default), all the atom pairs are returned, @@ -81,10 +81,11 @@ def getNeighborPairs( If an atom pair is separated by a larger distance than the cutoff, the distance is set to `NaN`. - number_found_pairs: `torch.Tensor` - Contains the total number of pairs found, which might exceed - the requested max_num_neighbors, leaving the rest of the - output in an undefined state. + number_found_pairs: `Optional[torch.Tensor]` + Present if check_errors=False. Contains the total number of + pairs found, which might exceed the requested + max_num_neighbors, leaving the rest of the output in an + undefined state. Exceptions ---------- @@ -153,4 +154,7 @@ def getNeighborPairs( neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs( positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions ) - return neighbors, deltas, distances, number_found_pairs + if check_errors is False: + return neighbors, deltas, distances, number_found_pairs + else: + return neighbors, deltas, distances From 829ee5ba2c2787205cc1d259f8b095b99ebe3d26 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 22 Mar 2023 17:00:59 +0100 Subject: [PATCH 24/41] Fix check_error interpretation in getNeighborPairs.py --- src/pytorch/neighbors/getNeighborPairs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index e5e28114..6720c9c7 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -154,7 +154,7 @@ def getNeighborPairs( neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs( positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions ) - if check_errors is False: + if check_errors is True: return neighbors, deltas, distances, number_found_pairs else: return neighbors, deltas, distances From 73c3e5859d5c7e7d579a02de5fc2b92cf6ed603a Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 23 Mar 2023 11:56:16 +0100 Subject: [PATCH 25/41] Add return number of pairs functionality again This reverts commit 8da1c5d77bbe0c95036dcb8382d29ee4aefd62b0. --- src/pytorch/neighbors/getNeighborPairs.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 6720c9c7..544d8124 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -9,7 +9,7 @@ def getNeighborPairs( box_vectors: Optional[Tensor] = None, check_errors: Optional[bool] = False, sync_exceptions: Optional[bool] = False, -) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor]]: +) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Returns indices and distances of atom pairs within a given cutoff distance. If `max_num_neighbors == -1` (default), all the atom pairs are returned, @@ -81,11 +81,10 @@ def getNeighborPairs( If an atom pair is separated by a larger distance than the cutoff, the distance is set to `NaN`. - number_found_pairs: `Optional[torch.Tensor]` - Present if check_errors=False. Contains the total number of - pairs found, which might exceed the requested - max_num_neighbors, leaving the rest of the output in an - undefined state. + number_found_pairs: `torch.Tensor` + Contains the total number of pairs found, which might exceed + the requested max_num_neighbors, leaving the rest of the + output in an undefined state. Exceptions ---------- @@ -154,7 +153,4 @@ def getNeighborPairs( neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs( positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions ) - if check_errors is True: - return neighbors, deltas, distances, number_found_pairs - else: - return neighbors, deltas, distances + return neighbors, deltas, distances, number_found_pairs From c2210f38484c96f27151298ab1ea798f664d509d Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 23 Mar 2023 12:35:39 +0100 Subject: [PATCH 26/41] Update tests with new getNeighborPairs interface --- src/pytorch/neighbors/TestNeighbors.py | 12 ++++----- src/pytorch/neighbors/getNeighborPairs.py | 32 ++++++++++------------- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index 10cd43d2..932af259 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -59,7 +59,7 @@ def test_neighbor_values(device, dtype, num_atoms, cutoff, all_pairs): max_num_neighbors = -1 if all_pairs else max(int(np.ceil(num_neighbors / num_atoms)), 1) # Compute results - neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors) + neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors) # Check device assert neighbors.device == positions.device @@ -114,7 +114,7 @@ def test_neighbor_grads(device, dtype, num_atoms, grad): # Compute values using NNPOps positions.requires_grad_(True) print(positions) - neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff) + neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff) assert pt.all(neighbors > -1) assert pt.all(neighbors == ref_neighbors) @@ -190,11 +190,11 @@ def test_is_cuda_graph_compatible(): s.wait_stream(pt.cuda.current_stream()) with pt.cuda.stream(s): for _ in range(3): - neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1) + neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1) pt.cuda.synchronize() with pt.cuda.graph(graph): - neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1) + neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=num_neighbors+1) graph.replay() pt.cuda.synchronize() @@ -234,7 +234,7 @@ def test_periodic_neighbors(device, dtype): max_num_neighbors = max(int(np.ceil(num_neighbors / num_atoms)), 1) # Compute results - neighbors, deltas, distances = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors, box_vectors=box_vectors) + neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors, box_vectors=box_vectors) # Check device assert neighbors.device == positions.device @@ -272,7 +272,7 @@ class ForceModule(pt.nn.Module): def forward(self, positions): - neighbors, deltas, distances = getNeighborPairs(positions, cutoff=1.0) + neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=1.0) mask = pt.isnan(distances) distances = distances[~mask] return pt.sum(distances**2) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 544d8124..e30c8dfa 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -114,7 +114,7 @@ def getNeighborPairs( tensor([[1., 0., 0.], [2., 0., 0.], [1., 0., 0.]]), - tensor([1., 2., 1.])) + tensor([1., 2., 1.]), tensor([3], dtype=torch.int32)) >>> getNeighborPairs(positions, cutoff=1.5) # doctest: +NORMALIZE_WHITESPACE (tensor([[ 1, -1, 2], @@ -122,29 +122,25 @@ def getNeighborPairs( tensor([[1., 0., 0.], [nan, nan, nan], [1., 0., 0.]]), - tensor([1., nan, 1.])) + tensor([1., nan, 1.]), tensor([3], dtype=torch.int32)) >>> getNeighborPairs(positions, cutoff=3.0, max_num_neighbors=2) # doctest: +NORMALIZE_WHITESPACE (tensor([[ 1, 2, 2, -1, -1, -1], - [ 0, 0, 1, -1, -1, -1]], dtype=torch.int32), - tensor([[1., 0., 0.], - [2., 0., 0.], - [1., 0., 0.], - [nan, nan, nan], - [nan, nan, nan], - [nan, nan, nan]]), - tensor([1., 2., 1., nan, nan, nan])) + [ 0, 0, 1, -1, -1, -1]], dtype=torch.int32), tensor([[1., 0., 0.], + [2., 0., 0.], + [1., 0., 0.], + [nan, nan, nan], + [nan, nan, nan], + [nan, nan, nan]]), tensor([1., 2., 1., nan, nan, nan]), tensor([6], dtype=torch.int32)) >>> getNeighborPairs(positions, cutoff=1.5, max_num_neighbors=2) # doctest: +NORMALIZE_WHITESPACE (tensor([[ 1, 2, -1, -1, -1, -1], - [ 0, 1, -1, -1, -1, -1]], dtype=torch.int32), - tensor([[1., 0., 0.], - [1., 0., 0.], - [nan, nan, nan], - [nan, nan, nan], - [nan, nan, nan], - [nan, nan, nan]]), - tensor([1., 1., nan, nan, nan, nan])) + [ 0, 1, -1, -1, -1, -1]], dtype=torch.int32), tensor([[1., 0., 0.], + [1., 0., 0.], + [nan, nan, nan], + [nan, nan, nan], + [nan, nan, nan], + [nan, nan, nan]]), tensor([1., 1., nan, nan, nan, nan]), tensor([6], dtype=torch.int32)) """ From fba2b46650769c121b3a86ad61ae39ea35798b62 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 23 Mar 2023 12:35:52 +0100 Subject: [PATCH 27/41] Fix type decorator preventing jit.script from working on getNeighborPairs --- src/pytorch/neighbors/getNeighborPairs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index e30c8dfa..2a86e432 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -7,8 +7,8 @@ def getNeighborPairs( cutoff: float, max_num_neighbors: int = -1, box_vectors: Optional[Tensor] = None, - check_errors: Optional[bool] = False, - sync_exceptions: Optional[bool] = False, + check_errors: bool = False, + sync_exceptions: bool = False, ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Returns indices and distances of atom pairs within a given cutoff distance. From 562d5221310b896b2ac7d8427cc8d0c3e925358e Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 29 Mar 2023 15:03:16 +0200 Subject: [PATCH 28/41] Remove sync_exceptions flag, simplifying the behavior and relation with CUDA graphs. If check_errors=False (the default) getNeighborPairs does not check for errors and is compatible with graphs. If check_errors=True, the function raises if necessary but it is incompatible with graphs --- src/pytorch/neighbors/TestNeighbors.py | 10 +-- src/pytorch/neighbors/getNeighborPairs.py | 18 +--- src/pytorch/neighbors/getNeighborPairsCPU.cpp | 2 +- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 87 +++---------------- src/pytorch/neighbors/neighbors.cpp | 2 +- 5 files changed, 23 insertions(+), 96 deletions(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index 932af259..dfe57a17 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -148,15 +148,11 @@ def test_too_many_neighbors(device, dtype): # 4 points result into 6 pairs, but there is a storage just for 4. positions = pt.zeros((4, 3,), device=device, dtype=dtype) with pytest.raises(RuntimeError): - # checkErrors = True will throw due to exceeding neighbours - # syncExceptions = True makes this exception catchable at the - # expense of performance (even when no error ocurred) - getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True, sync_exceptions=True) + # checkErrors = True will raise due to exceeding neighbours + getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True) pt.cuda.synchronize() - # checkErrors = False will never throw due to exceeding neighbours, - # but will return the number of pairs found. - # syncExceptions is ignored in this case + # checkErrors = False will never throw due to exceeding neighbours. In addition, the call will be compatible with CUDA graphs neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False) assert number_found_pairs == 6 diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 2a86e432..4048bf14 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -7,8 +7,7 @@ def getNeighborPairs( cutoff: float, max_num_neighbors: int = -1, box_vectors: Optional[Tensor] = None, - check_errors: bool = False, - sync_exceptions: bool = False, + check_errors: bool = False ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Returns indices and distances of atom pairs within a given cutoff distance. @@ -54,15 +53,6 @@ def getNeighborPairs( If set to False the function does not raise due to a number of pairs larger than the maximum. If set to True, a RuntimeError will be raised in that case. Defaults to False. - sync_exceptions: bool, optional - If set to True the function will synchronize to check for - errors and raise an exception in the caller thread if - necessary. - If set to False it is possible that an exception raised by - this function cannot be catched and result in a crash. - This flag is ignored if check_errors is False. - This flag must be False for the getNeighborPairs operation to be CUDA graph compatible. - Defaults to False. Returns ------- neighbors: `torch.Tensor` @@ -88,14 +78,14 @@ def getNeighborPairs( Exceptions ---------- - If `max_num_neighbors > 0` and too small, `RuntimeError` is raised unless check_errors=False. + If `max_num_neighbors > 0` and too small, `RuntimeError` is raised if check_errors=True. Note ---- The operation can be compatible with CUDA Grahps, i.e. the shapes of the output tensors are independed of the values of input tensors and no synchronizing operation is performed. - For this to be the case sync_exceptions must be False. + For this to be the case check_errors must be False The CUDA implementation returns the atom pairs in non-determinist order, if `max_num_neighbors > 0`. @@ -147,6 +137,6 @@ def getNeighborPairs( if box_vectors is None: box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype) neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs( - positions, cutoff, max_num_neighbors, box_vectors, check_errors, sync_exceptions + positions, cutoff, max_num_neighbors, box_vectors, check_errors ) return neighbors, deltas, distances, number_found_pairs diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp index f4de2d29..3c669534 100644 --- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp +++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp @@ -102,7 +102,7 @@ static tuple forward(const Tensor& positions, TORCH_LIBRARY_IMPL(neighbors, CPU, m) { m.impl("getNeighborPairs", [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, - const Tensor& box_vectors, const bool &checkErrors, const bool &syncExceptions){ + const Tensor& box_vectors, const bool &checkErrors){ //The syncExceptions flag is ignored, this function always throws synchronously return forward(positions, cutoff, max_num_neighbors, box_vectors, checkErrors); }); diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index b734f3b2..8ee51b2b 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -28,8 +28,6 @@ template __device__ __forceinline__ scalar_t sqrt_(scalar_t template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); }; template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); }; -__device__ __managed__ int32_t tooManyNeighborsErrorFlag; // Error flag for forward_kernel - template __global__ void forward_kernel( const int32_t num_all_pairs, const Accessor positions, @@ -68,19 +66,15 @@ template __global__ void forward_kernel( if (distance2 > cutoff2) return; const int32_t i_pair = store_all_pairs ? index : atomicAdd(&i_curr_pair[0], 1); - //If the maximum number of neighbours is surpassed encode the - //number of pairs found in a flag and exit - if(i_pair >= neighbors.size(1)){ - atomicMin(&tooManyNeighborsErrorFlag, -i_pair); - return; + //We handle too many neighbors outside of the kernel + if(i_pair < neighbors.size(1)){ + neighbors[0][i_pair] = row; + neighbors[1][i_pair] = column; + deltas[i_pair][0] = delta_x; + deltas[i_pair][1] = delta_y; + deltas[i_pair][2] = delta_z; + distances[i_pair] = sqrt_(distance2); } - - neighbors[0][i_pair] = row; - neighbors[1][i_pair] = column; - deltas[i_pair][0] = delta_x; - deltas[i_pair][1] = delta_y; - deltas[i_pair][2] = delta_z; - distances[i_pair] = sqrt_(distance2); } template __global__ void backward_kernel( @@ -106,33 +100,6 @@ template __global__ void backward_kernel( atomicAdd(&grad_positions[i_atom][i_comp], grad); } -static std::exception_ptr tooManyNeighborsException = nullptr; -// Checks the too many neighbors flag and stores an exception if -// necessary to detail::tooManyNeighborsException. This function is -// intended to be launched via cudaLaunchHostFunc. -// data is a void pointer to a std::tuple, storing the -// maximum number of neighbors and whether to throw an uncatchable -// exception here or store it for later. -static void CUDART_CB checkTooManyNeighbors(void* data) { - int max_num_neighbors; - bool syncExceptions; - std::tie(max_num_neighbors, syncExceptions) = *static_cast*>(data); - // An exception thrown in a stream callback is not catchable (it - // runs in another thread), so we store it in an exception_ptr for - // it to be processed sometime later in the main thread. For - // performance reasons, the exception is thrown here - // asynchronously if the checkErrors flag is set to true - try { - const int tooMan = tooManyNeighborsErrorFlag; - TORCH_CHECK(tooMan == 0, "Some particle has too many neighbors, found " + std::to_string(-tooMan) + " total pairs but max per particle is " + std::to_string(max_num_neighbors)); - } - catch (...) { - if (!syncExceptions) - throw; - else - tooManyNeighborsException = std::current_exception(); - } -} static bool isStreamCapturing(cudaStream_t st) { cudaStreamCaptureStatus graphStatus; @@ -147,15 +114,9 @@ public: const Scalar& cutoff, const Scalar& max_num_neighbors, const Tensor& box_vectors, - bool checkErrors, - bool syncExceptions) { + bool checkErrors) { const auto stream = getCurrentCUDAStream(positions.get_device()); bool isCUDAGraphCapturing = isStreamCapturing(stream); - // Advice CUDA on expected usage of the error flag - if (!isCUDAGraphCapturing) { - cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId); - cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int), cudaMemAdviseSetReadMostly, 0); - } const CUDAStreamGuard guard(stream); TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions"); TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0"); @@ -186,7 +147,6 @@ public: const Tensor deltas = full({num_pairs, 3}, NAN, options); const Tensor distances = full(num_pairs, NAN, options); - tooManyNeighborsErrorFlag = 0; AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() { const scalar_t cutoff_ = cutoff.to(); TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive"); @@ -202,29 +162,10 @@ public: get_accessor(distances), get_accessor(box_vectors)); }); - // Check the error flag via cudaLaunchHostFunction so it is compatible with cuda graphs + // Synchronize and check the number of pairs found. Note that this is incompatible with CUDA graphs if(checkErrors){ - static constexpr cudaHostFn_t h_fn = checkTooManyNeighbors; - static std::tuple h_fn_data; - h_fn_data = {max_num_neighbors_, syncExceptions}; - cudaLaunchHostFunc(stream, h_fn, (void*)&h_fn_data); - // With chekErrors=false and syncExceptions=false the state of - // the tooManyErrorsFlag is checked and exceptions are thrown - // asynchronously and in a way compatible with CUDA graphs. - // However, this way of throwing an exception makes it - // uncatchable, crashing the code. - //If checkErrors=true the syncExceptions=true an explicit - // synchronization barrier here forces to throw the exception - // from the main thread, making it catchable at the expense of - // a performance penalty each time the function is called. - //Otherwise, if checkErrors=false, no exception is thrown and - //the user is responsible to check if the number of pairs is - //too high - if (syncExceptions) { - cudaStreamSynchronize(stream); - if (tooManyNeighborsException) - std::rethrow_exception(tooManyNeighborsException); - } + int num_pairs = i_curr_pair.item(); + TORCH_CHECK(num_pairs < max_num_neighbors_, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_neighbors_), " but found " + std::to_string(num_pairs)); } ctx->save_for_backward({neighbors, deltas, distances}); ctx->saved_data["num_atoms"] = num_atoms; @@ -266,9 +207,9 @@ public: TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) { m.impl("getNeighborPairs", [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, - const Tensor& box_vectors, const bool &checkErrors, const bool &syncExceptions){ + const Tensor& box_vectors, const bool &checkErrors){ const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors, - box_vectors, checkErrors, syncExceptions); + box_vectors, checkErrors); return make_tuple(results[0], results[1], results[2], results[3]); }); } diff --git a/src/pytorch/neighbors/neighbors.cpp b/src/pytorch/neighbors/neighbors.cpp index d5588608..e5911907 100644 --- a/src/pytorch/neighbors/neighbors.cpp +++ b/src/pytorch/neighbors/neighbors.cpp @@ -1,5 +1,5 @@ #include TORCH_LIBRARY(neighbors, m) { - m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors, bool checkErrors, bool syncExceptions) -> (Tensor neighbors, Tensor deltas, Tensor distances, Tensor num_pairs)"); + m.def("getNeighborPairs(Tensor positions, Scalar cutoff, Scalar max_num_neighbors, Tensor box_vectors, bool checkErrors) -> (Tensor neighbors, Tensor deltas, Tensor distances, Tensor num_pairs)"); } From 751ee1295963a4ff43ea03bce585b3495cbf3b6e Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Wed, 29 Mar 2023 15:55:09 +0200 Subject: [PATCH 29/41] Remove unused function --- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 8ee51b2b..5195b4ec 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -100,13 +100,6 @@ template __global__ void backward_kernel( atomicAdd(&grad_positions[i_atom][i_comp], grad); } - -static bool isStreamCapturing(cudaStream_t st) { - cudaStreamCaptureStatus graphStatus; - cudaStreamIsCapturing(st, &graphStatus); - return graphStatus == cudaStreamCaptureStatusActive; -} - class Autograd : public Function { public: static tensor_list forward(AutogradContext* ctx, @@ -116,7 +109,6 @@ public: const Tensor& box_vectors, bool checkErrors) { const auto stream = getCurrentCUDAStream(positions.get_device()); - bool isCUDAGraphCapturing = isStreamCapturing(stream); const CUDAStreamGuard guard(stream); TORCH_CHECK(positions.dim() == 2, "Expected \"positions\" to have two dimensions"); TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0"); From ad8bbaf0bfd20a3fb60cc8c4f02b1da8bd3b046a Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 31 Mar 2023 09:58:09 +0200 Subject: [PATCH 30/41] Remove unnecessary synchronization in test --- src/pytorch/neighbors/TestNeighbors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index dfe57a17..09aaec6c 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -150,7 +150,6 @@ def test_too_many_neighbors(device, dtype): with pytest.raises(RuntimeError): # checkErrors = True will raise due to exceeding neighbours getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True) - pt.cuda.synchronize() # checkErrors = False will never throw due to exceeding neighbours. In addition, the call will be compatible with CUDA graphs neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False) From 659333171a6d6bd888a43e1f9f95d71455afbaac Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 31 Mar 2023 10:00:46 +0200 Subject: [PATCH 31/41] Clarify documentation of check_errors --- src/pytorch/neighbors/getNeighborPairs.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 4048bf14..2209d174 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -50,9 +50,13 @@ def getNeighborPairs( where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`. If this is omitted, periodic boundary conditions are not applied. check_errors: bool, optional - If set to False the function does not raise due to a number of pairs larger than the maximum. - If set to True, a RuntimeError will be raised in that case. - Defaults to False. + If True, a RuntimeError is raised if more than max_num_neighbors pairs are found. + The error checking requires synchronization, which adds cost and makes this function + incompatible with CUDA graphs. If this argument is False, no error checking is performed. + This makes it faster and compatible with CUDA graphs, but it is your responsibility + to check the return value for number_found_pairs to make sure that no neighbors were missed. + Default: False + Returns ------- neighbors: `torch.Tensor` From 75608cfdd5adefa2ceae1711f9583add70c50738 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 31 Mar 2023 10:05:43 +0200 Subject: [PATCH 32/41] Clarify documentation of number_found_pairs --- src/pytorch/neighbors/getNeighborPairs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 2209d174..24458728 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -76,9 +76,9 @@ def getNeighborPairs( the distance is set to `NaN`. number_found_pairs: `torch.Tensor` - Contains the total number of pairs found, which might exceed - the requested max_num_neighbors, leaving the rest of the - output in an undefined state. + Contains the total number of pairs found in an unspecified order, + which might exceed the requested max_num_neighbors. In this case, the first number_found_pairs + pairs in the output are valid pairs, but the remaining pairs are omitted. Exceptions ---------- From 4c624e5610d81c422d500c59b54d365e98290d73 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 31 Mar 2023 10:07:04 +0200 Subject: [PATCH 33/41] Clarify documentation of CUDA graph functionality --- src/pytorch/neighbors/getNeighborPairs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 24458728..90fc6cdf 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -86,10 +86,10 @@ def getNeighborPairs( Note ---- - The operation can be compatible with CUDA Grahps, i.e. the shapes of the output - tensors are independed of the values of input tensors and no synchronizing operation is performed. - - For this to be the case check_errors must be False + The operation can be compatible with CUDA Graphs: the shapes of + the output tensors are independent of the values of input tensors, + and no synchronization is performed. + For this to be true, check_errors must be False. The CUDA implementation returns the atom pairs in non-determinist order, if `max_num_neighbors > 0`. From 355860f2e04034223a8d091bb5083678818a8a54 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 31 Mar 2023 10:07:30 +0200 Subject: [PATCH 34/41] Remove obsolete comment --- src/pytorch/neighbors/getNeighborPairsCPU.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp index 3c669534..44db772b 100644 --- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp +++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp @@ -103,7 +103,6 @@ TORCH_LIBRARY_IMPL(neighbors, CPU, m) { m.impl("getNeighborPairs", [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, const Tensor& box_vectors, const bool &checkErrors){ - //The syncExceptions flag is ignored, this function always throws synchronously return forward(positions, cutoff, max_num_neighbors, box_vectors, checkErrors); }); } From 5ccc98f35a2254104ae701e61519444645d72449 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 31 Mar 2023 10:12:04 +0200 Subject: [PATCH 35/41] Fix formatting --- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 5195b4ec..d00645a3 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -67,13 +67,13 @@ template __global__ void forward_kernel( const int32_t i_pair = store_all_pairs ? index : atomicAdd(&i_curr_pair[0], 1); //We handle too many neighbors outside of the kernel - if(i_pair < neighbors.size(1)){ - neighbors[0][i_pair] = row; - neighbors[1][i_pair] = column; - deltas[i_pair][0] = delta_x; - deltas[i_pair][1] = delta_y; - deltas[i_pair][2] = delta_z; - distances[i_pair] = sqrt_(distance2); + if (i_pair < neighbors.size(1)) { + neighbors[0][i_pair] = row; + neighbors[1][i_pair] = column; + deltas[i_pair][0] = delta_x; + deltas[i_pair][1] = delta_y; + deltas[i_pair][2] = delta_z; + distances[i_pair] = sqrt_(distance2); } } From bc78d15ab6071f4789bd53b01662b96a5b57aeea Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 31 Mar 2023 10:12:45 +0200 Subject: [PATCH 36/41] Fix formatting --- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index d00645a3..a4e5dd3a 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -155,10 +155,10 @@ public: get_accessor(box_vectors)); }); // Synchronize and check the number of pairs found. Note that this is incompatible with CUDA graphs - if(checkErrors){ - int num_pairs = i_curr_pair.item(); - TORCH_CHECK(num_pairs < max_num_neighbors_, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_neighbors_), " but found " + std::to_string(num_pairs)); - } + if (checkErrors) { + int num_pairs = i_curr_pair.item(); + TORCH_CHECK(num_pairs < max_num_neighbors_, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_neighbors_), " but found " + std::to_string(num_pairs)); + } ctx->save_for_backward({neighbors, deltas, distances}); ctx->saved_data["num_atoms"] = num_atoms; return {neighbors, deltas, distances, i_curr_pair}; From e1a965ae3c95aef31358b3df723f92cd6648d8d9 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 31 Mar 2023 11:14:05 +0200 Subject: [PATCH 37/41] Update documentation --- src/pytorch/neighbors/getNeighborPairs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 90fc6cdf..2886b5a7 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -77,8 +77,9 @@ def getNeighborPairs( number_found_pairs: `torch.Tensor` Contains the total number of pairs found in an unspecified order, - which might exceed the requested max_num_neighbors. In this case, the first number_found_pairs - pairs in the output are valid pairs, but the remaining pairs are omitted. + which might exceed the requested max_num_neighbors*num_atoms. + In this case, the first number_found_pairs pairs in the output are + valid pairs, but the remaining pairs are omitted. Exceptions ---------- From 130b13bed8579a098df2b1da2700e2a3d43f6e2c Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 31 Mar 2023 11:14:45 +0200 Subject: [PATCH 38/41] Change the (misleading) num_pairs variable name to max_num_pairs. Enforce that the found number of pairs is less than num_pairs --- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index a4e5dd3a..3b3d8073 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -128,16 +128,16 @@ public: const bool store_all_pairs = max_num_neighbors_ == -1; const int num_atoms = positions.size(0); const int num_all_pairs = num_atoms * (num_atoms - 1) / 2; - const int num_pairs = store_all_pairs ? num_all_pairs : num_atoms * max_num_neighbors_; + const int max_num_pairs = store_all_pairs ? num_all_pairs : (num_atoms * max_num_neighbors_); const int num_threads = 128; const int num_blocks = max((num_all_pairs + num_threads - 1) / num_threads, 1); const TensorOptions options = positions.options(); const Tensor i_curr_pair = zeros(1, options.dtype(kInt32)); - const Tensor neighbors = full({2, num_pairs}, -1, options.dtype(kInt32)); - const Tensor deltas = full({num_pairs, 3}, NAN, options); - const Tensor distances = full(num_pairs, NAN, options); + const Tensor neighbors = full({2, max_num_pairs}, -1, options.dtype(kInt32)); + const Tensor deltas = full({max_num_pairs, 3}, NAN, options); + const Tensor distances = full(max_num_pairs, NAN, options); AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() { const scalar_t cutoff_ = cutoff.to(); @@ -156,8 +156,8 @@ public: }); // Synchronize and check the number of pairs found. Note that this is incompatible with CUDA graphs if (checkErrors) { - int num_pairs = i_curr_pair.item(); - TORCH_CHECK(num_pairs < max_num_neighbors_, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_neighbors_), " but found " + std::to_string(num_pairs)); + int num_found_pairs = i_curr_pair.item(); + TORCH_CHECK(num_found_pairs < max_num_pairs, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_pairs), " but found " + std::to_string(num_found_pairs)); } ctx->save_for_backward({neighbors, deltas, distances}); ctx->saved_data["num_atoms"] = num_atoms; From 2d8d02b3dadb50c222540fa0d84c6959fe98c5e6 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 31 Mar 2023 11:15:40 +0200 Subject: [PATCH 39/41] Add test that checks if the max_num_neighbors per particle is enforced. Right now this does not pass, since the function allows that an atom has more neighbors than max_num_neighbors as long as num_found_pairs Date: Tue, 4 Apr 2023 11:05:22 +0200 Subject: [PATCH 40/41] Change the meaning and name from max_num_neighbors (maximum number of neighbors per particle) to max_num_pairs (maximum number of total pairs). --- src/pytorch/neighbors/TestNeighbors.py | 28 +++++++-------- src/pytorch/neighbors/getNeighborPairs.py | 36 +++++++++---------- src/pytorch/neighbors/getNeighborPairsCPU.cpp | 18 +++++----- src/pytorch/neighbors/getNeighborPairsCUDA.cu | 23 ++++++------ 4 files changed, 51 insertions(+), 54 deletions(-) diff --git a/src/pytorch/neighbors/TestNeighbors.py b/src/pytorch/neighbors/TestNeighbors.py index c0a0f615..7d830812 100644 --- a/src/pytorch/neighbors/TestNeighbors.py +++ b/src/pytorch/neighbors/TestNeighbors.py @@ -56,10 +56,10 @@ def test_neighbor_values(device, dtype, num_atoms, cutoff, all_pairs): # Find the number of neighbors num_neighbors = np.count_nonzero(np.logical_not(np.isnan(ref_distances))) - max_num_neighbors = -1 if all_pairs else max(int(np.ceil(num_neighbors / num_atoms)), 1) + max_num_pairs = -1 if all_pairs else max(int(num_neighbors), 1) # Compute results - neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_neighbors=max_num_neighbors) + neighbors, deltas, distances, _ = getNeighborPairs(positions, cutoff=cutoff, max_num_pairs=max_num_pairs) # Check device assert neighbors.device == positions.device @@ -83,7 +83,7 @@ def test_neighbor_values(device, dtype, num_atoms, cutoff, all_pairs): neighbors, deltas, distances = sort_neighbors(neighbors, deltas, distances) # Resize the reference - ref_neighbors, ref_deltas, ref_distances = resize_neighbors(ref_neighbors, ref_deltas, ref_distances, num_atoms * max_num_neighbors) + ref_neighbors, ref_deltas, ref_distances = resize_neighbors(ref_neighbors, ref_deltas, ref_distances, max_num_pairs) assert np.all(ref_neighbors == neighbors) assert np.allclose(ref_deltas, deltas, equal_nan=True) @@ -149,25 +149,23 @@ def test_too_many_neighbors(device, dtype): positions = pt.zeros((4, 3,), device=device, dtype=dtype) with pytest.raises(RuntimeError): # checkErrors = True will raise due to exceeding neighbours - getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=True) + getNeighborPairs(positions, cutoff=1, max_num_pairs=1, check_errors=True) # checkErrors = False will never throw due to exceeding neighbours. In addition, the call will be compatible with CUDA graphs - neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_neighbors=1, check_errors=False) + neighbors, deltas, distances, number_found_pairs = getNeighborPairs(positions, cutoff=1, max_num_pairs=1, check_errors=False) assert number_found_pairs == 6 @pytest.mark.parametrize('device', ['cpu', 'cuda']) @pytest.mark.parametrize('dtype', [pt.float32, pt.float64]) -def test_max_neighbors_means_per_particle(device, dtype): +def test_max_pairs_means_total(device, dtype): if not pt.cuda.is_available() and device == 'cuda': pytest.skip('No GPU') # 4 points result into 6 pairs. positions = pt.zeros((4, 3,), device=device, dtype=dtype) with pytest.raises(RuntimeError): # checkErrors = True should raise due to exceeding neighbours - # As of now this will not raise, since 6 Tuple[Tensor, Tensor, Tensor, Tensor]: """Returns indices and distances of atom pairs within a given cutoff distance. - If `max_num_neighbors == -1` (default), all the atom pairs are returned, + If `max_num_pairs == -1` (default), all the atom pairs are returned, i.e. `num_pairs = num_atoms * (num_atoms + 1) / 2`. This is intended for the small molecules, where almost all the atoms are within the cutoff distance of each other. - If `max_num_neighbors > 0`, a fixed number of the atom pair are returned, - i.e. `num_pairs = num_atoms * max_num_neighbors`. This is indeded for large - molecule, where most of the atoms are beyond the cutoff distance of each - other. + If `max_num_pairs > 0`, a fixed number of the atom pairs are + returned. This is indeded for large molecule, where most of the + atoms are beyond the cutoff distance of each other. This function optionally supports periodic boundary conditions with arbitrary triclinic boxes. The box vectors `a`, `b`, and `c` must satisfy @@ -42,15 +41,15 @@ def getNeighborPairs( data type has to be`torch.float32` or `torch.float64`. cutoff: float Maximum distance between atom pairs. - max_num_neighbors: int, optional - Maximum number of neighbors per atom. If set to `-1` (default), + max_num_pairs: int, optional + Maximum number of pairs (total number of neighbors). If set to `-1` (default), all possible combinations of atom pairs are included. box_vectors: `torch.Tensor`, optional The vectors defining the periodic box. This must have shape `(3, 3)`, where `box_vectors[0] = a`, `box_vectors[1] = b`, and `box_vectors[2] = c`. If this is omitted, periodic boundary conditions are not applied. check_errors: bool, optional - If True, a RuntimeError is raised if more than max_num_neighbors pairs are found. + If True, a RuntimeError is raised if more than max_num_pairs pairs are found. The error checking requires synchronization, which adds cost and makes this function incompatible with CUDA graphs. If this argument is False, no error checking is performed. This makes it faster and compatible with CUDA graphs, but it is your responsibility @@ -76,14 +75,15 @@ def getNeighborPairs( the distance is set to `NaN`. number_found_pairs: `torch.Tensor` - Contains the total number of pairs found in an unspecified order, - which might exceed the requested max_num_neighbors*num_atoms. - In this case, the first number_found_pairs pairs in the output are - valid pairs, but the remaining pairs are omitted. + Contains the total number of pairs found. Be aware that if + check_errors is False, this might be larger than + max_num_pairs. In that case, the output tensors contain + only a subset of the pairs that were found, and the others are + omitted. Which pairs get omitted may vary between invocations. Exceptions ---------- - If `max_num_neighbors > 0` and too small, `RuntimeError` is raised if check_errors=True. + If `max_num_pairs > 0` and too small, `RuntimeError` is raised if check_errors=True. Note ---- @@ -93,7 +93,7 @@ def getNeighborPairs( For this to be true, check_errors must be False. The CUDA implementation returns the atom pairs in non-determinist order, - if `max_num_neighbors > 0`. + if `max_num_pairs > 0`. Examples @@ -119,7 +119,7 @@ def getNeighborPairs( [1., 0., 0.]]), tensor([1., nan, 1.]), tensor([3], dtype=torch.int32)) - >>> getNeighborPairs(positions, cutoff=3.0, max_num_neighbors=2) # doctest: +NORMALIZE_WHITESPACE + >>> getNeighborPairs(positions, cutoff=3.0, max_num_pairs=6) # doctest: +NORMALIZE_WHITESPACE (tensor([[ 1, 2, 2, -1, -1, -1], [ 0, 0, 1, -1, -1, -1]], dtype=torch.int32), tensor([[1., 0., 0.], [2., 0., 0.], @@ -128,7 +128,7 @@ def getNeighborPairs( [nan, nan, nan], [nan, nan, nan]]), tensor([1., 2., 1., nan, nan, nan]), tensor([6], dtype=torch.int32)) - >>> getNeighborPairs(positions, cutoff=1.5, max_num_neighbors=2) # doctest: +NORMALIZE_WHITESPACE + >>> getNeighborPairs(positions, cutoff=1.5, max_num_pairs=6) # doctest: +NORMALIZE_WHITESPACE (tensor([[ 1, 2, -1, -1, -1, -1], [ 0, 1, -1, -1, -1, -1]], dtype=torch.int32), tensor([[1., 0., 0.], [1., 0., 0.], @@ -142,6 +142,6 @@ def getNeighborPairs( if box_vectors is None: box_vectors = empty((0, 0), device=positions.device, dtype=positions.dtype) neighbors, deltas, distances, number_found_pairs = ops.neighbors.getNeighborPairs( - positions, cutoff, max_num_neighbors, box_vectors, check_errors + positions, cutoff, max_num_pairs, box_vectors, check_errors ) return neighbors, deltas, distances, number_found_pairs diff --git a/src/pytorch/neighbors/getNeighborPairsCPU.cpp b/src/pytorch/neighbors/getNeighborPairsCPU.cpp index 44db772b..d63e24b2 100644 --- a/src/pytorch/neighbors/getNeighborPairsCPU.cpp +++ b/src/pytorch/neighbors/getNeighborPairsCPU.cpp @@ -18,7 +18,7 @@ using torch::round; static tuple forward(const Tensor& positions, const Scalar& cutoff, - const Scalar& max_num_neighbors, + const Scalar& max_num_pairs, const Tensor& box_vectors, bool checkErrors) { @@ -48,9 +48,9 @@ static tuple forward(const Tensor& positions, TORCH_CHECK(v[1][1] >= 2*v[2][1], "Invalid box vectors: box_vectors[1][1] < 2*box_vectors[2][1]"); } - const int max_num_neighbors_ = max_num_neighbors.to(); - TORCH_CHECK(max_num_neighbors_ > 0 || max_num_neighbors_ == -1, - "Expected \"max_num_neighbors\" to be positive or equal to -1"); + const int max_num_pairs_ = max_num_pairs.to(); + TORCH_CHECK(max_num_pairs_ > 0 || max_num_pairs_ == -1, + "Expected \"max_num_pairs\" to be positive or equal to -1"); const int num_atoms = positions.size(0); const int num_pairs = num_atoms * (num_atoms - 1) / 2; @@ -69,7 +69,7 @@ static tuple forward(const Tensor& positions, } Tensor distances = frobenius_norm(deltas, 1); - if (max_num_neighbors_ == -1) { + if (max_num_pairs_ == -1) { const Tensor mask = distances > cutoff; neighbors.index_put_({Slice(), mask}, -1); deltas = deltas.clone(); // Break an autograd loop @@ -83,10 +83,10 @@ static tuple forward(const Tensor& positions, deltas = deltas.index({mask, Slice()}); distances = distances.index({mask}); - const int num_pad = num_atoms * max_num_neighbors_ - distances.size(0); + const int num_pad = max_num_pairs_ - distances.size(0); if (checkErrors) { TORCH_CHECK(num_pad >= 0, - "The maximum number of pairs has been exceed! Increase \"max_num_neighbors\""); + "The maximum number of pairs has been exceed! Increase \"max_num_pairs\""); } if (num_pad > 0) { neighbors = hstack({neighbors, full({2, num_pad}, -1, neighbors.options())}); @@ -101,8 +101,8 @@ static tuple forward(const Tensor& positions, TORCH_LIBRARY_IMPL(neighbors, CPU, m) { m.impl("getNeighborPairs", - [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, + [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_pairs, const Tensor& box_vectors, const bool &checkErrors){ - return forward(positions, cutoff, max_num_neighbors, box_vectors, checkErrors); + return forward(positions, cutoff, max_num_pairs, box_vectors, checkErrors); }); } diff --git a/src/pytorch/neighbors/getNeighborPairsCUDA.cu b/src/pytorch/neighbors/getNeighborPairsCUDA.cu index 3b3d8073..23540af6 100644 --- a/src/pytorch/neighbors/getNeighborPairsCUDA.cu +++ b/src/pytorch/neighbors/getNeighborPairsCUDA.cu @@ -105,7 +105,7 @@ public: static tensor_list forward(AutogradContext* ctx, const Tensor& positions, const Scalar& cutoff, - const Scalar& max_num_neighbors, + const Scalar& max_num_pairs, const Tensor& box_vectors, bool checkErrors) { const auto stream = getCurrentCUDAStream(positions.get_device()); @@ -114,9 +114,8 @@ public: TORCH_CHECK(positions.size(0) > 0, "Expected the 1nd dimension size of \"positions\" to be more than 0"); TORCH_CHECK(positions.size(1) == 3, "Expected the 2nd dimension size of \"positions\" to be 3"); TORCH_CHECK(positions.is_contiguous(), "Expected \"positions\" to be contiguous"); - int max_num_neighbors_ = max_num_neighbors.to(); - TORCH_CHECK(max_num_neighbors_ > 0 || max_num_neighbors_ == -1, - "Expected \"max_num_neighbors\" to be positive or equal to -1"); + TORCH_CHECK(max_num_pairs.toInt() > 0 || max_num_pairs.toInt() == -1, + "Expected \"max_num_pairs\" to be positive or equal to -1"); const bool use_periodic = (box_vectors.size(0) != 0); if (use_periodic) { @@ -125,19 +124,19 @@ public: } // Decide the algorithm - const bool store_all_pairs = max_num_neighbors_ == -1; + const bool store_all_pairs = max_num_pairs.toInt() == -1; const int num_atoms = positions.size(0); const int num_all_pairs = num_atoms * (num_atoms - 1) / 2; - const int max_num_pairs = store_all_pairs ? num_all_pairs : (num_atoms * max_num_neighbors_); + const int max_num_pairs_ = store_all_pairs ? num_all_pairs : (max_num_pairs.toInt()); const int num_threads = 128; const int num_blocks = max((num_all_pairs + num_threads - 1) / num_threads, 1); const TensorOptions options = positions.options(); const Tensor i_curr_pair = zeros(1, options.dtype(kInt32)); - const Tensor neighbors = full({2, max_num_pairs}, -1, options.dtype(kInt32)); - const Tensor deltas = full({max_num_pairs, 3}, NAN, options); - const Tensor distances = full(max_num_pairs, NAN, options); + const Tensor neighbors = full({2, max_num_pairs_}, -1, options.dtype(kInt32)); + const Tensor deltas = full({max_num_pairs_, 3}, NAN, options); + const Tensor distances = full(max_num_pairs_, NAN, options); AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() { const scalar_t cutoff_ = cutoff.to(); @@ -157,7 +156,7 @@ public: // Synchronize and check the number of pairs found. Note that this is incompatible with CUDA graphs if (checkErrors) { int num_found_pairs = i_curr_pair.item(); - TORCH_CHECK(num_found_pairs < max_num_pairs, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_pairs), " but found " + std::to_string(num_found_pairs)); + TORCH_CHECK(num_found_pairs <= max_num_pairs_, "Too many neighbor pairs found. Maximum is " + std::to_string(max_num_pairs_), " but found " + std::to_string(num_found_pairs)); } ctx->save_for_backward({neighbors, deltas, distances}); ctx->saved_data["num_atoms"] = num_atoms; @@ -198,9 +197,9 @@ public: TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) { m.impl("getNeighborPairs", - [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_neighbors, + [](const Tensor& positions, const Scalar& cutoff, const Scalar& max_num_pairs, const Tensor& box_vectors, const bool &checkErrors){ - const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors, + const tensor_list results = Autograd::apply(positions, cutoff, max_num_pairs, box_vectors, checkErrors); return make_tuple(results[0], results[1], results[2], results[3]); }); From c97a6f22d8dd596da1258e01409a905fdc525082 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Tue, 11 Apr 2023 13:49:53 +0200 Subject: [PATCH 41/41] Fix typo in comment --- src/pytorch/neighbors/getNeighborPairs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch/neighbors/getNeighborPairs.py b/src/pytorch/neighbors/getNeighborPairs.py index 2538b50d..12a4b03c 100644 --- a/src/pytorch/neighbors/getNeighborPairs.py +++ b/src/pytorch/neighbors/getNeighborPairs.py @@ -17,7 +17,7 @@ def getNeighborPairs( distance of each other. If `max_num_pairs > 0`, a fixed number of the atom pairs are - returned. This is indeded for large molecule, where most of the + returned. This is intended for large molecule, where most of the atoms are beyond the cutoff distance of each other. This function optionally supports periodic boundary conditions with