Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add error checking to CUDA version of getNeighborPairs #80

Merged
merged 46 commits into from
Apr 14, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
657e748
Add error checking to CUDA version of getNeighborPairs
RaulPPelaez Jan 16, 2023
5339556
Add a new bool optional parameter to getNeighborPairs, setting it to …
RaulPPelaez Jan 19, 2023
8c13952
Remove unnecessarily static variable
RaulPPelaez Jan 19, 2023
928f123
Change the error handling of getNeighborPairs.
RaulPPelaez Mar 3, 2023
477e9cd
Make getNeighborPairs CUDA-graph compatible, add test for it
RaulPPelaez Mar 6, 2023
822c691
Remove incorrect comment
RaulPPelaez Mar 6, 2023
e46fe2d
Change not by !
RaulPPelaez Mar 6, 2023
e80cd5e
Move all torch.ops.load calls to the __init__.py scripts
RaulPPelaez Mar 7, 2023
2a7cd3a
Change how the location of libNNPOpsPyTorch.so is found at __init__ s…
RaulPPelaez Mar 9, 2023
e4df3cf
Remove spurious lines in CMakeLists.txt
RaulPPelaez Mar 9, 2023
d6eb763
Update again how libNNPOpsPyTorch.so is found in __init__.py
RaulPPelaez Mar 9, 2023
ca821c3
Remove redundant torch load
RaulPPelaez Mar 9, 2023
46ddf3d
Merge remote-tracking branch 'origin/master'
RaulPPelaez Mar 9, 2023
676f83b
Skip CUDA graph test if no GPU is available
RaulPPelaez Mar 9, 2023
d05656b
Remove incorrect path in __init__
RaulPPelaez Mar 14, 2023
4fb4b2e
Use relative path to load NNPOps library in __init__.py
RaulPPelaez Mar 15, 2023
bf56580
Copy test scripts to build directory, run them there
RaulPPelaez Mar 15, 2023
947f4d8
Remove unnecessary import
RaulPPelaez Mar 15, 2023
a258786
Merge branch 'fix_torch_load' into cuda_graphs
RaulPPelaez Mar 15, 2023
ae82f90
Some fixes for CUDA graph support in getNEighborPairs
RaulPPelaez Mar 17, 2023
4625684
Reverse logic for check_errors in getNeighborPairs.py
RaulPPelaez Mar 22, 2023
1376a5e
Merge branch 'cuda_graphs'
RaulPPelaez Mar 22, 2023
d711a3c
Reverse check_errors flag in the rest of the getNeighborPair-related …
RaulPPelaez Mar 22, 2023
400ceed
Clarify documentation on the error raised by getNeighborPairs
RaulPPelaez Mar 22, 2023
c36243b
Always return the number of found pairs in getNeighborPairs
RaulPPelaez Mar 22, 2023
5552a89
Merge remote-tracking branch 'origin/master'
RaulPPelaez Mar 22, 2023
8da1c5d
Revert "Always return the number of found pairs in getNeighborPairs"
RaulPPelaez Mar 22, 2023
829ee5b
Fix check_error interpretation in getNeighborPairs.py
RaulPPelaez Mar 22, 2023
73c3e58
Add return number of pairs functionality again
RaulPPelaez Mar 23, 2023
c2210f3
Update tests with new getNeighborPairs interface
RaulPPelaez Mar 23, 2023
fba2b46
Fix type decorator preventing jit.script from working on getNeighborP…
RaulPPelaez Mar 23, 2023
562d522
Remove sync_exceptions flag, simplifying the behavior and relation
RaulPPelaez Mar 29, 2023
751ee12
Remove unused function
RaulPPelaez Mar 29, 2023
ad8bbaf
Remove unnecessary synchronization in test
RaulPPelaez Mar 31, 2023
6593331
Clarify documentation of check_errors
RaulPPelaez Mar 31, 2023
75608cf
Clarify documentation of number_found_pairs
RaulPPelaez Mar 31, 2023
4c624e5
Clarify documentation of CUDA graph functionality
RaulPPelaez Mar 31, 2023
355860f
Remove obsolete comment
RaulPPelaez Mar 31, 2023
5ccc98f
Fix formatting
RaulPPelaez Mar 31, 2023
bc78d15
Fix formatting
RaulPPelaez Mar 31, 2023
e1a965a
Update documentation
RaulPPelaez Mar 31, 2023
130b13b
Change the (misleading) num_pairs variable name to max_num_pairs.
RaulPPelaez Mar 31, 2023
2d8d02b
Add test that checks if the max_num_neighbors per particle is
RaulPPelaez Mar 31, 2023
6a67cad
Merge remote-tracking branch 'origin/master'
RaulPPelaez Mar 31, 2023
90a584e
Change the meaning and name from max_num_neighbors (maximum number of…
RaulPPelaez Apr 4, 2023
c97a6f2
Fix typo in comment
RaulPPelaez Apr 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/pytorch/neighbors/TestNeighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def test_neighbor_grads(dtype, num_atoms, grad):
# The following test is only run on the CPU. Running it on the GPU triggers a
# CUDA assertion, which causes all tests run after it to fail.

@pytest.mark.parametrize('device', ['cpu'])
@pytest.mark.parametrize('device', ['cpu', 'cuda'])
@pytest.mark.parametrize('dtype', [pt.float32, pt.float64])
def test_too_many_neighbors(device, dtype):

Expand Down
28 changes: 25 additions & 3 deletions src/pytorch/neighbors/getNeighborPairsCUDA.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ template <typename scalar_t> __device__ __forceinline__ scalar_t sqrt_(scalar_t
template<> __device__ __forceinline__ float sqrt_(float x) { return ::sqrtf(x); };
template<> __device__ __forceinline__ double sqrt_(double x) { return ::sqrt(x); };

__device__ __managed__ int32_t tooManyNeighborsErrorFlag; //Error flag for forward_kernel

template <typename scalar_t> __global__ void forward_kernel(
const int32_t num_all_pairs,
const Accessor<scalar_t, 2> positions,
Expand Down Expand Up @@ -64,7 +66,12 @@ template <typename scalar_t> __global__ void forward_kernel(
if (distance2 > cutoff2) return;

const int32_t i_pair = store_all_pairs ? index : atomicAdd(&i_curr_pair[0], 1);
assert(i_pair < neighbors.size(1));
//If the maximum number of neighbours is surpassed encode the
//number of pairs found in a flag and exit
if(i_pair >= neighbors.size(1)){
atomicMin(&tooManyNeighborsErrorFlag, -i_pair);
return;
}

neighbors[0][i_pair] = row;
neighbors[1][i_pair] = column;
Expand Down Expand Up @@ -135,11 +142,19 @@ public:
const Tensor neighbors = full({2, num_pairs}, -1, options.dtype(kInt32));
const Tensor deltas = full({num_pairs, 3}, NAN, options);
const Tensor distances = full(num_pairs, NAN, options);

//Advice CUDA on expected usage of the error flag
//cudaStreamAttachMemAsync(stream, &tooManyNeighborsErrorFlag);
cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int),
cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
cudaMemAdvise(&tooManyNeighborsErrorFlag, sizeof(int),
cudaMemAdviseSetReadMostly, 0);
AT_DISPATCH_FLOATING_TYPES(positions.scalar_type(), "getNeighborPairs::forward", [&]() {
const CUDAStreamGuard guard(stream);
tooManyNeighborsErrorFlag = 0;
const scalar_t cutoff_ = cutoff.to<scalar_t>();
TORCH_CHECK(cutoff_ > 0, "Expected \"cutoff\" to be positive");
cudaEvent_t event;
cudaEventCreateWithFlags(&event, cudaEventDisableTiming | cudaEventBlockingSync);
forward_kernel<<<num_blocks, num_threads, 0, stream>>>(
num_all_pairs,
get_accessor<scalar_t, 2>(positions),
Expand All @@ -151,6 +166,13 @@ public:
get_accessor<scalar_t, 2>(deltas),
get_accessor<scalar_t, 1>(distances),
get_accessor<scalar_t, 2>(box_vectors));
cudaEventRecord(event, stream);
cudaEventSynchronize(event);
raimis marked this conversation as resolved.
Show resolved Hide resolved
//Check the error flag
TORCH_CHECK(tooManyNeighborsErrorFlag == 0, "Some particle has too many neighbours, found " +
raimis marked this conversation as resolved.
Show resolved Hide resolved
std::to_string(-tooManyNeighborsErrorFlag) + " but max is " +
std::to_string(max_num_neighbors.toInt()));
cudaEventDestroy(event);
});

ctx->save_for_backward({neighbors, deltas, distances});
Expand Down Expand Up @@ -197,4 +219,4 @@ TORCH_LIBRARY_IMPL(neighbors, AutogradCUDA, m) {
const tensor_list results = Autograd::apply(positions, cutoff, max_num_neighbors, box_vectors);
return make_tuple(results[0], results[1], results[2]);
});
}
}