From f9dc05fc27de9a959ad88cde9653b3255c732065 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih Balin Date: Fri, 6 Sep 2024 05:27:20 -0400 Subject: [PATCH 1/5] [GraphBolt][CUDA] `gb.cat`. --- graphbolt/include/graphbolt/cuda_ops.h | 10 +++ graphbolt/src/cat.cc | 41 +++++++++++ graphbolt/src/cuda/cat.cu | 80 +++++++++++++++++++++ graphbolt/src/cuda/expand_indptr.cu | 36 +--------- graphbolt/src/cuda/expand_indptr.cuh | 59 +++++++++++++++ graphbolt/src/python_binding.cc | 7 ++ python/dgl/graphbolt/base.py | 38 ++++++++++ tests/python/pytorch/graphbolt/test_base.py | 30 ++++++++ 8 files changed, 266 insertions(+), 35 deletions(-) create mode 100644 graphbolt/src/cat.cc create mode 100644 graphbolt/src/cuda/cat.cu create mode 100644 graphbolt/src/cuda/expand_indptr.cuh diff --git a/graphbolt/include/graphbolt/cuda_ops.h b/graphbolt/include/graphbolt/cuda_ops.h index 07feaeb6b5a2..0ca98a8cef83 100644 --- a/graphbolt/include/graphbolt/cuda_ops.h +++ b/graphbolt/include/graphbolt/cuda_ops.h @@ -27,6 +27,16 @@ namespace graphbolt { namespace ops { +/** + * @brief Is equivalent to `torch.cat(tensors, dim=0)`. Works only for + * contiguous tensors. + * + * @param tensors A vector of tensors to be concatenated. + * + * @return torch::cat(tensors, 0). + */ +torch::Tensor CatImpl(const std::vector& tensors); + /** * @brief Sorts the given input and optionally returns the original indexes. * diff --git a/graphbolt/src/cat.cc b/graphbolt/src/cat.cc new file mode 100644 index 000000000000..4a1ece6dab7f --- /dev/null +++ b/graphbolt/src/cat.cc @@ -0,0 +1,41 @@ +/** + * Copyright (c) 2023 by Contributors + * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) + * @file expand_indptr.cc + * @brief ExpandIndptr operators. + */ +#include +#include + +#include "./macro.h" +#include "./utils.h" + +namespace graphbolt { +namespace ops { + +torch::Tensor Cat(const std::vector& tensors) { + bool all_on_gpu = true; + for (const auto& tensor : tensors) { + all_on_gpu = all_on_gpu && utils::is_on_gpu(tensor); + if (!all_on_gpu) break; + } + if (all_on_gpu) { + GRAPHBOLT_DISPATCH_CUDA_ONLY_DEVICE( + c10::DeviceType::CUDA, "unique_and_compact", + { return ops::CatImpl(tensors); }); + } + return torch::cat(tensors, 0); +} + +TORCH_LIBRARY_IMPL(graphbolt, CPU, m) { m.impl("cat", &Cat); } + +#ifdef GRAPHBOLT_USE_CUDA +TORCH_LIBRARY_IMPL(graphbolt, CUDA, m) { m.impl("cat", &CatImpl); } +#endif + +TORCH_LIBRARY_IMPL(graphbolt, Autograd, m) { + m.impl("cat", torch::autograd::autogradNotImplementedFallback()); +} + +} // namespace ops +} // namespace graphbolt diff --git a/graphbolt/src/cuda/cat.cu b/graphbolt/src/cuda/cat.cu new file mode 100644 index 000000000000..6b58c9b0e3ad --- /dev/null +++ b/graphbolt/src/cuda/cat.cu @@ -0,0 +1,80 @@ +/** + * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file cuda/expand_indptr.cu + * @brief ExpandIndptr operator implementation on CUDA. + */ +#include +#include + +#include "./common.h" +#include "./expand_indptr.cuh" + +namespace graphbolt { +namespace ops { + +torch::Tensor CatImpl(const std::vector& tensors) { + const int64_t num_batches = tensors.size(); + const int64_t original_feature_size = std::accumulate( + tensors.at(0).sizes().begin() + 1, tensors.at(0).sizes().end(), + tensors.at(0).element_size(), std::multiplies<>()); + auto pointers_and_offsets = torch::empty( + num_batches * 2 + 1, + c10::TensorOptions().dtype(torch::kInt64).pinned_memory(true)); + auto pointers_ptr = + reinterpret_cast(pointers_and_offsets.data_ptr()); + auto offsets_ptr = pointers_and_offsets.data_ptr() + num_batches; + int64_t i = 0; + offsets_ptr[0] = 0; + for (const auto& tensor : tensors) { + pointers_ptr[i++] = reinterpret_cast(tensor.data_ptr()); + offsets_ptr[i] = + offsets_ptr[i - 1] + tensor.size(0) * original_feature_size; + } + auto pointers_and_offsets_dev = torch::empty_like( + pointers_and_offsets, + tensors[0].options().dtype(pointers_and_offsets.scalar_type())); + CUDA_CALL(cudaMemcpyAsync( + pointers_and_offsets_dev.data_ptr(), pointers_ptr, + sizeof(int64_t) * pointers_and_offsets.numel(), cudaMemcpyHostToDevice, + cuda::GetCurrentStream())); + auto shape = tensors[0].sizes().vec(); + shape[0] = offsets_ptr[num_batches] / original_feature_size; + auto output = torch::empty(shape, tensors[0].options()); + auto output_ptr = reinterpret_cast(output.data_ptr()); + + pointers_ptr = + reinterpret_cast(pointers_and_offsets_dev.data_ptr()); + offsets_ptr = pointers_and_offsets_dev.data_ptr() + num_batches; + + thrust::counting_iterator iota(0); + auto output_buffer = thrust::make_transform_iterator( + iota, OutputBufferIndexer{offsets_ptr, output_ptr}); + auto buffer_sizes = thrust::make_transform_iterator( + iota, AdjacentDifference{offsets_ptr}); + + constexpr int64_t max_copy_at_once = std::numeric_limits::max(); + + for (int64_t i = 0; i < num_batches; i += max_copy_at_once) { + CUB_CALL( + DeviceMemcpy::Batched, pointers_ptr + i, output_buffer + i, + buffer_sizes + i, std::min(num_batches - i, max_copy_at_once)); + } + return output; +} + +} // namespace ops +} // namespace graphbolt diff --git a/graphbolt/src/cuda/expand_indptr.cu b/graphbolt/src/cuda/expand_indptr.cu index ebf007a67c3c..e068d71fc85f 100644 --- a/graphbolt/src/cuda/expand_indptr.cu +++ b/graphbolt/src/cuda/expand_indptr.cu @@ -17,49 +17,15 @@ * @file cuda/expand_indptr.cu * @brief ExpandIndptr operator implementation on CUDA. */ -#include -#include -#include - #include #include #include "./common.h" +#include "./expand_indptr.cuh" namespace graphbolt { namespace ops { -template -struct RepeatIndex { - const nodes_t* nodes; - __host__ __device__ auto operator()(indices_t i) { - return thrust::make_constant_iterator(nodes ? nodes[i] : i); - } -}; - -template -struct IotaIndex { - const nodes_t* nodes; - __host__ __device__ auto operator()(indices_t i) { - return thrust::make_counting_iterator(nodes ? nodes[i] : 0); - } -}; - -template -struct OutputBufferIndexer { - const indptr_t* indptr; - indices_t* buffer; - __host__ __device__ auto operator()(int64_t i) { return buffer + indptr[i]; } -}; - -template -struct AdjacentDifference { - const indptr_t* indptr; - __host__ __device__ auto operator()(int64_t i) { - return indptr[i + 1] - indptr[i]; - } -}; - torch::Tensor ExpandIndptrImpl( torch::Tensor indptr, torch::ScalarType dtype, torch::optional nodes, torch::optional output_size, diff --git a/graphbolt/src/cuda/expand_indptr.cuh b/graphbolt/src/cuda/expand_indptr.cuh new file mode 100644 index 000000000000..9252a1cbf0d5 --- /dev/null +++ b/graphbolt/src/cuda/expand_indptr.cuh @@ -0,0 +1,59 @@ +/** + * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file cuda/expand_indptr.cuh + * @brief ExpandIndptr helper class implementations on CUDA. + */ +#include +#include +#include + +namespace graphbolt { +namespace ops { + +template +struct RepeatIndex { + const nodes_t* nodes; + __host__ __device__ auto operator()(indices_t i) { + return thrust::make_constant_iterator(nodes ? nodes[i] : i); + } +}; + +template +struct IotaIndex { + const nodes_t* nodes; + __host__ __device__ auto operator()(indices_t i) { + return thrust::make_counting_iterator(nodes ? nodes[i] : 0); + } +}; + +template +struct OutputBufferIndexer { + const indptr_t* indptr; + indices_t* buffer; + __host__ __device__ auto operator()(int64_t i) { return buffer + indptr[i]; } +}; + +template +struct AdjacentDifference { + const indptr_t* indptr; + __host__ __device__ auto operator()(int64_t i) { + return indptr[i + 1] - indptr[i]; + } +}; + +} // namespace ops +} // namespace graphbolt diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index 20c6d59be5d5..e8076ee664b7 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -216,6 +216,13 @@ TORCH_LIBRARY(graphbolt, m) { #ifdef HAS_PT2_COMPLIANT_TAG , {at::Tag::pt2_compliant_tag} +#endif + ); + m.def( + "cat(Tensor[] tensors) -> Tensor" +#ifdef HAS_PT2_COMPLIANT_TAG + , + {at::Tag::pt2_compliant_tag} #endif ); } diff --git a/python/dgl/graphbolt/base.py b/python/dgl/graphbolt/base.py index f0fa3b476619..b722fb41d070 100644 --- a/python/dgl/graphbolt/base.py +++ b/python/dgl/graphbolt/base.py @@ -36,6 +36,7 @@ "EndMarker", "isin", "index_select", + "cat", "expand_indptr", "indptr_edge_ids", "CSCFormatBase", @@ -98,6 +99,43 @@ def isin(elements, test_elements): return torch.ops.graphbolt.isin(elements, test_elements) +if TorchVersion(torch.__version__) >= TorchVersion("2.2.0a0"): + + torch_fake_decorator = ( + torch.library.impl_abstract + if TorchVersion(torch.__version__) < TorchVersion("2.4.0a0") + else torch.library.register_fake + ) + + @torch_fake_decorator("graphbolt::cat") + def cat_fake(tensors): + """Fake implementation of cat for torch.compile() support.""" + size_0 = sum(t.size(0) for t in tensors) + return tensors[0].new_empty((size_0,) + tensors[0].shape[1:]) + + +def cat(tensors): + """Concatenates the given tensors along the first dimension. + + This is equivalent to + + .. code:: python + + return torch.cat(tensors, dim=0) + + Parameters + ---------- + tensors : List[torch.Tensor] + A list of tensors to be concatenated + + Returns + ------- + torch.Tensor + The concatenated tensors. + """ + return torch.ops.graphbolt.cat(tensors) + + if TorchVersion(torch.__version__) >= TorchVersion("2.2.0a0"): torch_fake_decorator = ( diff --git a/tests/python/pytorch/graphbolt/test_base.py b/tests/python/pytorch/graphbolt/test_base.py index 0b058a48ef44..f2eb67dcfadd 100644 --- a/tests/python/pytorch/graphbolt/test_base.py +++ b/tests/python/pytorch/graphbolt/test_base.py @@ -392,6 +392,36 @@ def test_indptr_edge_ids(offset, dtype): assert explanation.graph_break_count == expected_breaks + +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64]) +@pytest.mark.parametrize("shape", [tuple(), (13, 17), (5,)]) +def test_cat(dtype, shape): + tensors = [torch.randn((i,) + shape, dtype=dtype, device=F.ctx()) for i in [10, 21, 1]] + torch_result = torch.cat(tensors, dim=0) + gb_result = gb.cat(tensors) + assert torch.equal(torch_result, gb_result) + + if TorchVersion(torch.__version__) >= TorchVersion("2.2.0a0"): + import torch._dynamo as dynamo + from torch.testing._internal.optests import opcheck + + # Tests torch.compile compatibility + opcheck( + torch.ops.graphbolt.cat, + (tensors,), + test_utils=[ + "test_schema", + "test_autograd_registration", + "test_faketensor", + "test_aot_dispatch_dynamic", + ], + raise_exception=True, + ) + + explanation = dynamo.explain(gb.cat)(tensors) + assert explanation.graph_break_count == 0 + + def test_csc_format_base_representation(): csc_format_base = gb.CSCFormatBase( indptr=torch.tensor([0, 2, 4]), From 0f2f914d6db0151c8a68a0abcfcd4ad64e7b0619 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih Balin Date: Fri, 6 Sep 2024 05:29:20 -0400 Subject: [PATCH 2/5] use correct license. --- graphbolt/src/cat.cc | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/graphbolt/src/cat.cc b/graphbolt/src/cat.cc index 4a1ece6dab7f..e565b7f822ea 100644 --- a/graphbolt/src/cat.cc +++ b/graphbolt/src/cat.cc @@ -1,8 +1,21 @@ /** - * Copyright (c) 2023 by Contributors - * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) - * @file expand_indptr.cc - * @brief ExpandIndptr operators. + * Copyright (c) 2024, mfbalin (Muhammed Fatih Balin) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file cat.cc + * @brief Concatenation operation. */ #include #include From 105d79d73d0700da995f8d1e1cb5fff5c7dc96e1 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih Balin Date: Fri, 6 Sep 2024 05:30:02 -0400 Subject: [PATCH 3/5] correct header of the file. --- graphbolt/src/cuda/cat.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphbolt/src/cuda/cat.cu b/graphbolt/src/cuda/cat.cu index 6b58c9b0e3ad..e4552ea94719 100644 --- a/graphbolt/src/cuda/cat.cu +++ b/graphbolt/src/cuda/cat.cu @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * @file cuda/expand_indptr.cu + * @file cuda/cat.cu * @brief ExpandIndptr operator implementation on CUDA. */ #include From aeca7b8677243b860558b3780db1dd7546055914 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih Balin Date: Fri, 6 Sep 2024 05:31:38 -0400 Subject: [PATCH 4/5] linting --- tests/python/pytorch/graphbolt/test_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/python/pytorch/graphbolt/test_base.py b/tests/python/pytorch/graphbolt/test_base.py index f2eb67dcfadd..834c21e13839 100644 --- a/tests/python/pytorch/graphbolt/test_base.py +++ b/tests/python/pytorch/graphbolt/test_base.py @@ -396,7 +396,10 @@ def test_indptr_edge_ids(offset, dtype): @pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64]) @pytest.mark.parametrize("shape", [tuple(), (13, 17), (5,)]) def test_cat(dtype, shape): - tensors = [torch.randn((i,) + shape, dtype=dtype, device=F.ctx()) for i in [10, 21, 1]] + tensors = [ + torch.randn((i,) + shape, dtype=dtype, device=F.ctx()) + for i in [10, 21, 1] + ] torch_result = torch.cat(tensors, dim=0) gb_result = gb.cat(tensors) assert torch.equal(torch_result, gb_result) From 66b893dfb675662ccc377167c3cbfb747a80a8f6 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih Balin Date: Fri, 6 Sep 2024 05:32:41 -0400 Subject: [PATCH 5/5] litning --- tests/python/pytorch/graphbolt/test_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/pytorch/graphbolt/test_base.py b/tests/python/pytorch/graphbolt/test_base.py index 834c21e13839..894d26ee942e 100644 --- a/tests/python/pytorch/graphbolt/test_base.py +++ b/tests/python/pytorch/graphbolt/test_base.py @@ -392,7 +392,6 @@ def test_indptr_edge_ids(offset, dtype): assert explanation.graph_break_count == expected_breaks - @pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64]) @pytest.mark.parametrize("shape", [tuple(), (13, 17), (5,)]) def test_cat(dtype, shape):