From f9dc05fc27de9a959ad88cde9653b3255c732065 Mon Sep 17 00:00:00 2001
From: Muhammed Fatih Balin <m.f.balin@gmail.com>
Date: Fri, 6 Sep 2024 05:27:20 -0400
Subject: [PATCH 1/5] [GraphBolt][CUDA] `gb.cat`.

---
 graphbolt/include/graphbolt/cuda_ops.h      | 10 +++
 graphbolt/src/cat.cc                        | 41 +++++++++++
 graphbolt/src/cuda/cat.cu                   | 80 +++++++++++++++++++++
 graphbolt/src/cuda/expand_indptr.cu         | 36 +---------
 graphbolt/src/cuda/expand_indptr.cuh        | 59 +++++++++++++++
 graphbolt/src/python_binding.cc             |  7 ++
 python/dgl/graphbolt/base.py                | 38 ++++++++++
 tests/python/pytorch/graphbolt/test_base.py | 30 ++++++++
 8 files changed, 266 insertions(+), 35 deletions(-)
 create mode 100644 graphbolt/src/cat.cc
 create mode 100644 graphbolt/src/cuda/cat.cu
 create mode 100644 graphbolt/src/cuda/expand_indptr.cuh
diff --git a/graphbolt/include/graphbolt/cuda_ops.h b/graphbolt/include/graphbolt/cuda_ops.h
index 07feaeb6b5a2..0ca98a8cef83 100644
--- a/graphbolt/include/graphbolt/cuda_ops.h
+++ b/graphbolt/include/graphbolt/cuda_ops.h
@@ -27,6 +27,16 @@
 namespace graphbolt {
 namespace ops {
 
+/**
+ * @brief Is equivalent to `torch.cat(tensors, dim=0)`. Works only for
+ * contiguous tensors.
+ *
+ * @param tensors  A vector of tensors to be concatenated.
+ *
+ * @return torch::cat(tensors, 0).
+ */
+torch::Tensor CatImpl(const std::vector<torch::Tensor>& tensors);
+
 /**
  * @brief Sorts the given input and optionally returns the original indexes.
  *
diff --git a/graphbolt/src/cat.cc b/graphbolt/src/cat.cc
new file mode 100644
index 000000000000..4a1ece6dab7f
--- /dev/null
+++ b/graphbolt/src/cat.cc
@@ -0,0 +1,41 @@
+/**
+ *  Copyright (c) 2023 by Contributors
+ *  Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
+ * @file expand_indptr.cc
+ * @brief ExpandIndptr operators.
+ */
+#include <graphbolt/cuda_ops.h>
+#include <torch/autograd.h>
+
+#include "./macro.h"
+#include "./utils.h"
+
+namespace graphbolt {
+namespace ops {
+
+torch::Tensor Cat(const std::vector<torch::Tensor>& tensors) {
+  bool all_on_gpu = true;
+  for (const auto& tensor : tensors) {
+    all_on_gpu = all_on_gpu && utils::is_on_gpu(tensor);
+    if (!all_on_gpu) break;
+  }
+  if (all_on_gpu) {
+    GRAPHBOLT_DISPATCH_CUDA_ONLY_DEVICE(
+        c10::DeviceType::CUDA, "unique_and_compact",
+        { return ops::CatImpl(tensors); });
+  }
+  return torch::cat(tensors, 0);
+}
+
+TORCH_LIBRARY_IMPL(graphbolt, CPU, m) { m.impl("cat", &Cat); }
+
+#ifdef GRAPHBOLT_USE_CUDA
+TORCH_LIBRARY_IMPL(graphbolt, CUDA, m) { m.impl("cat", &CatImpl); }
+#endif
+
+TORCH_LIBRARY_IMPL(graphbolt, Autograd, m) {
+  m.impl("cat", torch::autograd::autogradNotImplementedFallback());
+}
+
+}  // namespace ops
+}  // namespace graphbolt
diff --git a/graphbolt/src/cuda/cat.cu b/graphbolt/src/cuda/cat.cu
new file mode 100644
index 000000000000..6b58c9b0e3ad
--- /dev/null
+++ b/graphbolt/src/cuda/cat.cu
@@ -0,0 +1,80 @@
+/**
+ *   Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
+ *   All rights reserved.
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ *
+ * @file cuda/expand_indptr.cu
+ * @brief ExpandIndptr operator implementation on CUDA.
+ */
+#include <cub/cub.cuh>
+#include <limits>
+
+#include "./common.h"
+#include "./expand_indptr.cuh"
+
+namespace graphbolt {
+namespace ops {
+
+torch::Tensor CatImpl(const std::vector<torch::Tensor>& tensors) {
+  const int64_t num_batches = tensors.size();
+  const int64_t original_feature_size = std::accumulate(
+      tensors.at(0).sizes().begin() + 1, tensors.at(0).sizes().end(),
+      tensors.at(0).element_size(), std::multiplies<>());
+  auto pointers_and_offsets = torch::empty(
+      num_batches * 2 + 1,
+      c10::TensorOptions().dtype(torch::kInt64).pinned_memory(true));
+  auto pointers_ptr =
+      reinterpret_cast<std::byte**>(pointers_and_offsets.data_ptr());
+  auto offsets_ptr = pointers_and_offsets.data_ptr<int64_t>() + num_batches;
+  int64_t i = 0;
+  offsets_ptr[0] = 0;
+  for (const auto& tensor : tensors) {
+    pointers_ptr[i++] = reinterpret_cast<std::byte*>(tensor.data_ptr());
+    offsets_ptr[i] =
+        offsets_ptr[i - 1] + tensor.size(0) * original_feature_size;
+  }
+  auto pointers_and_offsets_dev = torch::empty_like(
+      pointers_and_offsets,
+      tensors[0].options().dtype(pointers_and_offsets.scalar_type()));
+  CUDA_CALL(cudaMemcpyAsync(
+      pointers_and_offsets_dev.data_ptr<int64_t>(), pointers_ptr,
+      sizeof(int64_t) * pointers_and_offsets.numel(), cudaMemcpyHostToDevice,
+      cuda::GetCurrentStream()));
+  auto shape = tensors[0].sizes().vec();
+  shape[0] = offsets_ptr[num_batches] / original_feature_size;
+  auto output = torch::empty(shape, tensors[0].options());
+  auto output_ptr = reinterpret_cast<std::byte*>(output.data_ptr());
+
+  pointers_ptr =
+      reinterpret_cast<std::byte**>(pointers_and_offsets_dev.data_ptr());
+  offsets_ptr = pointers_and_offsets_dev.data_ptr<int64_t>() + num_batches;
+
+  thrust::counting_iterator<int64_t> iota(0);
+  auto output_buffer = thrust::make_transform_iterator(
+      iota, OutputBufferIndexer<int64_t, std::byte>{offsets_ptr, output_ptr});
+  auto buffer_sizes = thrust::make_transform_iterator(
+      iota, AdjacentDifference<int64_t>{offsets_ptr});
+
+  constexpr int64_t max_copy_at_once = std::numeric_limits<int32_t>::max();
+
+  for (int64_t i = 0; i < num_batches; i += max_copy_at_once) {
+    CUB_CALL(
+        DeviceMemcpy::Batched, pointers_ptr + i, output_buffer + i,
+        buffer_sizes + i, std::min(num_batches - i, max_copy_at_once));
+  }
+  return output;
+}
+
+}  // namespace ops
+}  // namespace graphbolt
diff --git a/graphbolt/src/cuda/expand_indptr.cu b/graphbolt/src/cuda/expand_indptr.cu
index ebf007a67c3c..e068d71fc85f 100644
--- a/graphbolt/src/cuda/expand_indptr.cu
+++ b/graphbolt/src/cuda/expand_indptr.cu
@@ -17,49 +17,15 @@
  * @file cuda/expand_indptr.cu
  * @brief ExpandIndptr operator implementation on CUDA.
  */
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-
 #include <cub/cub.cuh>
 #include <limits>
 
 #include "./common.h"
+#include "./expand_indptr.cuh"
 
 namespace graphbolt {
 namespace ops {
 
-template <typename indices_t, typename nodes_t>
-struct RepeatIndex {
-  const nodes_t* nodes;
-  __host__ __device__ auto operator()(indices_t i) {
-    return thrust::make_constant_iterator(nodes ? nodes[i] : i);
-  }
-};
-
-template <typename indices_t, typename nodes_t>
-struct IotaIndex {
-  const nodes_t* nodes;
-  __host__ __device__ auto operator()(indices_t i) {
-    return thrust::make_counting_iterator(nodes ? nodes[i] : 0);
-  }
-};
-
-template <typename indptr_t, typename indices_t>
-struct OutputBufferIndexer {
-  const indptr_t* indptr;
-  indices_t* buffer;
-  __host__ __device__ auto operator()(int64_t i) { return buffer + indptr[i]; }
-};
-
-template <typename indptr_t>
-struct AdjacentDifference {
-  const indptr_t* indptr;
-  __host__ __device__ auto operator()(int64_t i) {
-    return indptr[i + 1] - indptr[i];
-  }
-};
-
 torch::Tensor ExpandIndptrImpl(
     torch::Tensor indptr, torch::ScalarType dtype,
     torch::optional<torch::Tensor> nodes, torch::optional<int64_t> output_size,
diff --git a/graphbolt/src/cuda/expand_indptr.cuh b/graphbolt/src/cuda/expand_indptr.cuh
new file mode 100644
index 000000000000..9252a1cbf0d5
--- /dev/null
+++ b/graphbolt/src/cuda/expand_indptr.cuh
@@ -0,0 +1,59 @@
+/**
+ *   Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
+ *   All rights reserved.
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ *
+ * @file cuda/expand_indptr.cuh
+ * @brief ExpandIndptr helper class implementations on CUDA.
+ */
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
+namespace graphbolt {
+namespace ops {
+
+template <typename indices_t, typename nodes_t>
+struct RepeatIndex {
+  const nodes_t* nodes;
+  __host__ __device__ auto operator()(indices_t i) {
+    return thrust::make_constant_iterator(nodes ? nodes[i] : i);
+  }
+};
+
+template <typename indices_t, typename nodes_t>
+struct IotaIndex {
+  const nodes_t* nodes;
+  __host__ __device__ auto operator()(indices_t i) {
+    return thrust::make_counting_iterator(nodes ? nodes[i] : 0);
+  }
+};
+
+template <typename indptr_t, typename indices_t>
+struct OutputBufferIndexer {
+  const indptr_t* indptr;
+  indices_t* buffer;
+  __host__ __device__ auto operator()(int64_t i) { return buffer + indptr[i]; }
+};
+
+template <typename indptr_t>
+struct AdjacentDifference {
+  const indptr_t* indptr;
+  __host__ __device__ auto operator()(int64_t i) {
+    return indptr[i + 1] - indptr[i];
+  }
+};
+
+}  // namespace ops
+}  // namespace graphbolt
diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc
index 20c6d59be5d5..e8076ee664b7 100644
--- a/graphbolt/src/python_binding.cc
+++ b/graphbolt/src/python_binding.cc
@@ -216,6 +216,13 @@ TORCH_LIBRARY(graphbolt, m) {
 #ifdef HAS_PT2_COMPLIANT_TAG
       ,
       {at::Tag::pt2_compliant_tag}
+#endif
+  );
+  m.def(
+      "cat(Tensor[] tensors) -> Tensor"
+#ifdef HAS_PT2_COMPLIANT_TAG
+      ,
+      {at::Tag::pt2_compliant_tag}
 #endif
   );
 }
diff --git a/python/dgl/graphbolt/base.py b/python/dgl/graphbolt/base.py
index f0fa3b476619..b722fb41d070 100644
--- a/python/dgl/graphbolt/base.py
+++ b/python/dgl/graphbolt/base.py
@@ -36,6 +36,7 @@
     "EndMarker",
     "isin",
     "index_select",
+    "cat",
     "expand_indptr",
     "indptr_edge_ids",
     "CSCFormatBase",
@@ -98,6 +99,43 @@ def isin(elements, test_elements):
     return torch.ops.graphbolt.isin(elements, test_elements)
 
 
+if TorchVersion(torch.__version__) >= TorchVersion("2.2.0a0"):
+
+    torch_fake_decorator = (
+        torch.library.impl_abstract
+        if TorchVersion(torch.__version__) < TorchVersion("2.4.0a0")
+        else torch.library.register_fake
+    )
+
+    @torch_fake_decorator("graphbolt::cat")
+    def cat_fake(tensors):
+        """Fake implementation of cat for torch.compile() support."""
+        size_0 = sum(t.size(0) for t in tensors)
+        return tensors[0].new_empty((size_0,) + tensors[0].shape[1:])
+
+
+def cat(tensors):
+    """Concatenates the given tensors along the first dimension.
+
+    This is equivalent to
+
+    .. code:: python
+
+       return torch.cat(tensors, dim=0)
+
+    Parameters
+    ----------
+    tensors : List[torch.Tensor]
+        A list of tensors to be concatenated
+
+    Returns
+    -------
+    torch.Tensor
+        The concatenated tensors.
+    """
+    return torch.ops.graphbolt.cat(tensors)
+
+
 if TorchVersion(torch.__version__) >= TorchVersion("2.2.0a0"):
 
     torch_fake_decorator = (
diff --git a/tests/python/pytorch/graphbolt/test_base.py b/tests/python/pytorch/graphbolt/test_base.py
index 0b058a48ef44..f2eb67dcfadd 100644
--- a/tests/python/pytorch/graphbolt/test_base.py
+++ b/tests/python/pytorch/graphbolt/test_base.py
@@ -392,6 +392,36 @@ def test_indptr_edge_ids(offset, dtype):
             assert explanation.graph_break_count == expected_breaks
 
 
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64])
+@pytest.mark.parametrize("shape", [tuple(), (13, 17), (5,)])
+def test_cat(dtype, shape):
+    tensors = [torch.randn((i,) + shape, dtype=dtype, device=F.ctx()) for i in [10, 21, 1]]
+    torch_result = torch.cat(tensors, dim=0)
+    gb_result = gb.cat(tensors)
+    assert torch.equal(torch_result, gb_result)
+
+    if TorchVersion(torch.__version__) >= TorchVersion("2.2.0a0"):
+        import torch._dynamo as dynamo
+        from torch.testing._internal.optests import opcheck
+
+        # Tests torch.compile compatibility
+        opcheck(
+            torch.ops.graphbolt.cat,
+            (tensors,),
+            test_utils=[
+                "test_schema",
+                "test_autograd_registration",
+                "test_faketensor",
+                "test_aot_dispatch_dynamic",
+            ],
+            raise_exception=True,
+        )
+
+        explanation = dynamo.explain(gb.cat)(tensors)
+        assert explanation.graph_break_count == 0
+
+
 def test_csc_format_base_representation():
     csc_format_base = gb.CSCFormatBase(
         indptr=torch.tensor([0, 2, 4]),

From 0f2f914d6db0151c8a68a0abcfcd4ad64e7b0619 Mon Sep 17 00:00:00 2001
From: Muhammed Fatih Balin <m.f.balin@gmail.com>
Date: Fri, 6 Sep 2024 05:29:20 -0400
Subject: [PATCH 2/5] use correct license.

---
 graphbolt/src/cat.cc | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/graphbolt/src/cat.cc b/graphbolt/src/cat.cc
index 4a1ece6dab7f..e565b7f822ea 100644
--- a/graphbolt/src/cat.cc
+++ b/graphbolt/src/cat.cc
@@ -1,8 +1,21 @@
 /**
- *  Copyright (c) 2023 by Contributors
- *  Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
- * @file expand_indptr.cc
- * @brief ExpandIndptr operators.
+ *   Copyright (c) 2024, mfbalin (Muhammed Fatih Balin)
+ *   All rights reserved.
+ *
+ *   Licensed under the Apache License, Version 2.0 (the "License");
+ *   you may not use this file except in compliance with the License.
+ *   You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ *
+ * @file cat.cc
+ * @brief Concatenation operation.
  */
 #include <graphbolt/cuda_ops.h>
 #include <torch/autograd.h>

From 105d79d73d0700da995f8d1e1cb5fff5c7dc96e1 Mon Sep 17 00:00:00 2001
From: Muhammed Fatih Balin <m.f.balin@gmail.com>
Date: Fri, 6 Sep 2024 05:30:02 -0400
Subject: [PATCH 3/5] correct header of the file.

---
 graphbolt/src/cuda/cat.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphbolt/src/cuda/cat.cu b/graphbolt/src/cuda/cat.cu
index 6b58c9b0e3ad..e4552ea94719 100644
--- a/graphbolt/src/cuda/cat.cu
+++ b/graphbolt/src/cuda/cat.cu
@@ -14,7 +14,7 @@
  *   See the License for the specific language governing permissions and
  *   limitations under the License.
  *
- * @file cuda/expand_indptr.cu
+ * @file cuda/cat.cu
  * @brief ExpandIndptr operator implementation on CUDA.
  */
 #include <cub/cub.cuh>

From aeca7b8677243b860558b3780db1dd7546055914 Mon Sep 17 00:00:00 2001
From: Muhammed Fatih Balin <m.f.balin@gmail.com>
Date: Fri, 6 Sep 2024 05:31:38 -0400
Subject: [PATCH 4/5] linting

---
 tests/python/pytorch/graphbolt/test_base.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/python/pytorch/graphbolt/test_base.py b/tests/python/pytorch/graphbolt/test_base.py
index f2eb67dcfadd..834c21e13839 100644
--- a/tests/python/pytorch/graphbolt/test_base.py
+++ b/tests/python/pytorch/graphbolt/test_base.py
@@ -396,7 +396,10 @@ def test_indptr_edge_ids(offset, dtype):
 @pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64])
 @pytest.mark.parametrize("shape", [tuple(), (13, 17), (5,)])
 def test_cat(dtype, shape):
-    tensors = [torch.randn((i,) + shape, dtype=dtype, device=F.ctx()) for i in [10, 21, 1]]
+    tensors = [
+        torch.randn((i,) + shape, dtype=dtype, device=F.ctx())
+        for i in [10, 21, 1]
+    ]
     torch_result = torch.cat(tensors, dim=0)
     gb_result = gb.cat(tensors)
     assert torch.equal(torch_result, gb_result)

From 66b893dfb675662ccc377167c3cbfb747a80a8f6 Mon Sep 17 00:00:00 2001
From: Muhammed Fatih Balin <m.f.balin@gmail.com>
Date: Fri, 6 Sep 2024 05:32:41 -0400
Subject: [PATCH 5/5] litning

---
 tests/python/pytorch/graphbolt/test_base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/python/pytorch/graphbolt/test_base.py b/tests/python/pytorch/graphbolt/test_base.py
index 834c21e13839..894d26ee942e 100644
--- a/tests/python/pytorch/graphbolt/test_base.py
+++ b/tests/python/pytorch/graphbolt/test_base.py
@@ -392,7 +392,6 @@ def test_indptr_edge_ids(offset, dtype):
             assert explanation.graph_break_count == expected_breaks
 
 
-
 @pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64])
 @pytest.mark.parametrize("shape", [tuple(), (13, 17), (5,)])
 def test_cat(dtype, shape):