From 0978ce6571e165b010e2eadf9695f7580548af0f Mon Sep 17 00:00:00 2001 From: "Hongzhi (Steve), Chen" Date: Thu, 8 Aug 2024 13:01:30 +0800 Subject: [PATCH 01/78] Reorder dependency list in setup (#7672) Co-authored-by: Ubuntu --- python/setup.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/setup.py b/python/setup.py index 0f7fcac9b89f..bba00b39aad7 100644 --- a/python/setup.py +++ b/python/setup.py @@ -220,16 +220,16 @@ def get_lib_file_path(lib_name, backend=""): # Configure dependencies. install_requires = [ - "numpy>=1.14.0", - "scipy>=1.1.0", "networkx>=2.1", - "requests>=2.19.0", - "tqdm", - "psutil>=5.8.0", - "pandas", + "numpy>=1.14.0", "packaging", - "pyyaml", + "pandas", + "psutil>=5.8.0", "pydantic>=2.0", + "pyyaml", + "requests>=2.19.0", + "scipy>=1.1.0", + "tqdm", ] setup( From 7aa8a5029c877b32646d1f20c26cb27728fc1aa7 Mon Sep 17 00:00:00 2001 From: Andrei Ivanov <32910461+drivanov@users.noreply.github.com> Date: Wed, 7 Aug 2024 22:04:33 -0700 Subject: [PATCH 02/78] [Bug] Fixing Bug in `test_change_etype_to_canonical_etype.py::test_hetero_graph` Test. (#7622) --- tools/distpartitioning/data_shuffle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/distpartitioning/data_shuffle.py b/tools/distpartitioning/data_shuffle.py index e85bff5ecc4c..7cba2cbeecda 100644 --- a/tools/distpartitioning/data_shuffle.py +++ b/tools/distpartitioning/data_shuffle.py @@ -472,8 +472,8 @@ def exchange_feature( ) # exchange actual data here. - logging.debug(f"Rank: {rank} {featdata_key.shape=}") if featdata_key is not None: + logging.debug(f"Rank: {rank} {featdata_key.shape=}") feat_dims_dtype = list(featdata_key.shape) assert ( len(featdata_key.shape) == 2 or len(featdata_key.shape) == 1 From 83595e65880c1d819c9b28bf08ce3e6d10313d19 Mon Sep 17 00:00:00 2001 From: Andrei Ivanov <32910461+drivanov@users.noreply.github.com> Date: Wed, 7 Aug 2024 22:06:53 -0700 Subject: [PATCH 03/78] [WARNINGS] Removing warnings appearing in several `distributed` tests. 
(#7627) --- tests/distributed/test_distributed_sampling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/distributed/test_distributed_sampling.py b/tests/distributed/test_distributed_sampling.py index 5aabb4a9defc..4ca8f7b130ac 100644 --- a/tests/distributed/test_distributed_sampling.py +++ b/tests/distributed/test_distributed_sampling.py @@ -512,7 +512,7 @@ def start_hetero_sample_client( assert "feat" not in dist_graph.nodes["n2"].data assert "feat" not in dist_graph.nodes["n3"].data nodes = { - k: torch.tensor(v, dtype=dist_graph.idtype) for k, v in nodes.items() + k: v.type(dist_graph.idtype).clone().detach() for k, v in nodes.items() } if gpb is None: gpb = dist_graph.get_partition_book() @@ -553,7 +553,7 @@ def start_hetero_etype_sample_client( assert "feat" not in dist_graph.nodes["n2"].data assert "feat" not in dist_graph.nodes["n3"].data nodes = { - k: torch.tensor(v, dtype=dist_graph.idtype) for k, v in nodes.items() + k: v.type(dist_graph.idtype).clone().detach() for k, v in nodes.items() } if (not use_graphbolt) and dist_graph.local_partition is not None: @@ -915,7 +915,7 @@ def start_bipartite_sample_client( assert "feat" in dist_graph.nodes["user"].data assert "feat" in dist_graph.nodes["game"].data nodes = { - k: torch.tensor(v, dtype=dist_graph.idtype) for k, v in nodes.items() + k: v.type(dist_graph.idtype).clone().detach() for k, v in nodes.items() } if gpb is None: gpb = dist_graph.get_partition_book() @@ -951,7 +951,7 @@ def start_bipartite_etype_sample_client( assert "feat" in dist_graph.nodes["user"].data assert "feat" in dist_graph.nodes["game"].data nodes = { - k: torch.tensor(v, dtype=dist_graph.idtype) for k, v in nodes.items() + k: v.type(dist_graph.idtype).clone().detach() for k, v in nodes.items() } if not use_graphbolt and dist_graph.local_partition is not None: From 82dc0809f911f1cfb821bb131e9184d1421939a9 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:32:17 +0800 Subject: [PATCH 04/78] [dev] do not import distributed and graphbolt when import dgl (#7676) --- python/dgl/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/dgl/__init__.py b/python/dgl/__init__.py index 7adce476ef29..17066eba0bd9 100644 --- a/python/dgl/__init__.py +++ b/python/dgl/__init__.py @@ -62,6 +62,3 @@ from .mpops import * from .homophily import * from .label_informativeness import * - -if backend_name == "pytorch": - from . import distributed From 00c50af3d782d4cf6a579c173c88b02c743bb466 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:32:33 +0800 Subject: [PATCH 05/78] [Dist] enable partition tests (#7675) --- tests/scripts/task_distributed_test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/scripts/task_distributed_test.sh b/tests/scripts/task_distributed_test.sh index ab6fcf7285e9..975a3729d942 100644 --- a/tests/scripts/task_distributed_test.sh +++ b/tests/scripts/task_distributed_test.sh @@ -34,8 +34,8 @@ export PYTHONUNBUFFERED=1 export OMP_NUM_THREADS=1 export DMLC_LOG_DEBUG=1 -# Tests for distributed are skipped due to glitch @2024.06.27. -#python3 -m pytest -v --capture=tee-sys --junitxml=pytest_distributed.xml --durations=100 tests/distributed/*.py || fail "distributed" +# Tests for distributed except test_partition.py are skipped due to glitch @2024.06.27. 
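The test changes in patch 03 replace torch.tensor(existing_tensor, dtype=...) with an explicit cast followed by clone().detach(), which is the pattern PyTorch itself suggests in the UserWarning the tests were emitting. A minimal, self-contained sketch of the same pattern in plain PyTorch (variable names here are illustrative, not taken from the test):

    import torch

    v = torch.arange(5, dtype=torch.int32)
    # torch.tensor(v, dtype=torch.int64) warns ("To copy construct from a
    # tensor ...") because it wraps an already-existing tensor; casting and
    # then clone().detach() yields the same values without the warning.
    nodes = v.type(torch.int64).clone().detach()
    assert nodes.dtype == torch.int64 and nodes.tolist() == [0, 1, 2, 3, 4]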
+python3 -m pytest -v --capture=tee-sys --junitxml=pytest_distributed.xml --durations=100 tests/distributed/test_partition.py || fail "distributed" # Tests for tools are skipped due to glitch. #PYTHONPATH=tools:tools/distpartitioning:$PYTHONPATH python3 -m pytest -v --capture=tee-sys --junitxml=pytest_tools.xml --durations=100 tests/tools/*.py || fail "tools" From bea7eea9254a9add0187c163e07912b189e65a88 Mon Sep 17 00:00:00 2001 From: Wenxuan Cao <90617523+CfromBU@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:34:57 +0800 Subject: [PATCH 06/78] Update Jenkinsfile (#7674) Co-authored-by: Ubuntu --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8328384e7e11..468721e4f46c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -162,7 +162,7 @@ def is_authorized(name) { 'nv-dlasalle', 'yaox12', 'chang-l', 'Kh4L', 'VibhuJawa', 'kkranen', 'TristonC', 'mfbalin', 'bgawrych', 'itaraban', 'daniil-sizov', 'anko-intel', 'Kacper-Pietkun', - 'hankaj', 'agrabows', 'DominikaJedynak', 'RafLit', + 'hankaj', 'agrabows', 'DominikaJedynak', 'RafLit', 'CfromBU', // Emeritus: 'VoVAllen', ] From 6d55515dcdb815ea7d8d5edd384bb8b55b7018b5 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 9 Aug 2024 08:03:04 -0400 Subject: [PATCH 07/78] [GraphBolt][CUDA][Temporal] Complete implementation. (#7677) --- .../include/graphbolt/cuda_sampling_ops.h | 19 ++- .../graphbolt/fused_csc_sampling_graph.h | 27 ++-- graphbolt/src/cuda/neighbor_sampler.cu | 136 +++++++++++++++--- graphbolt/src/fused_csc_sampling_graph.cc | 93 ++++++++---- .../impl/fused_csc_sampling_graph.py | 102 +++++++------ .../impl/temporal_neighbor_sampler.py | 9 +- python/dgl/graphbolt/internal/sample_utils.py | 11 +- 7 files changed, 288 insertions(+), 109 deletions(-) diff --git a/graphbolt/include/graphbolt/cuda_sampling_ops.h b/graphbolt/include/graphbolt/cuda_sampling_ops.h index 3d22204e2e7f..f9d19fdb0ec1 100644 --- a/graphbolt/include/graphbolt/cuda_sampling_ops.h +++ b/graphbolt/include/graphbolt/cuda_sampling_ops.h @@ -69,6 +69,16 @@ namespace ops { * @param random_seed The random seed for the sampler for layer=True. * @param seed2_contribution The contribution of the second random seed, [0, 1) * for layer=True. + * @param seeds_timestamp The timestamp of the seeds. + * @param seeds_pre_time_window The time window of the seeds represents a period + * of time before `seeds_timestamp`. If provided, only neighbors and related + * edges whose timestamps fall within + * `[seeds_timestamp - seeds_pre_time_window, seeds_timestamp]` will be + * filtered. + * @param node_timestamp An optional tensor that contains the timestamp of nodes + * in the graph. + * @param edge_timestamp An optional tensor that contains the timestamp of edges + * in the graph. * * @return An intrusive pointer to a FusedSampledSubgraph object containing * the sampled graph's information. @@ -87,7 +97,14 @@ c10::intrusive_ptr SampleNeighbors( torch::optional> edge_type_to_id = torch::nullopt, torch::optional random_seed = torch::nullopt, - float seed2_contribution = .0f); + float seed2_contribution = .0f, + // Optional temporal sampling arguments begin. + torch::optional seeds_timestamp = torch::nullopt, + torch::optional seeds_pre_time_window = torch::nullopt, + torch::optional node_timestamp = torch::nullopt, + torch::optional edge_timestamp = torch::nullopt + // Optional temporal sampling arguments end. +); /** * @brief Return the subgraph induced on the inbound edges of the given nodes. 
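The four temporal parameters documented above for ops::SampleNeighbors reduce to a per-candidate eligibility mask. A small CPU sketch of that rule in PyTorch, written to mirror the mask that the CUDA kernel later in this patch folds into the sampling probabilities (strict comparisons, as in the kernel; tensor names are illustrative):

    import torch

    def temporal_mask(seed_ts, neighbor_ts=None, edge_ts=None, window=None):
        # seed_ts is the seed's timestamp broadcast to each candidate edge.
        mask = torch.ones_like(seed_ts, dtype=torch.bool)
        if neighbor_ts is not None:        # node_timestamp was provided
            mask &= neighbor_ts < seed_ts
            if window is not None:         # seeds_pre_time_window was provided
                mask &= neighbor_ts > seed_ts - window
        if edge_ts is not None:            # edge_timestamp was provided
            mask &= edge_ts < seed_ts
            if window is not None:
                mask &= edge_ts > seed_ts - window
        return mask

Candidates whose mask is False end up with probability zero, so they can never be picked.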
diff --git a/graphbolt/include/graphbolt/fused_csc_sampling_graph.h b/graphbolt/include/graphbolt/fused_csc_sampling_graph.h index 652b76889644..6b6b076d5934 100644 --- a/graphbolt/include/graphbolt/fused_csc_sampling_graph.h +++ b/graphbolt/include/graphbolt/fused_csc_sampling_graph.h @@ -368,8 +368,10 @@ class FusedCSCSamplingGraph : public torch::CustomClassHolder { * given, the sampled neighbors or edges of an input node must have a * timestamp that is smaller than that of the input node. * - * @param nodes The nodes from which to sample neighbors. - * @param input_nodes_timestamp The timestamp of the nodes. + * @param seeds The seeds nodes from which to sample neighbors. + * @param seed_offsets The offsets of the given seeds, + * seeds[seed_offsets[i]: seed_offsets[i + 1]] has node type id i. + * @param seeds_timestamp The timestamp of the nodes. * @param fanouts The number of edges to be sampled for each node with or * without considering edge types, following the same rules as in * SampleNeighbors. @@ -379,11 +381,13 @@ class FusedCSCSamplingGraph : public torch::CustomClassHolder { * @param layer Boolean indicating whether neighbors should be sampled in a * layer sampling fashion. Uses the LABOR-0 algorithm to increase overlap of * sampled edges, see arXiv:2210.13339. - * @param input_nodes_pre_time_window The time window of the nodes represents - * a period of time before `input_nodes_timestamp`. If provided, only + * @param returning_indices_is_optional Boolean indicating whether returning + * indices tensor is optional. + * @param seeds_pre_time_window The time window of the seed nodes represents + * a period of time before `seeds_timestamp`. If provided, only * neighbors and related edges whose timestamps fall within - * `[input_nodes_timestamp - input_nodes_pre_time_window, - * input_nodes_timestamp]` will be filtered. + * `[seeds_timestamp - seeds_pre_time_window, seeds_timestamp]` will be + * filtered. * @param probs_or_mask An optional edge attribute tensor for probablities * or masks, following the same rules as in SampleNeighbors. 
* @param node_timestamp_attr_name An optional string specifying the name of @@ -396,10 +400,11 @@ class FusedCSCSamplingGraph : public torch::CustomClassHolder { * */ c10::intrusive_ptr TemporalSampleNeighbors( - const torch::Tensor& input_nodes, - const torch::Tensor& input_nodes_timestamp, - const std::vector& fanouts, bool replace, bool layer, - torch::optional input_nodes_pre_time_window, + const torch::optional& seeds, + const torch::optional>& seed_offsets, + const torch::Tensor& seeds_timestamp, const std::vector& fanouts, + bool replace, bool layer, bool returning_indices_is_optional, + torch::optional seeds_pre_time_window, torch::optional probs_or_mask, torch::optional node_timestamp_attr_name, torch::optional edge_timestamp_attr_name, @@ -442,7 +447,7 @@ class FusedCSCSamplingGraph : public torch::CustomClassHolder { template c10::intrusive_ptr SampleNeighborsImpl( const torch::Tensor& seeds, - torch::optional>& seed_offsets, + const torch::optional>& seed_offsets, const std::vector& fanouts, NumPickFn num_pick_fn, PickFn pick_fn) const; diff --git a/graphbolt/src/cuda/neighbor_sampler.cu b/graphbolt/src/cuda/neighbor_sampler.cu index 76fe26c6d8bd..e2afbe0049ce 100644 --- a/graphbolt/src/cuda/neighbor_sampler.cu +++ b/graphbolt/src/cuda/neighbor_sampler.cu @@ -124,6 +124,8 @@ __global__ void _ComputeRandoms( int64_t i = blockIdx.x * blockDim.x + threadIdx.x; const int stride = gridDim.x * blockDim.x; const auto labor = indices != nullptr; + const float_t inf = + static_cast(std::numeric_limits::infinity()); while (i < num_edges) { const auto row_position = csr_rows[i]; @@ -133,9 +135,8 @@ __global__ void _ComputeRandoms( const auto prob = sliced_weights ? sliced_weights[i] : static_cast(1); const auto exp_rnd = -__logf(rnd); - const float_t adjusted_rnd = prob > 0 - ? static_cast(exp_rnd / prob) - : std::numeric_limits::infinity(); + const float_t adjusted_rnd = + prob > 0 ? static_cast(exp_rnd / prob) : inf; random_arr[i] = adjusted_rnd; edge_ids[i] = row_offset; @@ -208,8 +209,14 @@ c10::intrusive_ptr SampleNeighbors( torch::optional node_type_offset, torch::optional> node_type_to_id, torch::optional> edge_type_to_id, - torch::optional random_seed_tensor, - float seed2_contribution) { + torch::optional random_seed_tensor, float seed2_contribution, + // Optional temporal sampling arguments begin. + torch::optional seeds_timestamp, + torch::optional seeds_pre_time_window, + torch::optional node_timestamp, + torch::optional edge_timestamp + // Optional temporal sampling arguments end. +) { // When seed_offsets.has_value() in the hetero case, we compute the output of // sample_neighbors _convert_to_sampled_subgraph in a fused manner so that // _convert_to_sampled_subgraph only has to perform slices over the returned @@ -238,6 +245,8 @@ c10::intrusive_ptr SampleNeighbors( auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, seeds); auto in_degree = std::get<0>(in_degree_and_sliced_indptr); auto sliced_indptr = std::get<1>(in_degree_and_sliced_indptr); + const auto homo_in_degree = in_degree; + const auto homo_sliced_indptr = sliced_indptr; auto max_in_degree = torch::empty( 1, c10::TensorOptions().dtype(in_degree.scalar_type()).pinned_memory(true)); @@ -287,6 +296,94 @@ c10::intrusive_ptr SampleNeighbors( if (seeds.has_value() && !probs_or_mask.has_value() && fanouts.size() <= 1) { sub_indptr = ExclusiveCumSum(in_degree); } + torch::optional homo_coo_rows; + if (seeds_timestamp.has_value()) { + // Temporal sampling is enabled. 
+ const auto homo_sub_indptr = + fanouts.size() > 1 ? ExclusiveCumSum(homo_in_degree) : sub_indptr; + homo_coo_rows = ExpandIndptrImpl( + homo_sub_indptr, indices.scalar_type(), torch::nullopt, num_edges); + num_edges = homo_coo_rows->size(0); + const auto is_probs_initialized = sliced_probs_or_mask.has_value(); + if (!is_probs_initialized) { + sliced_probs_or_mask = + torch::empty(*num_edges, sub_indptr.options().dtype(torch::kBool)); + } + GRAPHBOLT_DISPATCH_ALL_TYPES( + sliced_probs_or_mask->scalar_type(), + "SampleNeighborsTemporalProbsOrMask", ([&] { + const scalar_t* input_probs_ptr = + is_probs_initialized ? sliced_probs_or_mask->data_ptr() + : nullptr; + auto output_probs_ptr = sliced_probs_or_mask->data_ptr(); + using timestamp_t = int64_t; + const auto seeds_timestamp_ptr = + seeds_timestamp->data_ptr(); + const timestamp_t* seeds_pre_time_window_ptr = + seeds_pre_time_window.has_value() + ? seeds_pre_time_window->data_ptr() + : nullptr; + const timestamp_t* node_timestamp_ptr = + node_timestamp.has_value() + ? node_timestamp->data_ptr() + : nullptr; + const timestamp_t* edge_timestamp_ptr = + edge_timestamp.has_value() + ? edge_timestamp->data_ptr() + : nullptr; + AT_DISPATCH_INDEX_TYPES( + homo_coo_rows->scalar_type(), + "SampleNeighborsTemporalMaskIndices", ([&] { + const auto coo_rows_ptr = homo_coo_rows->data_ptr(); + const auto indices_ptr = indices.data_ptr(); + AT_DISPATCH_INDEX_TYPES( + homo_sliced_indptr.scalar_type(), + "SampleNeighborsTemporalMaskIndptr", ([&] { + const auto sliced_indptr_data = + homo_sliced_indptr.data_ptr(); + const auto sub_indptr_data = + homo_sub_indptr.data_ptr(); + CUB_CALL( + DeviceFor::Bulk, *num_edges, + [=] __device__(int64_t i) { + const auto row = coo_rows_ptr[i]; + const auto seed_timestamp = + seeds_timestamp_ptr[row]; + const auto row_offset = i - sub_indptr_data[row]; + const auto in_idx = + sliced_indptr_data[row] + row_offset; + bool mask = true; + if (node_timestamp_ptr) { + const auto index = indices_ptr[in_idx]; + const auto neighbor_timestamp = + node_timestamp_ptr[index]; + mask &= neighbor_timestamp < seed_timestamp; + if (seeds_pre_time_window_ptr) { + mask &= neighbor_timestamp > + seed_timestamp - + seeds_pre_time_window_ptr[row]; + } + } + if (edge_timestamp_ptr) { + const auto edge_timestamp = + edge_timestamp_ptr[in_idx]; + mask &= edge_timestamp < seed_timestamp; + if (seeds_pre_time_window_ptr) { + mask &= edge_timestamp > + seed_timestamp - + seeds_pre_time_window_ptr[row]; + } + } + const scalar_t prob = input_probs_ptr + ? input_probs_ptr[i] + : scalar_t{1}; + output_probs_ptr[i] = + prob * static_cast(mask); + }); + })); + })); + })); + } const continuous_seed random_seed = [&] { if (random_seed_tensor.has_value()) { return continuous_seed(random_seed_tensor.value(), seed2_contribution); @@ -302,14 +399,14 @@ c10::intrusive_ptr SampleNeighbors( AT_DISPATCH_INDEX_TYPES( indptr.scalar_type(), "SampleNeighborsIndptr", ([&] { using indptr_t = index_t; - if (probs_or_mask.has_value()) { // Count nonzero probs into in_degree. + if (sliced_probs_or_mask.has_value()) { + // Count nonzero probs into in_degree. 
GRAPHBOLT_DISPATCH_ALL_TYPES( - probs_or_mask.value().scalar_type(), + sliced_probs_or_mask->scalar_type(), "SampleNeighborsPositiveProbs", ([&] { using probs_t = scalar_t; auto is_nonzero = thrust::make_transform_iterator( - sliced_probs_or_mask.value().data_ptr(), - IsPositive{}); + sliced_probs_or_mask->data_ptr(), IsPositive{}); CUB_CALL( DeviceSegmentedReduce::Sum, is_nonzero, in_degree.data_ptr(), num_rows, @@ -333,9 +430,14 @@ c10::intrusive_ptr SampleNeighbors( // This operation is placed after num_sampled_edges copy is started to // hide the latency of copy synchronization later. - auto coo_rows = ExpandIndptrImpl( - sub_indptr, indices.scalar_type(), torch::nullopt, num_edges); - num_edges = coo_rows.size(0); + torch::Tensor coo_rows; + if (!homo_coo_rows.has_value() || fanouts.size() > 1) { + coo_rows = ExpandIndptrImpl( + sub_indptr, indices.scalar_type(), torch::nullopt, num_edges); + num_edges = coo_rows.size(0); + } else { + coo_rows = *homo_coo_rows; + } // Find the smallest integer type to store the edge id offsets. We synch // the CUDAEvent so that the access is safe. @@ -343,7 +445,7 @@ c10::intrusive_ptr SampleNeighbors( max_in_degree_event.synchronize(); return cuda::NumberOfBits(max_in_degree.data_ptr()[0]); }; - if (layer || probs_or_mask.has_value()) { + if (layer || sliced_probs_or_mask.has_value()) { const int num_bits = compute_num_bits(); std::array type_bits = {8, 16, 32, 64}; const auto type_index = @@ -374,9 +476,9 @@ c10::intrusive_ptr SampleNeighbors( indices.scalar_type(), "SampleNeighborsIndices", ([&] { using indices_t = index_t; auto probs_or_mask_scalar_type = torch::kFloat32; - if (probs_or_mask.has_value()) { + if (sliced_probs_or_mask.has_value()) { probs_or_mask_scalar_type = - probs_or_mask.value().scalar_type(); + sliced_probs_or_mask->scalar_type(); } GRAPHBOLT_DISPATCH_ALL_TYPES( probs_or_mask_scalar_type, "SampleNeighborsProbs", @@ -384,8 +486,8 @@ c10::intrusive_ptr SampleNeighbors( using probs_t = scalar_t; probs_t* sliced_probs_ptr = nullptr; if (sliced_probs_or_mask.has_value()) { - sliced_probs_ptr = sliced_probs_or_mask.value() - .data_ptr(); + sliced_probs_ptr = + sliced_probs_or_mask->data_ptr(); } const indices_t* indices_ptr = layer ? 
indices.data_ptr() : nullptr; diff --git a/graphbolt/src/fused_csc_sampling_graph.cc b/graphbolt/src/fused_csc_sampling_graph.cc index 479f42df5d0e..c5bfd6fd0de8 100644 --- a/graphbolt/src/fused_csc_sampling_graph.cc +++ b/graphbolt/src/fused_csc_sampling_graph.cc @@ -481,7 +481,7 @@ template c10::intrusive_ptr FusedCSCSamplingGraph::SampleNeighborsImpl( const torch::Tensor& seeds, - torch::optional>& seed_offsets, + const torch::optional>& seed_offsets, const std::vector& fanouts, NumPickFn num_pick_fn, PickFn pick_fn) const { const int64_t num_seeds = seeds.size(0); @@ -872,17 +872,56 @@ c10::intrusive_ptr FusedCSCSamplingGraph::SampleNeighbors( c10::intrusive_ptr FusedCSCSamplingGraph::TemporalSampleNeighbors( - const torch::Tensor& input_nodes, - const torch::Tensor& input_nodes_timestamp, - const std::vector& fanouts, bool replace, bool layer, - torch::optional input_nodes_pre_time_window, + const torch::optional& seeds, + const torch::optional>& seed_offsets, + const torch::Tensor& seeds_timestamp, const std::vector& fanouts, + bool replace, bool layer, bool returning_indices_is_optional, + torch::optional seeds_pre_time_window, torch::optional probs_or_mask, torch::optional node_timestamp_attr_name, torch::optional edge_timestamp_attr_name, torch::optional random_seed, double seed2_contribution) const { - torch::optional> seed_offsets = torch::nullopt; - // 1. Get probs_or_mask. + // 1. Get the timestamp attribute for nodes of the graph + const auto node_timestamp = this->NodeAttribute(node_timestamp_attr_name); + // 2. Get the timestamp attribute for edges of the graph + const auto edge_timestamp = this->EdgeAttribute(edge_timestamp_attr_name); + // If seeds does not have a value, then we expect all arguments to be resident + // on the GPU. If seeds has a value, then we expect them to be accessible from + // GPU. This is required for the dispatch to work when CUDA is not available. + if (((!seeds.has_value() && utils::is_on_gpu(indptr_) && + utils::is_on_gpu(indices_) && + (!probs_or_mask.has_value() || + utils::is_on_gpu(probs_or_mask.value())) && + (!type_per_edge_.has_value() || + utils::is_on_gpu(type_per_edge_.value()))) || + (seeds.has_value() && utils::is_on_gpu(seeds.value()) && + utils::is_accessible_from_gpu(indptr_) && + utils::is_accessible_from_gpu(indices_) && + (!probs_or_mask.has_value() || + utils::is_accessible_from_gpu(probs_or_mask.value())) && + (!type_per_edge_.has_value() || + utils::is_accessible_from_gpu(type_per_edge_.value())))) && + utils::is_accessible_from_gpu(seeds_timestamp) && + (!seeds_pre_time_window.has_value() || + utils::is_accessible_from_gpu(*seeds_pre_time_window)) && + (!node_timestamp.has_value() || + utils::is_accessible_from_gpu(*node_timestamp)) && + (!edge_timestamp.has_value() || + utils::is_accessible_from_gpu(*edge_timestamp)) && + !replace) { + GRAPHBOLT_DISPATCH_CUDA_ONLY_DEVICE( + c10::DeviceType::CUDA, "SampleNeighbors", { + return ops::SampleNeighbors( + indptr_, indices_, seeds, seed_offsets, fanouts, replace, layer, + returning_indices_is_optional, type_per_edge_, probs_or_mask, + node_type_offset_, node_type_to_id_, edge_type_to_id_, + random_seed, seed2_contribution, seeds_timestamp, + seeds_pre_time_window, node_timestamp, edge_timestamp); + }); + } + TORCH_CHECK(seeds.has_value(), "Nodes can not be None for CPU."); + // 3. Get probs_or_mask. if (probs_or_mask.has_value()) { // Note probs will be passed as input for 'torch.multinomial' in deeper // stack, which doesn't support 'torch.half' and 'torch.bool' data types. 
To @@ -892,10 +931,6 @@ FusedCSCSamplingGraph::TemporalSampleNeighbors( probs_or_mask = probs_or_mask.value().to(torch::kFloat32); } } - // 2. Get the timestamp attribute for nodes of the graph - auto node_timestamp = this->NodeAttribute(node_timestamp_attr_name); - // 3. Get the timestamp attribute for edges of the graph - auto edge_timestamp = this->EdgeAttribute(edge_timestamp_attr_name); // 4. Call SampleNeighborsImpl if (layer) { if (random_seed.has_value() && random_seed->numel() >= 2) { @@ -904,15 +939,15 @@ FusedCSCSamplingGraph::TemporalSampleNeighbors( {random_seed.value(), static_cast(seed2_contribution)}, NumNodes()}; return SampleNeighborsImpl( - input_nodes, seed_offsets, fanouts, + *seeds, seed_offsets, fanouts, GetTemporalNumPickFn( - input_nodes_timestamp, indices_, fanouts, replace, type_per_edge_, - input_nodes_pre_time_window, probs_or_mask, node_timestamp, + seeds_timestamp, indices_, fanouts, replace, type_per_edge_, + seeds_pre_time_window, probs_or_mask, node_timestamp, edge_timestamp), GetTemporalPickFn( - input_nodes_timestamp, indices_, fanouts, replace, - indptr_.options(), type_per_edge_, input_nodes_pre_time_window, - probs_or_mask, node_timestamp, edge_timestamp, args)); + seeds_timestamp, indices_, fanouts, replace, indptr_.options(), + type_per_edge_, seeds_pre_time_window, probs_or_mask, + node_timestamp, edge_timestamp, args)); } else { auto args = [&] { if (random_seed.has_value() && random_seed->numel() == 1) { @@ -927,27 +962,27 @@ FusedCSCSamplingGraph::TemporalSampleNeighbors( } }(); return SampleNeighborsImpl( - input_nodes, seed_offsets, fanouts, + *seeds, seed_offsets, fanouts, GetTemporalNumPickFn( - input_nodes_timestamp, indices_, fanouts, replace, type_per_edge_, - input_nodes_pre_time_window, probs_or_mask, node_timestamp, + seeds_timestamp, indices_, fanouts, replace, type_per_edge_, + seeds_pre_time_window, probs_or_mask, node_timestamp, edge_timestamp), GetTemporalPickFn( - input_nodes_timestamp, indices_, fanouts, replace, - indptr_.options(), type_per_edge_, input_nodes_pre_time_window, - probs_or_mask, node_timestamp, edge_timestamp, args)); + seeds_timestamp, indices_, fanouts, replace, indptr_.options(), + type_per_edge_, seeds_pre_time_window, probs_or_mask, + node_timestamp, edge_timestamp, args)); } } else { SamplerArgs args; return SampleNeighborsImpl( - input_nodes, seed_offsets, fanouts, + *seeds, seed_offsets, fanouts, GetTemporalNumPickFn( - input_nodes_timestamp, this->indices_, fanouts, replace, - type_per_edge_, input_nodes_pre_time_window, probs_or_mask, - node_timestamp, edge_timestamp), + seeds_timestamp, this->indices_, fanouts, replace, type_per_edge_, + seeds_pre_time_window, probs_or_mask, node_timestamp, + edge_timestamp), GetTemporalPickFn( - input_nodes_timestamp, this->indices_, fanouts, replace, - indptr_.options(), type_per_edge_, input_nodes_pre_time_window, + seeds_timestamp, this->indices_, fanouts, replace, + indptr_.options(), type_per_edge_, seeds_pre_time_window, probs_or_mask, node_timestamp, edge_timestamp, args)); } } diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index f311a288787a..7288c969d0a7 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -557,6 +557,7 @@ def _convert_to_homogeneous_nodes( ) return ( torch.cat(homogeneous_nodes), + homogeneous_node_offsets, torch.cat(homogeneous_timestamps), homogeneous_time_windows, ) @@ -1040,11 +1041,11 
@@ def sample_layer_neighbors( def temporal_sample_neighbors( self, - nodes: Union[torch.Tensor, Dict[str, torch.Tensor]], - input_nodes_timestamp: Union[torch.Tensor, Dict[str, torch.Tensor]], + seeds: Union[torch.Tensor, Dict[str, torch.Tensor]], + seeds_timestamp: Union[torch.Tensor, Dict[str, torch.Tensor]], fanouts: torch.Tensor, replace: bool = False, - input_nodes_pre_time_window: Optional[ + seeds_pre_time_window: Optional[ Union[torch.Tensor, Dict[str, torch.Tensor]] ] = None, probs_name: Optional[str] = None, @@ -1055,14 +1056,14 @@ def temporal_sample_neighbors( subgraph. If `node_timestamp_attr_name` or `edge_timestamp_attr_name` is given, - the sampled neighbor or edge of an input node must have a timestamp - that is smaller than that of the input node. + the sampled neighbor or edge of an seed node must have a timestamp + that is smaller than that of the seed node. Parameters ---------- - nodes: torch.Tensor + seeds: torch.Tensor IDs of the given seed nodes. - input_nodes_timestamp: torch.Tensor + seeds_timestamp: torch.Tensor Timestamps of the given seed nodes. fanouts: torch.Tensor The number of edges to be sampled for each node with or without @@ -1085,12 +1086,11 @@ def temporal_sample_neighbors( Boolean indicating whether the sample is preformed with or without replacement. If True, a value can be selected multiple times. Otherwise, each value can be selected only once. - input_nodes_pre_time_window: torch.Tensor + seeds_pre_time_window: torch.Tensor The time window of the nodes represents a period of time before - `input_nodes_timestamp`. If provided, only neighbors and related - edges whose timestamps fall within `[input_nodes_timestamp - - input_nodes_pre_time_window, input_nodes_timestamp]` will be - filtered. + `seeds_timestamp`. If provided, only neighbors and related + edges whose timestamps fall within `[seeds_timestamp - + seeds_pre_time_window, seeds_timestamp]` will be filtered. probs_name: str, optional An optional string specifying the name of an edge attribute. This attribute tensor should contain (unnormalized) probabilities @@ -1107,40 +1107,48 @@ def temporal_sample_neighbors( SampledSubgraphImpl The sampled subgraph. """ - if isinstance(nodes, dict): + seed_offsets = None + if isinstance(seeds, dict): ( - nodes, - input_nodes_timestamp, - input_nodes_pre_time_window, + seeds, + seed_offsets, + seeds_timestamp, + seeds_pre_time_window, ) = self._convert_to_homogeneous_nodes( - nodes, input_nodes_timestamp, input_nodes_pre_time_window + seeds, seeds_timestamp, seeds_pre_time_window ) + elif seeds is None: + seed_offsets = self._indptr_node_type_offset_list # Ensure nodes is 1-D tensor. 
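For reference, a minimal usage sketch of the renamed Python entry point shown in this hunk, built on a tiny toy graph (the graph, timestamps and fanout below are made up for illustration; the attribute name "timestamp" matches the one used in the tests later in this series):

    import torch
    import dgl.graphbolt as gb

    # 3-node toy graph in CSC form: indptr has one entry per node plus one.
    indptr = torch.LongTensor([0, 2, 4, 5])
    indices = torch.LongTensor([1, 2, 0, 2, 1])
    graph = gb.fused_csc_sampling_graph(indptr, indices)
    graph.node_attributes = {
        "timestamp": torch.tensor([10, 20, 30], dtype=torch.int64)
    }
    graph.edge_attributes = {
        "timestamp": torch.tensor([5, 15, 8, 25, 12], dtype=torch.int64)
    }

    seeds = torch.LongTensor([0, 2])
    seeds_timestamp = torch.tensor([50, 18], dtype=torch.int64)
    # Only neighbors and edges strictly older than each seed's timestamp are
    # eligible; seed 2 (timestamp 18) therefore gets no neighbors here.
    subgraph = graph.temporal_sample_neighbors(
        seeds,
        seeds_timestamp,
        torch.LongTensor([2]),  # fanouts
        node_timestamp_attr_name="timestamp",
        edge_timestamp_attr_name="timestamp",
    )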
probs_or_mask = self.edge_attributes[probs_name] if probs_name else None - self._check_sampler_arguments(nodes, fanouts, probs_or_mask) + self._check_sampler_arguments(seeds, fanouts, probs_or_mask) C_sampled_subgraph = self._c_csc_graph.temporal_sample_neighbors( - nodes, - input_nodes_timestamp, + seeds, + seed_offsets, + seeds_timestamp, fanouts.tolist(), replace, False, # is_labor - input_nodes_pre_time_window, + False, # returning_indices_is_optional + seeds_pre_time_window, probs_or_mask, node_timestamp_attr_name, edge_timestamp_attr_name, None, # random_seed, labor parameter 0, # seed2_contribution, labor_parameter ) - return self._convert_to_sampled_subgraph(C_sampled_subgraph) + return self._convert_to_sampled_subgraph( + C_sampled_subgraph, seed_offsets + ) def temporal_sample_layer_neighbors( self, - nodes: Union[torch.Tensor, Dict[str, torch.Tensor]], - input_nodes_timestamp: Union[torch.Tensor, Dict[str, torch.Tensor]], + seeds: Union[torch.Tensor, Dict[str, torch.Tensor]], + seeds_timestamp: Union[torch.Tensor, Dict[str, torch.Tensor]], fanouts: torch.Tensor, replace: bool = False, - input_nodes_pre_time_window: Optional[ + seeds_pre_time_window: Optional[ Union[torch.Tensor, Dict[str, torch.Tensor]] ] = None, probs_name: Optional[str] = None, @@ -1155,14 +1163,14 @@ def temporal_sample_layer_neighbors( `__ If `node_timestamp_attr_name` or `edge_timestamp_attr_name` is given, - the sampled neighbor or edge of an input node must have a timestamp - that is smaller than that of the input node. + the sampled neighbor or edge of an seed node must have a timestamp + that is smaller than that of the seed node. Parameters ---------- - nodes: torch.Tensor + seeds: torch.Tensor IDs of the given seed nodes. - input_nodes_timestamp: torch.Tensor + seeds_timestamp: torch.Tensor Timestamps of the given seed nodes. fanouts: torch.Tensor The number of edges to be sampled for each node with or without @@ -1185,11 +1193,11 @@ def temporal_sample_layer_neighbors( Boolean indicating whether the sample is preformed with or without replacement. If True, a value can be selected multiple times. Otherwise, each value can be selected only once. - input_nodes_pre_time_window: torch.Tensor + seeds_pre_time_window: torch.Tensor The time window of the nodes represents a period of time before - `input_nodes_timestamp`. If provided, only neighbors and related - edges whose timestamps fall within `[input_nodes_timestamp - - input_nodes_pre_time_window, input_nodes_timestamp]` will be + `seeds_timestamp`. If provided, only neighbors and related + edges whose timestamps fall within `[seeds_timestamp - + seeds_pre_time_window, seeds_timestamp]` will be filtered. probs_name: str, optional An optional string specifying the name of an edge attribute. This @@ -1233,32 +1241,40 @@ def temporal_sample_layer_neighbors( SampledSubgraphImpl The sampled subgraph. """ - if isinstance(nodes, dict): + seed_offsets = None + if isinstance(seeds, dict): ( - nodes, - input_nodes_timestamp, - input_nodes_pre_time_window, + seeds, + seed_offsets, + seeds_timestamp, + seeds_pre_time_window, ) = self._convert_to_homogeneous_nodes( - nodes, input_nodes_timestamp, input_nodes_pre_time_window + seeds, seeds_timestamp, seeds_pre_time_window ) + elif seeds is None: + seed_offsets = self._indptr_node_type_offset_list # Ensure nodes is 1-D tensor. 
probs_or_mask = self.edge_attributes[probs_name] if probs_name else None - self._check_sampler_arguments(nodes, fanouts, probs_or_mask) + self._check_sampler_arguments(seeds, fanouts, probs_or_mask) C_sampled_subgraph = self._c_csc_graph.temporal_sample_neighbors( - nodes, - input_nodes_timestamp, + seeds, + seed_offsets, + seeds_timestamp, fanouts.tolist(), replace, True, # is_labor - input_nodes_pre_time_window, + False, # returning_indices_is_optional + seeds_pre_time_window, probs_or_mask, node_timestamp_attr_name, edge_timestamp_attr_name, random_seed, seed2_contribution, ) - return self._convert_to_sampled_subgraph(C_sampled_subgraph) + return self._convert_to_sampled_subgraph( + C_sampled_subgraph, seed_offsets + ) def sample_negative_edges_uniform( self, edge_type, node_pairs, negative_ratio diff --git a/python/dgl/graphbolt/impl/temporal_neighbor_sampler.py b/python/dgl/graphbolt/impl/temporal_neighbor_sampler.py index 97cc677fd400..80a5fb90ac5d 100644 --- a/python/dgl/graphbolt/impl/temporal_neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/temporal_neighbor_sampler.py @@ -61,15 +61,16 @@ def sample_subgraphs( ) for ntype in ntypes } + empty_tensor = torch.tensor( + [], dtype=torch.int64, device=first_val.device + ) seeds_timestamp = { - ntype: seeds_timestamp.get(ntype, torch.LongTensor([])) + ntype: seeds_timestamp.get(ntype, empty_tensor) for ntype in ntypes } if seeds_pre_time_window: seeds_pre_time_window = { - ntype: seeds_pre_time_window.get( - ntype, torch.LongTensor([]) - ) + ntype: seeds_pre_time_window.get(ntype, empty_tensor) for ntype in ntypes } for hop in range(num_layers): diff --git a/python/dgl/graphbolt/internal/sample_utils.py b/python/dgl/graphbolt/internal/sample_utils.py index 97aaa411c388..aaeb4a3e8312 100644 --- a/python/dgl/graphbolt/internal/sample_utils.py +++ b/python/dgl/graphbolt/internal/sample_utils.py @@ -1,6 +1,5 @@ """Utility functions for sampling.""" -import copy from collections import defaultdict from typing import Dict, List, Optional, Tuple, Union @@ -386,9 +385,11 @@ def compact_csc_format( else: compacted_csc_formats = {} src_timestamps = None - original_row_ids = copy.deepcopy(dst_nodes) + original_row_ids = {key: val.clone() for key, val in dst_nodes.items()} if has_timestamp: - src_timestamps = copy.deepcopy(dst_timestamps) + src_timestamps = { + key: val.clone() for key, val in dst_timestamps.items() + } for etype, csc_format in csc_formats.items(): src_type, _, dst_type = etype_str_to_tuple(etype) assert len(dst_nodes.get(dst_type, [])) + 1 == len( @@ -429,7 +430,9 @@ def compact_csc_format( src_timestamps.get( src_type, torch.tensor( - [], dtype=dst_timestamps[dst_type].dtype + [], + dtype=dst_timestamps[dst_type].dtype, + device=device, ), ), _broadcast_timestamps( From 90c26be2689f8f4a4571462af8bc83a1043d2d56 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 9 Aug 2024 08:52:02 -0400 Subject: [PATCH 08/78] [GraphBolt][CUDA][Temporal] Tests and example enablement. 
(#7678) --- .../graphbolt/temporal_link_prediction.py | 5 +- .../impl/test_fused_csc_sampling_graph.py | 39 +++++----- .../graphbolt/test_subgraph_sampler.py | 74 +++++++++---------- 3 files changed, 59 insertions(+), 59 deletions(-) diff --git a/examples/graphbolt/temporal_link_prediction.py b/examples/graphbolt/temporal_link_prediction.py index 25851100019c..d1b97d4ab6aa 100644 --- a/examples/graphbolt/temporal_link_prediction.py +++ b/examples/graphbolt/temporal_link_prediction.py @@ -121,6 +121,9 @@ def create_dataloader(args, graph, features, itemset, is_train=True): shuffle=is_train, ) + if args.storage_device != "cpu": + datapipe = datapipe.copy_to(device=args.device) + ############################################################################ # [Input]: # 'datapipe' is either 'ItemSampler' or 'UniformNegativeSampler' depending @@ -250,7 +253,7 @@ def parse_args(): parser.add_argument( "--mode", default="cpu-cuda", - choices=["cpu-cpu", "cpu-cuda"], + choices=["cpu-cpu", "cpu-cuda", "cuda-cuda"], help="Dataset storage placement and Train device: 'cpu' for CPU and RAM," " 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", ) diff --git a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py index ffedcfe24538..fca1dbfdbcbc 100644 --- a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py +++ b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py @@ -830,10 +830,6 @@ def test_in_subgraph_hetero(): ) -@unittest.skipIf( - F._default_context_str == "gpu", - reason="Graph is CPU only at present.", -) @pytest.mark.parametrize("indptr_dtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("indices_dtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("replace", [False, True]) @@ -848,6 +844,8 @@ def test_temporal_sample_neighbors_homo( use_node_timestamp, use_edge_timestamp, ): + if replace and F._default_context_str == "gpu": + pytest.skip("Sampling with replacement not yet implemented on the GPU.") """Original graph in COO: 1 0 1 0 1 1 0 1 1 0 @@ -867,7 +865,7 @@ def test_temporal_sample_neighbors_homo( assert len(indptr) == total_num_nodes + 1 # Construct FusedCSCSamplingGraph. - graph = gb.fused_csc_sampling_graph(indptr, indices) + graph = gb.fused_csc_sampling_graph(indptr, indices).to(F.ctx()) # Generate subgraph via sample neighbors. 
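The example and test updates in this patch follow one rule: once the graph lives on the GPU, everything handed to the temporal sampler (seeds, seed timestamps, and the "timestamp" attributes) must be int64 tensors on that same device. Continuing the toy graph from the earlier sketch, and assuming a CUDA build (the device string is illustrative):

    device = "cuda"             # assumes a CUDA build; "cpu" also works
    graph = graph.to(device)    # moves indptr, indices and attached attributes
    seeds = seeds.to(device)
    seeds_timestamp = seeds_timestamp.to(device)
    subgraph = graph.temporal_sample_neighbors(
        seeds,
        seeds_timestamp,
        torch.LongTensor([2]),
        node_timestamp_attr_name="timestamp",
        edge_timestamp_attr_name="timestamp",
    )

Note that the tests in this patch skip replace=True on the GPU, since sampling with replacement is not yet implemented there.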
fanouts = torch.LongTensor([2]) @@ -878,15 +876,17 @@ def test_temporal_sample_neighbors_homo( ) seed_list = [1, 3, 4] - seed_timestamp = torch.randint(0, 100, (len(seed_list),), dtype=torch.int64) + seed_timestamp = torch.randint( + 0, 100, (len(seed_list),), dtype=torch.int64, device=F.ctx() + ) if use_node_timestamp: node_timestamp = torch.randint( - 0, 100, (total_num_nodes,), dtype=torch.int64 + 0, 100, (total_num_nodes,), dtype=torch.int64, device=F.ctx() ) graph.node_attributes = {"timestamp": node_timestamp} if use_edge_timestamp: edge_timestamp = torch.randint( - 0, 100, (total_num_edges,), dtype=torch.int64 + 0, 100, (total_num_edges,), dtype=torch.int64, device=F.ctx() ) graph.edge_attributes = {"timestamp": edge_timestamp} @@ -936,7 +936,7 @@ def _get_available_neighbors(): available_neighbors.append(neighbors) return available_neighbors - nodes = torch.tensor(seed_list, dtype=indices_dtype) + nodes = torch.tensor(seed_list, dtype=indices_dtype, device=F.ctx()) subgraph = sampler( nodes, seed_timestamp, @@ -947,6 +947,7 @@ def _get_available_neighbors(): ) sampled_count = torch.diff(subgraph.sampled_csc.indptr).tolist() available_neighbors = _get_available_neighbors() + assert len(available_neighbors) == len(sampled_count) for i, count in enumerate(sampled_count): if not replace: expect_count = min(fanouts[0], len(available_neighbors[i])) @@ -958,10 +959,6 @@ def _get_available_neighbors(): assert set(neighbors.tolist()).issubset(set(available_neighbors[i])) -@unittest.skipIf( - F._default_context_str == "gpu", - reason="Graph is CPU only at present.", -) @pytest.mark.parametrize("indptr_dtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("indices_dtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("replace", [False, True]) @@ -976,6 +973,8 @@ def test_temporal_sample_neighbors_hetero( use_node_timestamp, use_edge_timestamp, ): + if replace and F._default_context_str == "gpu": + pytest.skip("Sampling with replacement not yet implemented on the GPU.") """Original graph in COO: "n1:e1:n2":[0, 0, 1, 1, 1], [0, 2, 0, 1, 2] "n2:e2:n1":[0, 0, 1, 2], [0, 1, 1 ,0] @@ -1006,7 +1005,7 @@ def test_temporal_sample_neighbors_hetero( type_per_edge=type_per_edge, node_type_to_id=ntypes, edge_type_to_id=etypes, - ) + ).to(F.ctx()) # Generate subgraph via sample neighbors. 
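For heterogeneous graphs the same call accepts per-node-type dictionaries for seeds, timestamps, and the optional pre-time windows, plus one fanout per edge type; the graph flattens them internally via _convert_to_homogeneous_nodes. A sketch mirroring the hetero test that follows, assuming the "timestamp" attributes are attached as in that test (type names taken from the test):

    seeds = {
        "n1": torch.LongTensor([0]),
        "n2": torch.LongTensor([0]),
    }
    seeds_timestamp = {
        "n1": torch.tensor([40], dtype=torch.int64),
        "n2": torch.tensor([55], dtype=torch.int64),
    }
    subgraph = graph.temporal_sample_neighbors(
        seeds,
        seeds_timestamp,
        torch.LongTensor([-1, -1]),  # one fanout per edge type: "n1:e1:n2", "n2:e2:n1"
        node_timestamp_attr_name="timestamp",
    )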
fanouts = torch.LongTensor([-1, -1]) @@ -1017,8 +1016,8 @@ def test_temporal_sample_neighbors_hetero( ) seeds = { - "n1": torch.tensor([0], dtype=indices_dtype), - "n2": torch.tensor([0], dtype=indices_dtype), + "n1": torch.tensor([0], dtype=indices_dtype, device=F.ctx()), + "n2": torch.tensor([0], dtype=indices_dtype, device=F.ctx()), } per_etype_destination_nodes = { "n1:e1:n2": torch.tensor([1], dtype=indices_dtype), @@ -1026,17 +1025,17 @@ def test_temporal_sample_neighbors_hetero( } seed_timestamp = { - "n1": torch.randint(0, 100, (1,), dtype=torch.int64), - "n2": torch.randint(0, 100, (1,), dtype=torch.int64), + "n1": torch.randint(0, 100, (1,), dtype=torch.int64, device=F.ctx()), + "n2": torch.randint(0, 100, (1,), dtype=torch.int64, device=F.ctx()), } if use_node_timestamp: node_timestamp = torch.randint( - 0, 100, (total_num_nodes,), dtype=torch.int64 + 0, 100, (total_num_nodes,), dtype=torch.int64, device=F.ctx() ) graph.node_attributes = {"timestamp": node_timestamp} if use_edge_timestamp: edge_timestamp = torch.randint( - 0, 100, (total_num_edges,), dtype=torch.int64 + 0, 100, (total_num_edges,), dtype=torch.int64, device=F.ctx() ) graph.edge_attributes = {"timestamp": edge_timestamp} diff --git a/tests/python/pytorch/graphbolt/test_subgraph_sampler.py b/tests/python/pytorch/graphbolt/test_subgraph_sampler.py index 650903d561a5..202a05e3fbfa 100644 --- a/tests/python/pytorch/graphbolt/test_subgraph_sampler.py +++ b/tests/python/pytorch/graphbolt/test_subgraph_sampler.py @@ -14,14 +14,6 @@ from . import gb_test_utils -# Skip all tests on GPU when sampling with TemporalNeighborSampler. -def _check_sampler_type(sampler_type): - if F._default_context_str != "cpu" and _is_temporal(sampler_type): - pytest.skip( - "TemporalNeighborSampler sampling tests are only supported on CPU." 
- ) - - def _check_sampler_len(sampler, lenExp): with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) @@ -199,7 +191,6 @@ def test_NeighborSampler_fanouts(labor): ], ) def test_SubgraphSampler_Node(sampler_type): - _check_sampler_type(sampler_type) graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( F.ctx() ) @@ -231,7 +222,6 @@ def test_SubgraphSampler_Node(sampler_type): ], ) def test_SubgraphSampler_Link(sampler_type): - _check_sampler_type(sampler_type) graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( F.ctx() ) @@ -268,7 +258,6 @@ def test_SubgraphSampler_Link(sampler_type): ], ) def test_SubgraphSampler_Link_With_Negative(sampler_type): - _check_sampler_type(sampler_type) graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( F.ctx() ) @@ -302,7 +291,6 @@ def test_SubgraphSampler_Link_With_Negative(sampler_type): ], ) def test_SubgraphSampler_HyperLink(sampler_type): - _check_sampler_type(sampler_type) graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True).to( F.ctx() ) @@ -339,7 +327,6 @@ def test_SubgraphSampler_HyperLink(sampler_type): ], ) def test_SubgraphSampler_Node_Hetero(sampler_type): - _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) items = torch.arange(3) names = "seeds" @@ -375,7 +362,6 @@ def test_SubgraphSampler_Node_Hetero(sampler_type): ], ) def test_SubgraphSampler_Link_Hetero(sampler_type): - _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T first_names = "seeds" @@ -435,7 +421,6 @@ def test_SubgraphSampler_Link_Hetero(sampler_type): ], ) def test_SubgraphSampler_Link_Hetero_With_Negative(sampler_type): - _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T first_names = "seeds" @@ -485,7 +470,6 @@ def test_SubgraphSampler_Link_Hetero_With_Negative(sampler_type): ], ) def test_SubgraphSampler_Link_Hetero_Unknown_Etype(sampler_type): - _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T first_names = "seeds" @@ -535,7 +519,6 @@ def test_SubgraphSampler_Link_Hetero_Unknown_Etype(sampler_type): ], ) def test_SubgraphSampler_Link_Hetero_With_Negative_Unknown_Etype(sampler_type): - _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) first_items = torch.LongTensor([[0, 0, 1, 1], [0, 2, 0, 1]]).T first_names = "seeds" @@ -586,7 +569,6 @@ def test_SubgraphSampler_Link_Hetero_With_Negative_Unknown_Etype(sampler_type): ], ) def test_SubgraphSampler_HyperLink_Hetero(sampler_type): - _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) items = torch.LongTensor([[2, 0, 1, 1, 2], [0, 1, 1, 0, 0]]) names = "seeds" @@ -646,7 +628,6 @@ def test_SubgraphSampler_HyperLink_Hetero(sampler_type): [False, True], ) def test_SubgraphSampler_Random_Hetero_Graph(sampler_type, replace): - _check_sampler_type(sampler_type) if F._default_context_str == "gpu" and replace == True: pytest.skip("Sampling with replacement not yet supported on GPU.") num_nodes = 5 @@ -748,7 +729,6 @@ def test_SubgraphSampler_Random_Hetero_Graph(sampler_type, replace): ], ) def test_SubgraphSampler_without_deduplication_Homo_Node(sampler_type): - _check_sampler_type(sampler_type) graph = dgl.graph( ([5, 0, 1, 5, 6, 7, 2, 2, 4], [0, 1, 2, 2, 2, 2, 3, 4, 4]) ) @@ -758,10 +738,14 @@ def 
test_SubgraphSampler_without_deduplication_Homo_Node(sampler_type): names = "seeds" if _is_temporal(sampler_type): graph.node_attributes = { - "timestamp": torch.zeros(graph.csc_indptr.numel() - 1).to(F.ctx()) + "timestamp": torch.zeros( + graph.csc_indptr.numel() - 1, dtype=torch.int64 + ).to(F.ctx()) } graph.edge_attributes = { - "timestamp": torch.zeros(graph.indices.numel()).to(F.ctx()) + "timestamp": torch.zeros( + graph.indices.numel(), dtype=torch.int64 + ).to(F.ctx()) } items = (items, torch.randint(1, 10, (3,))) names = (names, "timestamp") @@ -822,16 +806,19 @@ def test_SubgraphSampler_without_deduplication_Homo_Node(sampler_type): ], ) def test_SubgraphSampler_without_deduplication_Hetero_Node(sampler_type): - _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) items = torch.arange(2) names = "seeds" if _is_temporal(sampler_type): graph.node_attributes = { - "timestamp": torch.zeros(graph.csc_indptr.numel() - 1).to(F.ctx()) + "timestamp": torch.zeros( + graph.csc_indptr.numel() - 1, dtype=torch.int64, device=F.ctx() + ) } graph.edge_attributes = { - "timestamp": torch.zeros(graph.indices.numel()).to(F.ctx()) + "timestamp": torch.zeros( + graph.indices.numel(), dtype=torch.int64, device=F.ctx() + ) } items = (items, torch.randint(1, 10, (2,))) names = (names, "timestamp") @@ -1084,7 +1071,6 @@ def test_SubgraphSampler_unique_csc_format_Hetero_Node(labor): ], ) def test_SubgraphSampler_Hetero_multifanout_per_layer(sampler_type): - _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) items_n1 = torch.tensor([0]) items_n2 = torch.tensor([1]) @@ -1160,7 +1146,6 @@ def test_SubgraphSampler_Hetero_multifanout_per_layer(sampler_type): ], ) def test_SubgraphSampler_without_deduplication_Homo_Link(sampler_type): - _check_sampler_type(sampler_type) graph = dgl.graph( ([5, 0, 1, 5, 6, 7, 2, 2, 4], [0, 1, 2, 2, 2, 2, 3, 4, 4]) ) @@ -1170,10 +1155,14 @@ def test_SubgraphSampler_without_deduplication_Homo_Link(sampler_type): names = "seeds" if _is_temporal(sampler_type): graph.node_attributes = { - "timestamp": torch.zeros(graph.csc_indptr.numel() - 1).to(F.ctx()) + "timestamp": torch.zeros( + graph.csc_indptr.numel() - 1, dtype=torch.int64 + ).to(F.ctx()) } graph.edge_attributes = { - "timestamp": torch.zeros(graph.indices.numel()).to(F.ctx()) + "timestamp": torch.zeros( + graph.indices.numel(), dtype=torch.int64 + ).to(F.ctx()) } items = (items, torch.randint(1, 10, (2,))) names = (names, "timestamp") @@ -1227,16 +1216,19 @@ def test_SubgraphSampler_without_deduplication_Homo_Link(sampler_type): ], ) def test_SubgraphSampler_without_deduplication_Hetero_Link(sampler_type): - _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) items = torch.arange(2).view(1, 2) names = "seeds" if _is_temporal(sampler_type): graph.node_attributes = { - "timestamp": torch.zeros(graph.csc_indptr.numel() - 1).to(F.ctx()) + "timestamp": torch.zeros( + graph.csc_indptr.numel() - 1, dtype=torch.int64 + ).to(F.ctx()) } graph.edge_attributes = { - "timestamp": torch.zeros(graph.indices.numel()).to(F.ctx()) + "timestamp": torch.zeros( + graph.indices.numel(), dtype=torch.int64 + ).to(F.ctx()) } items = (items, torch.randint(1, 10, (1,))) names = (names, "timestamp") @@ -1542,7 +1534,6 @@ def test_SubgraphSampler_unique_csc_format_Hetero_Link(labor): ], ) def test_SubgraphSampler_without_deduplication_Homo_HyperLink(sampler_type): - _check_sampler_type(sampler_type) graph = dgl.graph( ([5, 0, 1, 5, 6, 7, 2, 2, 4], [0, 1, 2, 2, 2, 2, 3, 4, 4]) ) @@ -1551,10 
+1542,14 @@ def test_SubgraphSampler_without_deduplication_Homo_HyperLink(sampler_type): names = "seeds" if _is_temporal(sampler_type): graph.node_attributes = { - "timestamp": torch.zeros(graph.csc_indptr.numel() - 1).to(F.ctx()) + "timestamp": torch.zeros( + graph.csc_indptr.numel() - 1, dtype=torch.int64 + ).to(F.ctx()) } graph.edge_attributes = { - "timestamp": torch.zeros(graph.indices.numel()).to(F.ctx()) + "timestamp": torch.zeros( + graph.indices.numel(), dtype=torch.int64 + ).to(F.ctx()) } items = (items, torch.randint(1, 10, (2,))) names = (names, "timestamp") @@ -1608,16 +1603,19 @@ def test_SubgraphSampler_without_deduplication_Homo_HyperLink(sampler_type): ], ) def test_SubgraphSampler_without_deduplication_Hetero_HyperLink(sampler_type): - _check_sampler_type(sampler_type) graph = get_hetero_graph().to(F.ctx()) items = torch.arange(3).view(1, 3) names = "seeds" if _is_temporal(sampler_type): graph.node_attributes = { - "timestamp": torch.zeros(graph.csc_indptr.numel() - 1).to(F.ctx()) + "timestamp": torch.zeros( + graph.csc_indptr.numel() - 1, dtype=torch.int64 + ).to(F.ctx()) } graph.edge_attributes = { - "timestamp": torch.zeros(graph.indices.numel()).to(F.ctx()) + "timestamp": torch.zeros( + graph.indices.numel(), dtype=torch.int64 + ).to(F.ctx()) } items = (items, torch.randint(1, 10, (1,))) names = (names, "timestamp") From c86776d898e9f58e3e6182adf2aee9ff6f82d559 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sat, 10 Aug 2024 13:43:02 -0400 Subject: [PATCH 09/78] [GraphBolt][io_uring] Use RAII to ensure queues are returned. (#7680) --- graphbolt/src/cnumpy.cc | 39 +++++-------------------- graphbolt/src/cnumpy.h | 65 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 33 deletions(-) diff --git a/graphbolt/src/cnumpy.cc b/graphbolt/src/cnumpy.cc index 232825625dcb..98e10ad8e71c 100644 --- a/graphbolt/src/cnumpy.cc +++ b/graphbolt/src/cnumpy.cc @@ -171,27 +171,19 @@ torch::Tensor OnDiskNpyArray::IndexSelectIOUringImpl(torch::Tensor index) { // Indicator for index error. std::atomic error_flag{}; std::atomic work_queue{}; - std::atomic_flag exiting_first = ATOMIC_FLAG_INIT; - // Consume a slot so that parallel_for is called only if there are available - // queues. - semaphore_.acquire(); - std::atomic num_semaphore_acquisitions = 1; - graphbolt::parallel_for_each_interop(0, num_thread_, 1, [&](int thread_id) { + // Construct a QueueAndBufferAcquirer object so that the worker threads can + // share the available queues and buffers. + QueueAndBufferAcquirer queue_source(this); + graphbolt::parallel_for_each_interop(0, num_thread_, 1, [&](int) { // The completion queue might contain 4 * kGroupSize while we may submit // 4 * kGroupSize more. No harm in overallocation here. CircularQueue read_queue(8 * kGroupSize); int64_t num_submitted = 0; int64_t num_completed = 0; - { - // We consume a slot from the semaphore to use a queue. - semaphore_.acquire(); - num_semaphore_acquisitions.fetch_add(1, std::memory_order_relaxed); - std::lock_guard lock(available_queues_mtx_); - TORCH_CHECK(!available_queues_.empty()); - thread_id = available_queues_.back(); - available_queues_.pop_back(); - } - auto &io_uring_queue = io_uring_queue_[thread_id]; + auto [acquired_queue_handle, my_read_buffer2] = queue_source.get(); + auto &io_uring_queue = acquired_queue_handle.get(); + // Capturing structured binding is available only in C++20, so we rename. 
+ auto my_read_buffer = my_read_buffer2; auto submit_fn = [&](int64_t submission_minimum_batch_size) { if (read_queue.Size() < submission_minimum_batch_size) return; TORCH_CHECK( // Check for sqe overflow. @@ -207,7 +199,6 @@ torch::Tensor OnDiskNpyArray::IndexSelectIOUringImpl(torch::Tensor index) { read_queue.PopN(submitted); } }; - auto my_read_buffer = ReadBuffer(thread_id); for (int64_t read_buffer_slot = 0; true;) { auto request_read_buffer = [&]() { return my_read_buffer + (aligned_length_ + block_size_) * @@ -307,21 +298,7 @@ torch::Tensor OnDiskNpyArray::IndexSelectIOUringImpl(torch::Tensor index) { io_uring_cq_advance(&io_uring_queue, num_cqes_seen); num_completed += num_cqes_seen; } - { - // We give back the slot we used. - std::lock_guard lock(available_queues_mtx_); - available_queues_.push_back(thread_id); - } - // If this is the first thread exiting, release the master thread's ticket - // as well by releasing 2 slots. Otherwise, release 1 slot. - const auto releasing = exiting_first.test_and_set() ? 1 : 2; - semaphore_.release(releasing); - num_semaphore_acquisitions.fetch_add(-releasing, std::memory_order_relaxed); }); - // If any of the worker threads exit early without being able to release the - // semaphore, we make sure to release it for them in the main thread. - semaphore_.release( - num_semaphore_acquisitions.load(std::memory_order_relaxed)); const auto ret_val = error_flag.load(std::memory_order_relaxed); switch (ret_val) { case 0: // Successful. diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h index 48ac98ebc94c..3f7bd4a99401 100644 --- a/graphbolt/src/cnumpy.h +++ b/graphbolt/src/cnumpy.h @@ -18,11 +18,10 @@ #include #include #include -#include -#include #include #include #include +#include #include namespace graphbolt { @@ -147,6 +146,68 @@ class OnDiskNpyArray : public torch::CustomClassHolder { static inline std::mutex available_queues_mtx_; // available_queues_ mutex. static inline std::vector available_queues_; + struct QueueAndBufferAcquirer { + struct UniqueQueue { + UniqueQueue(QueueAndBufferAcquirer* acquirer, int thread_id) + : acquirer_(acquirer), thread_id_(thread_id) {} + UniqueQueue(const UniqueQueue&) = delete; + UniqueQueue& operator=(const UniqueQueue&) = delete; + + ~UniqueQueue() { + { + // We give back the slot we used. + std::lock_guard lock(available_queues_mtx_); + available_queues_.push_back(thread_id_); + } + // If this is the first thread exiting, release the master thread's + // ticket as well by releasing 2 slots. Otherwise, release 1 slot. + const auto releasing = acquirer_->exiting_first_.test_and_set() ? 1 : 2; + semaphore_.release(releasing); + acquirer_->num_acquisitions_.fetch_add( + -releasing, std::memory_order_relaxed); + } + + ::io_uring& get() const { return io_uring_queue_[thread_id_]; } + + private: + QueueAndBufferAcquirer* acquirer_; + int thread_id_; + }; + + QueueAndBufferAcquirer(OnDiskNpyArray* array) : array_(array) { + semaphore_.acquire(); + } + + ~QueueAndBufferAcquirer() { + // If any of the worker threads exit early without being able to release + // the semaphore, we make sure to release it for them in the main thread. + const auto releasing = num_acquisitions_.load(std::memory_order_relaxed); + semaphore_.release(releasing); + TORCH_CHECK(releasing == 0, "An io_uring worker thread didn't not exit."); + } + + std::pair get() { + // We consume a slot from the semaphore to use a queue. 
+ semaphore_.acquire(); + num_acquisitions_.fetch_add(1, std::memory_order_relaxed); + const auto thread_id = [&] { + std::lock_guard lock(available_queues_mtx_); + TORCH_CHECK(!available_queues_.empty()); + const auto thread_id = available_queues_.back(); + available_queues_.pop_back(); + return thread_id; + }(); + return { + std::piecewise_construct, std::make_tuple(this, thread_id), + std::make_tuple(array_->ReadBuffer(thread_id))}; + } + + private: + const OnDiskNpyArray* array_; + std::atomic_flag exiting_first_ = ATOMIC_FLAG_INIT; + std::atomic num_acquisitions_ = 1; + }; + #endif // HAVE_LIBRARY_LIBURING }; From 4c1e14c6efc8319e51a390e17ba764ea23bda099 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sun, 11 Aug 2024 22:20:23 -0400 Subject: [PATCH 10/78] [GraphBolt][CUDA] Refactor `overlap_graph_fetch`, simplify `gb.DataLoader`. (#7681) --- .../disk_based_feature/node_classification.py | 11 +- examples/graphbolt/node_classification.py | 10 +- .../pyg/labor/node_classification.py | 11 +- .../pyg/node_classification_advanced.py | 14 +- examples/graphbolt/rgcn/hetero_rgcn.py | 10 +- .../multigpu/graphbolt/node_classification.py | 10 +- python/dgl/graphbolt/dataloader.py | 91 +-------- .../impl/fused_csc_sampling_graph.py | 31 +++ python/dgl/graphbolt/impl/neighbor_sampler.py | 188 ++++++++++-------- .../graphbolt/impl/test_neighbor_sampler.py | 28 +-- .../pytorch/graphbolt/test_dataloader.py | 20 +- 11 files changed, 187 insertions(+), 237 deletions(-) diff --git a/examples/graphbolt/disk_based_feature/node_classification.py b/examples/graphbolt/disk_based_feature/node_classification.py index afb6a0ac542a..aaca410947cc 100644 --- a/examples/graphbolt/disk_based_feature/node_classification.py +++ b/examples/graphbolt/disk_based_feature/node_classification.py @@ -115,7 +115,10 @@ def create_dataloader( else {} ) datapipe = getattr(datapipe, args.sample_mode)( - graph, fanout if job != "infer" else [-1], **kwargs + graph, + fanout if job != "infer" else [-1], + overlap_fetch=args.overlap_graph_fetch, + **kwargs, ) # Copy the data to the specified device. if args.feature_device != "cpu": @@ -130,11 +133,7 @@ def create_dataloader( if args.feature_device == "cpu": datapipe = datapipe.copy_to(device=device) # Create and return a DataLoader to handle data loading. - return gb.DataLoader( - datapipe, - num_workers=args.num_workers, - overlap_graph_fetch=args.overlap_graph_fetch, - ) + return gb.DataLoader(datapipe, num_workers=args.num_workers) def train_step(minibatch, optimizer, model, loss_fn): diff --git a/examples/graphbolt/node_classification.py b/examples/graphbolt/node_classification.py index ff9ed2399b23..e5e17de88bde 100644 --- a/examples/graphbolt/node_classification.py +++ b/examples/graphbolt/node_classification.py @@ -117,7 +117,9 @@ def create_dataloader( # Initialize a neighbor sampler for sampling the neighborhoods of nodes. ############################################################################ datapipe = getattr(datapipe, args.sample_mode)( - graph, fanout if job != "infer" else [-1] + graph, + fanout if job != "infer" else [-1], + overlap_fetch=args.storage_device == "pinned", ) ############################################################################ @@ -156,11 +158,7 @@ def create_dataloader( # [Role]: # Initialize a multi-process dataloader to load the data in parallel. 
############################################################################ - dataloader = gb.DataLoader( - datapipe, - num_workers=num_workers, - overlap_graph_fetch=args.storage_device == "pinned", - ) + dataloader = gb.DataLoader(datapipe, num_workers=num_workers) # Return the fully-initialized DataLoader object. return dataloader diff --git a/examples/graphbolt/pyg/labor/node_classification.py b/examples/graphbolt/pyg/labor/node_classification.py index b799b3de8cbe..09f8cb3cf050 100644 --- a/examples/graphbolt/pyg/labor/node_classification.py +++ b/examples/graphbolt/pyg/labor/node_classification.py @@ -147,7 +147,10 @@ def create_dataloader( else {} ) datapipe = getattr(datapipe, args.sample_mode)( - graph, fanout if job != "infer" else [-1], **kwargs + graph, + fanout if job != "infer" else [-1], + overlap_fetch=args.overlap_graph_fetch, + **kwargs, ) # Copy the data to the specified device. if args.feature_device != "cpu" and need_copy: @@ -163,11 +166,7 @@ def create_dataloader( if need_copy: datapipe = datapipe.copy_to(device=device) # Create and return a DataLoader to handle data loading. - return gb.DataLoader( - datapipe, - num_workers=args.num_workers, - overlap_graph_fetch=args.overlap_graph_fetch, - ) + return gb.DataLoader(datapipe, num_workers=args.num_workers) @torch.compile diff --git a/examples/graphbolt/pyg/node_classification_advanced.py b/examples/graphbolt/pyg/node_classification_advanced.py index 5066016f7e18..3b066a511b32 100644 --- a/examples/graphbolt/pyg/node_classification_advanced.py +++ b/examples/graphbolt/pyg/node_classification_advanced.py @@ -195,7 +195,11 @@ def create_dataloader( need_copy = False # Sample neighbors for each node in the mini-batch. datapipe = getattr(datapipe, args.sample_mode)( - graph, fanout if job != "infer" else [-1] + graph, + fanout if job != "infer" else [-1], + overlap_fetch=args.overlap_graph_fetch, + num_gpu_cached_edges=args.num_gpu_cached_edges, + gpu_cache_threshold=args.gpu_graph_caching_threshold, ) # Copy the data to the specified device. if args.feature_device != "cpu" and need_copy: @@ -211,13 +215,7 @@ def create_dataloader( if need_copy: datapipe = datapipe.copy_to(device=device) # Create and return a DataLoader to handle data loading. - return gb.DataLoader( - datapipe, - num_workers=args.num_workers, - overlap_graph_fetch=args.overlap_graph_fetch, - num_gpu_cached_edges=args.num_gpu_cached_edges, - gpu_cache_threshold=args.gpu_graph_caching_threshold, - ) + return gb.DataLoader(datapipe, num_workers=args.num_workers) @torch.compile diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index d834b84f7d9e..eec00e12f11f 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -124,7 +124,9 @@ def create_dataloader( # The graph(FusedCSCSamplingGraph) from which to sample neighbors. # `fanouts`: # The number of neighbors to sample for each node in each layer. - datapipe = datapipe.sample_neighbor(graph, fanouts=fanouts) + datapipe = datapipe.sample_neighbor( + graph, fanouts=fanouts, overlap_fetch=args.overlap_graph_fetch + ) # Fetch the features for each node in the mini-batch. # `features`: @@ -141,11 +143,7 @@ def create_dataloader( # Create a DataLoader from the datapipe. # `num_workers`: # The number of worker processes to use for data loading. 
- return gb.DataLoader( - datapipe, - num_workers=num_workers, - overlap_graph_fetch=args.overlap_graph_fetch, - ) + return gb.DataLoader(datapipe, num_workers=num_workers) def extract_embed(node_embed, input_nodes): diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py index 32da55f782d2..35ae6fcc38d4 100644 --- a/examples/multigpu/graphbolt/node_classification.py +++ b/examples/multigpu/graphbolt/node_classification.py @@ -134,16 +134,14 @@ def create_dataloader( ############################################################################ if args.storage_device != "cpu": datapipe = datapipe.copy_to(device) - datapipe = datapipe.sample_neighbor(graph, args.fanout) + datapipe = datapipe.sample_neighbor( + graph, args.fanout, overlap_fetch=args.storage_device == "pinned" + ) datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"]) if args.storage_device == "cpu": datapipe = datapipe.copy_to(device) - dataloader = gb.DataLoader( - datapipe, - args.num_workers, - overlap_graph_fetch=args.storage_device == "pinned", - ) + dataloader = gb.DataLoader(datapipe, args.num_workers) # Return the fully-initialized DataLoader object. return dataloader diff --git a/python/dgl/graphbolt/dataloader.py b/python/dgl/graphbolt/dataloader.py index bb37cf2e9806..d76cb48fa0db 100644 --- a/python/dgl/graphbolt/dataloader.py +++ b/python/dgl/graphbolt/dataloader.py @@ -1,13 +1,10 @@ """Graph Bolt DataLoaders""" -from collections import OrderedDict - import torch import torch.utils.data as torch_data -from .base import CopyTo, get_host_to_device_uva_stream +from .base import CopyTo from .feature_fetcher import FeatureFetcher, FeatureFetcherStartMarker -from .impl.gpu_graph_cache import GPUGraphCache from .impl.neighbor_sampler import SamplePerLayer from .internal import ( @@ -22,34 +19,9 @@ __all__ = [ "DataLoader", - "construct_gpu_graph_cache", ] -def construct_gpu_graph_cache( - sample_per_layer_obj, num_gpu_cached_edges, gpu_cache_threshold -): - "Construct a GPUGraphCache given a sample_per_layer_obj and cache parameters." - graph = sample_per_layer_obj.sampler.__self__ - num_gpu_cached_edges = min(num_gpu_cached_edges, graph.total_num_edges) - dtypes = OrderedDict() - dtypes["indices"] = graph.indices.dtype - if graph.type_per_edge is not None: - dtypes["type_per_edge"] = graph.type_per_edge.dtype - if graph.edge_attributes is not None: - probs_or_mask = graph.edge_attributes.get( - sample_per_layer_obj.prob_name, None - ) - if probs_or_mask is not None: - dtypes["probs_or_mask"] = probs_or_mask.dtype - return GPUGraphCache( - num_gpu_cached_edges, - gpu_cache_threshold, - graph.csc_indptr.dtype, - list(dtypes.values()), - ) - - def _find_and_wrap_parent(datapipe_graph, target_datapipe, wrapper, **kwargs): """Find parent of target_datapipe and wrap it with .""" datapipes = find_dps( @@ -125,18 +97,6 @@ class DataLoader(torch_data.DataLoader): If True, the data loader will not shut down the worker processes after a dataset has been consumed once. This allows to maintain the workers instances alive. - overlap_graph_fetch : bool, optional - If True, the data loader will overlap the UVA graph fetching operations - with the rest of operations by using an alternative CUDA stream. This - option should be enabled if you have moved your graph to the pinned - memory for optimal performance. Default is False. 
- num_gpu_cached_edges : int, optional - If positive and overlap_graph_fetch is True, then the GPU will cache - frequently accessed vertex neighborhoods to reduce the PCI-e bandwidth - demand due to pinned graph accesses. - gpu_cache_threshold : int, optional - Determines how many times a vertex needs to be accessed before its - neighborhood ends up being cached on the GPU. max_uva_threads : int, optional Limits the number of CUDA threads used for UVA copies so that the rest of the computations can run simultaneously with it. Setting it to a too @@ -150,9 +110,6 @@ def __init__( datapipe, num_workers=0, persistent_workers=True, - overlap_graph_fetch=False, - num_gpu_cached_edges=0, - gpu_cache_threshold=1, max_uva_threads=10240, ): # Multiprocessing requires two modifications to the datapipe: @@ -200,54 +157,14 @@ def __init__( if feature_fetcher.max_num_stages > 0: # Overlap enabled. torch.ops.graphbolt.set_max_uva_threads(max_uva_threads) - if ( - overlap_graph_fetch - and num_workers == 0 - and torch.cuda.is_available() - ): - torch.ops.graphbolt.set_max_uva_threads(max_uva_threads) + if num_workers == 0 and torch.cuda.is_available(): samplers = find_dps( datapipe_graph, SamplePerLayer, ) - gpu_graph_cache = None for sampler in samplers: - if num_gpu_cached_edges > 0 and gpu_graph_cache is None: - gpu_graph_cache = construct_gpu_graph_cache( - sampler, num_gpu_cached_edges, gpu_cache_threshold - ) - if ( - sampler.sampler.__name__ == "sample_layer_neighbors" - or gpu_graph_cache is not None - ): - # This code path is not faster for sample_neighbors. - datapipe_graph = replace_dp( - datapipe_graph, - sampler, - sampler.fetch_and_sample( - gpu_graph_cache, - get_host_to_device_uva_stream(), - 1, - ), - ) - elif sampler.sampler.__name__ == "sample_neighbors": - # This code path is faster for sample_neighbors. - datapipe_graph = replace_dp( - datapipe_graph, - sampler, - sampler.datapipe.sample_per_layer( - sampler=sampler.sampler, - fanout=sampler.fanout, - replace=sampler.replace, - prob_name=sampler.prob_name, - returning_indices_is_optional=True, - ), - ) - else: - raise AssertionError( - "overlap_graph_fetch is supported only for " - "sample_neighbor and sample_layer_neighbor." - ) + if sampler.overlap_fetch: + torch.ops.graphbolt.set_max_uva_threads(max_uva_threads) # (4) Cut datapipe at CopyTo and wrap with pinning and prefetching # before it. This enables enables non_blocking copies to the device. diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index 7288c969d0a7..53cfcc76bbbb 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -10,6 +10,7 @@ from ..base import etype_str_to_tuple, etype_tuple_to_str, ORIGINAL_EDGE_ID from ..internal_utils import gb_warning, is_wsl, recursive_apply from ..sampling_graph import SamplingGraph +from .gpu_graph_cache import GPUGraphCache from .sampled_subgraph_impl import CSCFormatBase, SampledSubgraphImpl @@ -315,6 +316,14 @@ def _indptr_node_type_offset_list( """Sets the indptr node type offset list if present.""" self._indptr_node_type_offset_list_ = indptr_node_type_offset_list + @property + def _gpu_graph_cache(self) -> Optional[GPUGraphCache]: + return ( + self._gpu_graph_cache_ + if hasattr(self, "_gpu_graph_cache_") + else None + ) + @property def type_per_edge(self) -> Optional[torch.Tensor]: """Returns the edge type tensor if present. 
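Taken together, the changes in this patch move the `overlap_graph_fetch`, `num_gpu_cached_edges` and `gpu_cache_threshold` knobs off `gb.DataLoader` and onto the neighbor samplers, with the GPU graph cache now owned by the `FusedCSCSamplingGraph` itself. A minimal sketch of the resulting usage, assuming an `itemset`, a pinned `graph`, a `features` store and a `device` as in the examples above (fanouts and cache sizes are illustrative only):

    import dgl.graphbolt as gb

    datapipe = gb.ItemSampler(itemset, batch_size=1024)
    datapipe = datapipe.copy_to(device)
    datapipe = datapipe.sample_neighbor(
        graph,
        [10, 10],                        # fanout per layer
        overlap_fetch=True,              # overlap UVA graph fetch with other stages
        num_gpu_cached_edges=1_000_000,  # cache hot neighborhoods on the GPU
        gpu_cache_threshold=1,
    )
    datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"])
    dataloader = gb.DataLoader(datapipe, num_workers=0)

After this change the DataLoader itself only keeps `num_workers`, `persistent_workers` and `max_uva_threads`.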
@@ -1432,6 +1441,28 @@ def _pin(x): return self._apply_to_members(_pin) + def _initialize_gpu_graph_cache( + self, + num_gpu_cached_edges: int, + gpu_cache_threshold: int, + prob_name: Optional[str] = None, + ): + "Construct a GPUGraphCache given the cache parameters." + num_gpu_cached_edges = min(num_gpu_cached_edges, self.total_num_edges) + dtypes = [self.indices.dtype] + if self.type_per_edge is not None: + dtypes.append(self.type_per_edge.dtype) + if self.edge_attributes is not None: + probs_or_mask = self.edge_attributes.get(prob_name, None) + if probs_or_mask is not None: + dtypes.append(probs_or_mask.dtype) + self._gpu_graph_cache_ = GPUGraphCache( + num_gpu_cached_edges, + gpu_cache_threshold, + self.csc_indptr.dtype, + dtypes, + ) + def fused_csc_sampling_graph( csc_indptr: torch.Tensor, diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 8c997eb6848f..07c891ae545a 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -24,7 +24,6 @@ "NeighborSampler", "LayerNeighborSampler", "SamplePerLayer", - "SamplePerLayerFromFetchedSubgraph", "FetchInsubgraphData", "ConcatHeteroSeeds", "CombineCachedAndFetchedInSubgraph", @@ -55,9 +54,9 @@ class CombineCachedAndFetchedInSubgraph(Mapper): found inside the GPUGraphCache. """ - def __init__(self, datapipe, sample_per_layer_obj): + def __init__(self, datapipe, prob_name): super().__init__(datapipe, self._combine_per_layer) - self.prob_name = sample_per_layer_obj.prob_name + self.prob_name = prob_name def _combine_per_layer(self, minibatch): subgraph = minibatch._sliced_sampling_graph @@ -94,9 +93,9 @@ def _combine_per_layer(self, minibatch): class ConcatHeteroSeeds(Mapper): """Concatenates the seeds into a single tensor in the hetero case.""" - def __init__(self, datapipe, sample_per_layer_obj): + def __init__(self, datapipe, graph): super().__init__(datapipe, self._concat) - self.graph = sample_per_layer_obj.sampler.__self__ + self.graph = graph def _concat(self, minibatch): seeds = minibatch._seed_nodes @@ -124,20 +123,23 @@ class FetchInsubgraphData(Mapper): def __init__( self, datapipe, - sample_per_layer_obj, - gpu_graph_cache, - stream=None, + graph, + prob_name, ): - self.graph = sample_per_layer_obj.sampler.__self__ - datapipe = datapipe.concat_hetero_seeds(sample_per_layer_obj) - if gpu_graph_cache is not None: - datapipe = datapipe.fetch_cached_insubgraph_data(gpu_graph_cache) + datapipe = datapipe.concat_hetero_seeds(graph) + if graph._gpu_graph_cache is not None: + datapipe = datapipe.fetch_cached_insubgraph_data( + graph._gpu_graph_cache + ) + self.graph = graph + self.prob_name = prob_name super().__init__(datapipe, self._fetch_per_layer) - self.prob_name = sample_per_layer_obj.prob_name - self.stream = stream - def _fetch_per_layer_impl(self, minibatch, stream): - with torch.cuda.stream(self.stream): + def _fetch_per_layer(self, minibatch): + stream = torch.cuda.current_stream() + uva_stream = get_host_to_device_uva_stream() + uva_stream.wait_stream(stream) + with torch.cuda.stream(uva_stream): seeds = minibatch._seeds seed_offsets = minibatch._seed_offsets delattr(minibatch, "_seeds") @@ -146,7 +148,7 @@ def _fetch_per_layer_impl(self, minibatch, stream): seeds.record_stream(torch.cuda.current_stream()) def record_stream(tensor): - if stream is not None and tensor.is_cuda: + if tensor.is_cuda: tensor.record_stream(stream) return tensor @@ -210,49 +212,10 @@ def record_stream(tensor): 
subgraph._indptr_node_type_offset_list = seed_offsets minibatch._sliced_sampling_graph = subgraph - if self.stream is not None: - minibatch.wait = torch.cuda.current_stream().record_event().wait + minibatch.wait = torch.cuda.current_stream().record_event().wait return minibatch - def _fetch_per_layer(self, minibatch): - current_stream = None - if self.stream is not None: - current_stream = torch.cuda.current_stream() - self.stream.wait_stream(current_stream) - return self._fetch_per_layer_impl(minibatch, current_stream) - - -@functional_datapipe("sample_per_layer_from_fetched_subgraph") -class SamplePerLayerFromFetchedSubgraph(MiniBatchTransformer): - """Sample neighbor edges from a graph for a single layer.""" - - def __init__(self, datapipe, sample_per_layer_obj): - super().__init__(datapipe, self._sample_per_layer_from_fetched_subgraph) - self.sampler_name = sample_per_layer_obj.sampler.__name__ - self.fanout = sample_per_layer_obj.fanout - self.replace = sample_per_layer_obj.replace - self.prob_name = sample_per_layer_obj.prob_name - - def _sample_per_layer_from_fetched_subgraph(self, minibatch): - subgraph = minibatch._sliced_sampling_graph - delattr(minibatch, "_sliced_sampling_graph") - kwargs = { - key[1:]: getattr(minibatch, key) - for key in ["_random_seed", "_seed2_contribution"] - if hasattr(minibatch, key) - } - sampled_subgraph = getattr(subgraph, self.sampler_name)( - None, - self.fanout, - self.replace, - self.prob_name, - **kwargs, - ) - minibatch.sampled_subgraphs.insert(0, sampled_subgraph) - - return minibatch - @functional_datapipe("sample_per_layer") class SamplePerLayer(MiniBatchTransformer): @@ -265,10 +228,16 @@ def __init__( fanout, replace, prob_name, - returning_indices_is_optional=False, + overlap_fetch, ): graph = sampler.__self__ - if returning_indices_is_optional and graph.indices.is_pinned(): + self.returning_indices_is_optional = False + if ( + overlap_fetch + and sampler.__name__ == "sample_neighbors" + and graph.indices.is_pinned() + and graph._gpu_graph_cache is None + ): datapipe = datapipe.transform(self._sample_per_layer) datapipe = ( datapipe.transform(partial(self._fetch_indices, graph.indices)) @@ -285,13 +254,24 @@ def __init__( ) ) super().__init__(datapipe) + self.returning_indices_is_optional = True + elif overlap_fetch: + datapipe = datapipe.fetch_insubgraph_data(graph, prob_name) + datapipe = datapipe.buffer().wait() + if graph._gpu_graph_cache is not None: + datapipe = datapipe.combine_cached_and_fetched_insubgraph( + prob_name + ) + super().__init__( + datapipe, self._sample_per_layer_from_fetched_subgraph + ) else: super().__init__(datapipe, self._sample_per_layer) self.sampler = sampler self.fanout = fanout self.replace = replace self.prob_name = prob_name - self.returning_indices_is_optional = returning_indices_is_optional + self.overlap_fetch = overlap_fetch def _sample_per_layer(self, minibatch): kwargs = { @@ -310,6 +290,25 @@ def _sample_per_layer(self, minibatch): minibatch.sampled_subgraphs.insert(0, subgraph) return minibatch + def _sample_per_layer_from_fetched_subgraph(self, minibatch): + subgraph = minibatch._sliced_sampling_graph + delattr(minibatch, "_sliced_sampling_graph") + kwargs = { + key[1:]: getattr(minibatch, key) + for key in ["_random_seed", "_seed2_contribution"] + if hasattr(minibatch, key) + } + sampled_subgraph = getattr(subgraph, self.sampler.__name__)( + None, + self.fanout, + self.replace, + self.prob_name, + **kwargs, + ) + minibatch.sampled_subgraphs.insert(0, sampled_subgraph) + + return minibatch + 
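Since `overlap_fetch` is now resolved inside `SamplePerLayer`, the per-layer pipeline can also be assembled directly, which is how the updated unit test later in this patch exercises it. A rough sketch of such a fragment, assuming a homogeneous `graph` kept in pinned memory and an upstream `datapipe` whose minibatches already carry `_seed_nodes` (argument values are illustrative):

    datapipe = datapipe.sample_per_layer(
        graph.sample_neighbors,      # or graph.sample_layer_neighbors
        torch.LongTensor([10]),      # fanout for this layer
        False,                       # replace
        None,                        # prob_name
        True,                        # overlap_fetch
    )
    datapipe = datapipe.compact_per_layer(True)  # deduplicate seeds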
@staticmethod def _fetch_indices(indices, minibatch): stream = torch.cuda.current_stream() @@ -398,27 +397,6 @@ def _compact_per_layer(self, minibatch): return minibatch -@functional_datapipe("fetch_and_sample") -class FetcherAndSampler(MiniBatchTransformer): - """Overlapped graph sampling operation replacement.""" - - def __init__( - self, - sampler, - gpu_graph_cache, - stream, - buffer_size, - ): - datapipe = sampler.datapipe.fetch_insubgraph_data( - sampler, gpu_graph_cache, stream - ) - datapipe = datapipe.buffer(buffer_size).wait() - if gpu_graph_cache is not None: - datapipe = datapipe.combine_cached_and_fetched_insubgraph(sampler) - datapipe = datapipe.sample_per_layer_from_fetched_subgraph(sampler) - super().__init__(datapipe) - - class NeighborSamplerImpl(SubgraphSampler): # pylint: disable=abstract-method """Base class for NeighborSamplers.""" @@ -433,9 +411,17 @@ def __init__( prob_name, deduplicate, sampler, + overlap_fetch, + num_gpu_cached_edges, + gpu_cache_threshold, layer_dependency=None, batch_dependency=None, ): + if overlap_fetch and num_gpu_cached_edges > 0: + if graph._gpu_graph_cache is None: + graph._initialize_gpu_graph_cache( + num_gpu_cached_edges, gpu_cache_threshold, prob_name + ) if sampler.__name__ == "sample_layer_neighbors": self._init_seed(batch_dependency) super().__init__( @@ -446,6 +432,7 @@ def __init__( prob_name, deduplicate, sampler, + overlap_fetch, layer_dependency, ) @@ -520,6 +507,7 @@ def sampling_stages( prob_name, deduplicate, sampler, + overlap_fetch, layer_dependency, ): datapipe = datapipe.transform( @@ -533,7 +521,7 @@ def sampling_stages( if not isinstance(fanout, torch.Tensor): fanout = torch.LongTensor([int(fanout)]) datapipe = datapipe.sample_per_layer( - sampler, fanout, replace, prob_name + sampler, fanout, replace, prob_name, overlap_fetch ) datapipe = datapipe.compact_per_layer(deduplicate) if is_labor and not layer_dependency: @@ -589,6 +577,18 @@ class NeighborSampler(NeighborSamplerImpl): Boolean indicating whether seeds between hops will be deduplicated. If True, the same elements in seeds will be deleted to only one. Otherwise, the same elements will be remained. + overlap_fetch : bool, optional + If True, the data loader will overlap the UVA graph fetching operations + with the rest of operations by using an alternative CUDA stream. This + option should be enabled if you have moved your graph to the pinned + memory for optimal performance. Default is False. + num_gpu_cached_edges : int, optional + If positive and overlap_graph_fetch is True, then the GPU will cache + frequently accessed vertex neighborhoods to reduce the PCI-e bandwidth + demand due to pinned graph accesses. + gpu_cache_threshold : int, optional + Determines how many times a vertex needs to be accessed before its + neighborhood ends up being cached on the GPU. Examples ------- @@ -638,6 +638,9 @@ def __init__( replace=False, prob_name=None, deduplicate=True, + overlap_fetch=False, + num_gpu_cached_edges=0, + gpu_cache_threshold=1, ): super().__init__( datapipe, @@ -647,6 +650,9 @@ def __init__( prob_name, deduplicate, graph.sample_neighbors, + overlap_fetch, + num_gpu_cached_edges, + gpu_cache_threshold, ) @@ -718,6 +724,18 @@ class LayerNeighborSampler(NeighborSamplerImpl): the random variates proportional to :math:`\\frac{1}{\\kappa}`. Implements the dependent minibatching approach in `arXiv:2310.12403 `__. 
+ overlap_fetch : bool, optional + If True, the data loader will overlap the UVA graph fetching operations + with the rest of operations by using an alternative CUDA stream. This + option should be enabled if you have moved your graph to the pinned + memory for optimal performance. Default is False. + num_gpu_cached_edges : int, optional + If positive and overlap_graph_fetch is True, then the GPU will cache + frequently accessed vertex neighborhoods to reduce the PCI-e bandwidth + demand due to pinned graph accesses. + gpu_cache_threshold : int, optional + Determines how many times a vertex needs to be accessed before its + neighborhood ends up being cached on the GPU. Examples ------- @@ -776,6 +794,9 @@ def __init__( deduplicate=True, layer_dependency=False, batch_dependency=1, + overlap_fetch=False, + num_gpu_cached_edges=0, + gpu_cache_threshold=1, ): super().__init__( datapipe, @@ -785,6 +806,9 @@ def __init__( prob_name, deduplicate, graph.sample_layer_neighbors, + overlap_fetch, + num_gpu_cached_edges, + gpu_cache_threshold, layer_dependency, batch_dependency, ) diff --git a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py index 6c5818bbf601..3f827923e2f5 100644 --- a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py +++ b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py @@ -7,7 +7,6 @@ import dgl.graphbolt as gb import pytest import torch -from dgl.graphbolt.dataloader import construct_gpu_graph_cache def get_hetero_graph(): @@ -72,21 +71,11 @@ def test_NeighborSampler_GraphFetch( compact_per_layer = sample_per_layer.compact_per_layer(True) gb.seed(123) expected_results = list(compact_per_layer) - gpu_graph_cache = None if num_cached_edges > 0: - gpu_graph_cache = construct_gpu_graph_cache( - sample_per_layer, num_cached_edges, 1 - ) - datapipe = gb.FetchInsubgraphData( - datapipe, - sample_per_layer, - gpu_graph_cache, + graph._initialize_gpu_graph_cache(num_cached_edges, 1, prob_name) + datapipe = datapipe.sample_per_layer( + graph.sample_neighbors, fanout, False, prob_name, True ) - if num_cached_edges > 0: - datapipe = gb.CombineCachedAndFetchedInSubgraph( - datapipe, sample_per_layer - ) - datapipe = gb.SamplePerLayerFromFetchedSubgraph(datapipe, sample_per_layer) datapipe = datapipe.compact_per_layer(True) gb.seed(123) new_results = list(datapipe) @@ -99,10 +88,10 @@ def remove_input_nodes(minibatch): return minibatch datapipe = item_sampler.sample_neighbor( - graph, [fanout], False, prob_name=prob_name + graph, [fanout], False, prob_name=prob_name, overlap_fetch=True ) datapipe = datapipe.transform(remove_input_nodes) - dataloader = gb.DataLoader(datapipe, overlap_graph_fetch=True) + dataloader = gb.DataLoader(datapipe) gb.seed(123) new_results = list(dataloader) assert len(expected_results) == len(new_results) @@ -113,6 +102,8 @@ def remove_input_nodes(minibatch): @pytest.mark.parametrize("layer_dependency", [False, True]) @pytest.mark.parametrize("overlap_graph_fetch", [False, True]) def test_labor_dependent_minibatching(layer_dependency, overlap_graph_fetch): + if F._default_context_str != "gpu" and overlap_graph_fetch: + pytest.skip("overlap_graph_fetch is only available for GPU.") num_edges = 200 csc_indptr = torch.cat( ( @@ -133,12 +124,11 @@ def test_labor_dependent_minibatching(layer_dependency, overlap_graph_fetch): datapipe = datapipe.sample_layer_neighbor( graph, fanouts, + overlap_fetch=overlap_graph_fetch, layer_dependency=layer_dependency, 
batch_dependency=batch_dependency, ) - dataloader = gb.DataLoader( - datapipe, overlap_graph_fetch=overlap_graph_fetch - ) + dataloader = gb.DataLoader(datapipe) res = list(dataloader) assert len(res) == batch_dependency + 1 if layer_dependency: diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index ba22bfdda293..93045c0113e9 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -104,10 +104,18 @@ def test_gpu_sampling_DataLoader( for i in range(2): datapipe = dgl.graphbolt.ItemSampler(itemset, batch_size=B) datapipe = datapipe.copy_to(F.ctx()) + kwargs = { + "overlap_fetch": overlap_graph_fetch, + "num_gpu_cached_edges": num_gpu_cached_edges, + "gpu_cache_threshold": gpu_cache_threshold, + } + if i != 0: + kwargs = {} datapipe = getattr(dgl.graphbolt, sampler_name)( datapipe, graph, fanouts=[torch.LongTensor([2]) for _ in range(num_layers)], + **kwargs ) if enable_feature_fetch: datapipe = dgl.graphbolt.FeatureFetcher( @@ -117,17 +125,7 @@ def test_gpu_sampling_DataLoader( ["d"], overlap_fetch=overlap_feature_fetch and i == 0, ) - if i == 0: - dataloaders.append( - dgl.graphbolt.DataLoader( - datapipe, - overlap_graph_fetch=overlap_graph_fetch, - num_gpu_cached_edges=num_gpu_cached_edges, - gpu_cache_threshold=gpu_cache_threshold, - ) - ) - else: - dataloaders.append(dgl.graphbolt.DataLoader(datapipe)) + dataloaders.append(dgl.graphbolt.DataLoader(datapipe)) dataloader, dataloader2 = dataloaders bufferer_cnt = int(enable_feature_fetch and overlap_feature_fetch) From bdbba5fafa9673dc00a2d5d433faebb6f3d5ebb6 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 12 Aug 2024 17:38:52 -0400 Subject: [PATCH 11/78] [GraphBolt][io_uring] Remove redundant mechanism. (#7686) --- graphbolt/src/cnumpy.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h index 3f7bd4a99401..76984b7e7253 100644 --- a/graphbolt/src/cnumpy.h +++ b/graphbolt/src/cnumpy.h @@ -161,10 +161,11 @@ class OnDiskNpyArray : public torch::CustomClassHolder { } // If this is the first thread exiting, release the master thread's // ticket as well by releasing 2 slots. Otherwise, release 1 slot. - const auto releasing = acquirer_->exiting_first_.test_and_set() ? 1 : 2; + const auto releasing = + acquirer_->exiting_first_.test_and_set(std::memory_order_relaxed) + ? 1 + : 2; semaphore_.release(releasing); - acquirer_->num_acquisitions_.fetch_add( - -releasing, std::memory_order_relaxed); } ::io_uring& get() const { return io_uring_queue_[thread_id_]; } @@ -179,17 +180,16 @@ class OnDiskNpyArray : public torch::CustomClassHolder { } ~QueueAndBufferAcquirer() { - // If any of the worker threads exit early without being able to release - // the semaphore, we make sure to release it for them in the main thread. - const auto releasing = num_acquisitions_.load(std::memory_order_relaxed); + // If none of the worker threads acquire the semaphore, we make sure to + // release the ticket taken in the constructor. + const auto releasing = + exiting_first_.test_and_set(std::memory_order_relaxed) ? 0 : 1; semaphore_.release(releasing); - TORCH_CHECK(releasing == 0, "An io_uring worker thread didn't not exit."); } std::pair get() { // We consume a slot from the semaphore to use a queue. 
semaphore_.acquire(); - num_acquisitions_.fetch_add(1, std::memory_order_relaxed); const auto thread_id = [&] { std::lock_guard lock(available_queues_mtx_); TORCH_CHECK(!available_queues_.empty()); @@ -205,7 +205,6 @@ class OnDiskNpyArray : public torch::CustomClassHolder { private: const OnDiskNpyArray* array_; std::atomic_flag exiting_first_ = ATOMIC_FLAG_INIT; - std::atomic num_acquisitions_ = 1; }; #endif // HAVE_LIBRARY_LIBURING From ff088ac9991ec81e432a8629d9553cdead59aa8e Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 13 Aug 2024 07:48:04 -0400 Subject: [PATCH 12/78] [GraphBolt][io_uring] Call semaphore constructor. (#7689) --- graphbolt/src/cnumpy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h index 76984b7e7253..c33958312584 100644 --- a/graphbolt/src/cnumpy.h +++ b/graphbolt/src/cnumpy.h @@ -141,8 +141,8 @@ class OnDiskNpyArray : public torch::CustomClassHolder { static inline int num_queues_; // Number of queues. static inline std::unique_ptr<::io_uring[], io_uring_queue_destroyer> io_uring_queue_; // io_uring queue. - static inline counting_semaphore_t - semaphore_; // Control access to the io_uring queues. + static inline counting_semaphore_t semaphore_{ + 0}; // Control access to the io_uring queues. static inline std::mutex available_queues_mtx_; // available_queues_ mutex. static inline std::vector available_queues_; From d650422402aa770fbf7ec05e7230405c45a3bdfa Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 13 Aug 2024 11:39:44 -0400 Subject: [PATCH 13/78] [GraphBolt][CUDA] Use same CUDA stream in async op. (#7693) --- graphbolt/include/graphbolt/async.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/graphbolt/include/graphbolt/async.h b/graphbolt/include/graphbolt/async.h index d0277ffc426c..3e9480fd2516 100644 --- a/graphbolt/include/graphbolt/async.h +++ b/graphbolt/include/graphbolt/async.h @@ -23,16 +23,22 @@ #include #include -#include -#include #include #include #include -#include #ifdef BUILD_WITH_TASKFLOW #include #include +#else +#include +#include +#include +#endif + +#ifdef GRAPHBOLT_USE_CUDA +#include +#include #endif namespace graphbolt { @@ -104,12 +110,23 @@ class Future : public torch::CustomClassHolder { template inline auto async(F&& function) { using T = decltype(function()); +#ifdef GRAPHBOLT_USE_CUDA + auto stream = c10::cuda::getCurrentCUDAStream(); +#endif + auto fn = [=, func = std::move(function)] { +#ifdef GRAPHBOLT_USE_CUDA + // We make sure to use the same CUDA stream as the thread launching the + // async operation. + c10::cuda::CUDAStreamGuard guard(stream); +#endif + return func(); + }; #ifdef BUILD_WITH_TASKFLOW - auto future = interop_pool().async(std::move(function)); + auto future = interop_pool().async(std::move(fn)); #else auto promise = std::make_shared>(); auto future = promise->get_future(); - at::launch([promise, func = std::move(function)]() { + at::launch([promise, func = std::move(fn)]() { if constexpr (std::is_void_v) { func(); promise->set_value(); From b5026267155b39203abce1e5528af90bad4301a7 Mon Sep 17 00:00:00 2001 From: pyynb <52124938+pyynb@users.noreply.github.com> Date: Wed, 14 Aug 2024 09:33:56 +0800 Subject: [PATCH 14/78] [docs] Fix dgl doc display problem. 
(#7691) --- docs/source/conf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 440e40af5651..74151939cb33 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -81,7 +81,10 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] +exclude_patterns = [ + "tutorials/**/*.ipynb", + "tutorials/**/*.py", +] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None From 907a70a0a1b434b254c81880e4f19cc5cef669ee Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 13 Aug 2024 22:10:14 -0400 Subject: [PATCH 15/78] [GraphBolt] Remove unused output from `InSubgraph`. (#7694) --- graphbolt/src/fused_csc_sampling_graph.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/graphbolt/src/fused_csc_sampling_graph.cc b/graphbolt/src/fused_csc_sampling_graph.cc index c5bfd6fd0de8..fe1f64d22668 100644 --- a/graphbolt/src/fused_csc_sampling_graph.cc +++ b/graphbolt/src/fused_csc_sampling_graph.cc @@ -307,8 +307,9 @@ c10::intrusive_ptr FusedCSCSamplingGraph::InSubgraph( } return c10::make_intrusive( - output_indptr, results.at(0), results.back(), nodes, - torch::arange(0, NumNodes()), type_per_edge); + // original_row_node_ids is not computed here and is unused. + output_indptr, results.at(0), results.back(), nodes, torch::nullopt, + type_per_edge); } /** From 4c9f544ca3eebdc82eff5145c58ab88a63f27e2d Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 14 Aug 2024 16:55:40 -0400 Subject: [PATCH 16/78] [GraphBolt][io_uring] Fix io_uring queue distribution bug. (#7698) --- graphbolt/src/cnumpy.cc | 5 ++--- graphbolt/src/cnumpy.h | 26 ++++++++++---------------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/graphbolt/src/cnumpy.cc b/graphbolt/src/cnumpy.cc index 98e10ad8e71c..26db0ff1a840 100644 --- a/graphbolt/src/cnumpy.cc +++ b/graphbolt/src/cnumpy.cc @@ -67,9 +67,8 @@ OnDiskNpyArray::OnDiskNpyArray( TORCH_CHECK(num_queues_ > 0, "A positive # queues is required."); io_uring_queue_ = std::unique_ptr<::io_uring[], io_uring_queue_destroyer>( new ::io_uring[num_queues_], io_uring_queue_destroyer{num_queues_}); - TORCH_CHECK(num_queues_ + 1 <= counting_semaphore_t::max()); - // The +1 is for the thread that calls parallel_for. - semaphore_.release(num_queues_ + 1); + TORCH_CHECK(num_queues_ <= counting_semaphore_t::max()); + semaphore_.release(num_queues_); available_queues_.reserve(num_queues_); // Init io_uring queue. for (int64_t t = 0; t < num_queues_; t++) { diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h index c33958312584..f853ab70d9ae 100644 --- a/graphbolt/src/cnumpy.h +++ b/graphbolt/src/cnumpy.h @@ -148,8 +148,7 @@ class OnDiskNpyArray : public torch::CustomClassHolder { struct QueueAndBufferAcquirer { struct UniqueQueue { - UniqueQueue(QueueAndBufferAcquirer* acquirer, int thread_id) - : acquirer_(acquirer), thread_id_(thread_id) {} + UniqueQueue(int thread_id) : thread_id_(thread_id) {} UniqueQueue(const UniqueQueue&) = delete; UniqueQueue& operator=(const UniqueQueue&) = delete; @@ -159,19 +158,12 @@ class OnDiskNpyArray : public torch::CustomClassHolder { std::lock_guard lock(available_queues_mtx_); available_queues_.push_back(thread_id_); } - // If this is the first thread exiting, release the master thread's - // ticket as well by releasing 2 slots. 
Otherwise, release 1 slot. - const auto releasing = - acquirer_->exiting_first_.test_and_set(std::memory_order_relaxed) - ? 1 - : 2; - semaphore_.release(releasing); + semaphore_.release(); } ::io_uring& get() const { return io_uring_queue_[thread_id_]; } private: - QueueAndBufferAcquirer* acquirer_; int thread_id_; }; @@ -182,14 +174,16 @@ class OnDiskNpyArray : public torch::CustomClassHolder { ~QueueAndBufferAcquirer() { // If none of the worker threads acquire the semaphore, we make sure to // release the ticket taken in the constructor. - const auto releasing = - exiting_first_.test_and_set(std::memory_order_relaxed) ? 0 : 1; - semaphore_.release(releasing); + if (!entering_first_.test_and_set(std::memory_order_relaxed)) { + semaphore_.release(); + } } std::pair get() { // We consume a slot from the semaphore to use a queue. - semaphore_.acquire(); + if (entering_first_.test_and_set(std::memory_order_relaxed)) { + semaphore_.acquire(); + } const auto thread_id = [&] { std::lock_guard lock(available_queues_mtx_); TORCH_CHECK(!available_queues_.empty()); @@ -198,13 +192,13 @@ class OnDiskNpyArray : public torch::CustomClassHolder { return thread_id; }(); return { - std::piecewise_construct, std::make_tuple(this, thread_id), + std::piecewise_construct, std::make_tuple(thread_id), std::make_tuple(array_->ReadBuffer(thread_id))}; } private: const OnDiskNpyArray* array_; - std::atomic_flag exiting_first_ = ATOMIC_FLAG_INIT; + std::atomic_flag entering_first_ = ATOMIC_FLAG_INIT; }; #endif // HAVE_LIBRARY_LIBURING From 60d0b661821f7cd974d5c9016962e086a78154e7 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 14 Aug 2024 17:51:36 -0400 Subject: [PATCH 17/78] [GraphBolt][CUDA] Async sample neighbors and compaction. (#7682) --- .../graphbolt/fused_csc_sampling_graph.h | 11 +++ .../include/graphbolt/unique_and_compact.h | 8 ++ graphbolt/src/fused_csc_sampling_graph.cc | 17 ++++ graphbolt/src/python_binding.cc | 14 ++++ graphbolt/src/unique_and_compact.cc | 13 +++- .../impl/fused_csc_sampling_graph.py | 69 ++++++++++++++-- python/dgl/graphbolt/impl/neighbor_sampler.py | 78 +++++++++++++++++-- python/dgl/graphbolt/internal/sample_utils.py | 72 +++++++++++------ .../pytorch/graphbolt/test_dataloader.py | 5 ++ 9 files changed, 247 insertions(+), 40 deletions(-) diff --git a/graphbolt/include/graphbolt/fused_csc_sampling_graph.h b/graphbolt/include/graphbolt/fused_csc_sampling_graph.h index 6b6b076d5934..6121fe507f60 100644 --- a/graphbolt/include/graphbolt/fused_csc_sampling_graph.h +++ b/graphbolt/include/graphbolt/fused_csc_sampling_graph.h @@ -6,6 +6,7 @@ #ifndef GRAPHBOLT_CSC_SAMPLING_GRAPH_H_ #define GRAPHBOLT_CSC_SAMPLING_GRAPH_H_ +#include #include #include #include @@ -362,6 +363,16 @@ class FusedCSCSamplingGraph : public torch::CustomClassHolder { torch::optional random_seed, double seed2_contribution) const; + c10::intrusive_ptr>> + SampleNeighborsAsync( + torch::optional seeds, + torch::optional> seed_offsets, + const std::vector& fanouts, bool replace, bool layer, + bool returning_indices_is_optional, + torch::optional probs_or_mask, + torch::optional random_seed, + double seed2_contribution) const; + /** * @brief Sample neighboring edges of the given nodes with a temporal * constraint. 
If `node_timestamp_attr_name` or `edge_timestamp_attr_name` is diff --git a/graphbolt/include/graphbolt/unique_and_compact.h b/graphbolt/include/graphbolt/unique_and_compact.h index fb3d41d67303..bf3679688c75 100644 --- a/graphbolt/include/graphbolt/unique_and_compact.h +++ b/graphbolt/include/graphbolt/unique_and_compact.h @@ -7,6 +7,7 @@ #ifndef GRAPHBOLT_UNIQUE_AND_COMPACT_H_ #define GRAPHBOLT_UNIQUE_AND_COMPACT_H_ +#include #include namespace graphbolt { @@ -56,6 +57,13 @@ UniqueAndCompactBatched( const std::vector& dst_ids, const std::vector unique_dst_ids); +c10::intrusive_ptr>>> +UniqueAndCompactBatchedAsync( + const std::vector& src_ids, + const std::vector& dst_ids, + const std::vector unique_dst_ids); + } // namespace sampling } // namespace graphbolt diff --git a/graphbolt/src/fused_csc_sampling_graph.cc b/graphbolt/src/fused_csc_sampling_graph.cc index fe1f64d22668..a2a4778422f7 100644 --- a/graphbolt/src/fused_csc_sampling_graph.cc +++ b/graphbolt/src/fused_csc_sampling_graph.cc @@ -871,6 +871,23 @@ c10::intrusive_ptr FusedCSCSamplingGraph::SampleNeighbors( } } +c10::intrusive_ptr>> +FusedCSCSamplingGraph::SampleNeighborsAsync( + torch::optional seeds, + torch::optional> seed_offsets, + const std::vector& fanouts, bool replace, bool layer, + bool returning_indices_is_optional, + torch::optional probs_or_mask, + torch::optional random_seed, + double seed2_contribution) const { + return async([=] { + return this->SampleNeighbors( + seeds, seed_offsets, fanouts, replace, layer, + returning_indices_is_optional, probs_or_mask, random_seed, + seed2_contribution); + }); +} + c10::intrusive_ptr FusedCSCSamplingGraph::TemporalSampleNeighbors( const torch::optional& seeds, diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index e447c60f9617..aa7101b70df2 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -48,6 +48,16 @@ TORCH_LIBRARY(graphbolt, m) { .def("wait", &Future::Wait); m.class_>>("TensorListFuture") .def("wait", &Future>::Wait); + m.class_>>( + "FusedSampledSubgraphFuture") + .def("wait", &Future>::Wait); + m.class_>>>( + "UniqueAndCompactBatchedFuture") + .def( + "wait", + &Future>>::Wait); m.class_("OnDiskNpyArray") .def("index_select", &storage::OnDiskNpyArray::IndexSelect); m.class_("FusedCSCSamplingGraph") @@ -75,6 +85,9 @@ TORCH_LIBRARY(graphbolt, m) { .def("add_edge_attribute", &FusedCSCSamplingGraph::AddEdgeAttribute) .def("in_subgraph", &FusedCSCSamplingGraph::InSubgraph) .def("sample_neighbors", &FusedCSCSamplingGraph::SampleNeighbors) + .def( + "sample_neighbors_async", + &FusedCSCSamplingGraph::SampleNeighborsAsync) .def( "temporal_sample_neighbors", &FusedCSCSamplingGraph::TemporalSampleNeighbors) @@ -150,6 +163,7 @@ TORCH_LIBRARY(graphbolt, m) { "load_from_shared_memory", &FusedCSCSamplingGraph::LoadFromSharedMemory); m.def("unique_and_compact", &UniqueAndCompact); m.def("unique_and_compact_batched", &UniqueAndCompactBatched); + m.def("unique_and_compact_batched_async", &UniqueAndCompactBatchedAsync); m.def("isin", &IsIn); m.def("index_select", &ops::IndexSelect); m.def("index_select_async", &ops::IndexSelectAsync); diff --git a/graphbolt/src/unique_and_compact.cc b/graphbolt/src/unique_and_compact.cc index 4b0f1e266f5a..7dd4a007b4b4 100644 --- a/graphbolt/src/unique_and_compact.cc +++ b/graphbolt/src/unique_and_compact.cc @@ -8,8 +8,6 @@ #include #include -#include - #include "./concurrent_id_hash_map.h" #include "./macro.h" #include "./utils.h" @@ -67,5 +65,16 @@ UniqueAndCompactBatched( return 
results; } +c10::intrusive_ptr>>> +UniqueAndCompactBatchedAsync( + const std::vector& src_ids, + const std::vector& dst_ids, + const std::vector unique_dst_ids) { + return async([=] { + return UniqueAndCompactBatched(src_ids, dst_ids, unique_dst_ids); + }); +} + } // namespace sampling } // namespace graphbolt diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index 53cfcc76bbbb..79c28853d5d0 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -22,6 +22,22 @@ ] +class _SampleNeighborsWaiter: + def __init__(self, fn, future, seed_offsets): + self.fn = fn + self.future = future + self.seed_offsets = seed_offsets + + def wait(self): + """Returns the stored value when invoked.""" + fn = self.fn + C_sampled_subgraph = self.future.wait() + seed_offsets = self.seed_offsets + # Ensure there is no memory leak. + self.fn = self.future = self.seed_offsets = None + return fn(C_sampled_subgraph, seed_offsets) + + class FusedCSCSamplingGraph(SamplingGraph): r"""A sampling graph in CSC format.""" @@ -705,6 +721,7 @@ def sample_neighbors( replace: bool = False, probs_name: Optional[str] = None, returning_indices_is_optional: bool = False, + async_op: bool = False, ) -> SampledSubgraphImpl: """Sample neighboring edges of the given nodes and return the induced subgraph. @@ -748,6 +765,9 @@ def sample_neighbors( Boolean indicating whether it is okay for the call to this function to leave the indices tensor uninitialized. In this case, it is the user's responsibility to gather it using the edge ids. + async_op: bool + Boolean indicating whether the call is asynchronous. If so, the + result can be obtained by calling wait on the returned future. Returns ------- @@ -792,10 +812,18 @@ def sample_neighbors( replace=replace, probs_or_mask=probs_or_mask, returning_indices_is_optional=returning_indices_is_optional, + async_op=async_op, ) - return self._convert_to_sampled_subgraph( - C_sampled_subgraph, seed_offsets - ) + if async_op: + return _SampleNeighborsWaiter( + self._convert_to_sampled_subgraph, + C_sampled_subgraph, + seed_offsets, + ) + else: + return self._convert_to_sampled_subgraph( + C_sampled_subgraph, seed_offsets + ) def _check_sampler_arguments(self, nodes, fanouts, probs_or_mask): if nodes is not None: @@ -844,6 +872,7 @@ def _sample_neighbors( replace: bool = False, probs_or_mask: Optional[torch.Tensor] = None, returning_indices_is_optional: bool = False, + async_op: bool = False, ) -> torch.ScriptObject: """Sample neighboring edges of the given nodes and return the induced subgraph. @@ -886,6 +915,9 @@ def _sample_neighbors( Boolean indicating whether it is okay for the call to this function to leave the indices tensor uninitialized. In this case, it is the user's responsibility to gather it using the edge ids. + async_op: bool + Boolean indicating whether the call is asynchronous. If so, the + result can be obtained by calling wait on the returned future. Returns ------- @@ -894,7 +926,12 @@ def _sample_neighbors( """ # Ensure nodes is 1-D tensor. 
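As the docstring above notes, `async_op=True` turns the sampling call into a future: the graph returns a `_SampleNeighborsWaiter` that holds the C++ future and only runs `_convert_to_sampled_subgraph` once `wait()` is invoked. A rough illustration of the intended call pattern, with `seeds` a hypothetical tensor of seed nodes on a homogeneous graph:

    future = graph.sample_neighbors(
        seeds, torch.LongTensor([10]), async_op=True
    )
    # ... other host-side work can overlap with the C++ sampling here ...
    sampled_subgraph = future.wait()  # same SampledSubgraphImpl as the sync path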
self._check_sampler_arguments(seeds, fanouts, probs_or_mask) - return self._c_csc_graph.sample_neighbors( + sampling_fn = ( + self._c_csc_graph.sample_neighbors_async + if async_op + else self._c_csc_graph.sample_neighbors + ) + return sampling_fn( seeds, seed_offsets, fanouts.tolist(), @@ -915,6 +952,7 @@ def sample_layer_neighbors( returning_indices_is_optional: bool = False, random_seed: torch.Tensor = None, seed2_contribution: float = 0.0, + async_op: bool = False, ) -> SampledSubgraphImpl: """Sample neighboring edges of the given nodes and return the induced subgraph via layer-neighbor sampling from the NeurIPS 2023 paper @@ -986,6 +1024,9 @@ def sample_layer_neighbors( A float value between [0, 1) that determines the contribution of the second random seed, ``random_seed[-1]``, to generate the random variates. + async_op: bool + Boolean indicating whether the call is asynchronous. If so, the + result can be obtained by calling wait on the returned future. Returns ------- @@ -1033,7 +1074,12 @@ def sample_layer_neighbors( seed_offsets = self._indptr_node_type_offset_list probs_or_mask = self.edge_attributes[probs_name] if probs_name else None self._check_sampler_arguments(seeds, fanouts, probs_or_mask) - C_sampled_subgraph = self._c_csc_graph.sample_neighbors( + sampling_fn = ( + self._c_csc_graph.sample_neighbors_async + if async_op + else self._c_csc_graph.sample_neighbors + ) + C_sampled_subgraph = sampling_fn( seeds, seed_offsets, fanouts.tolist(), @@ -1044,9 +1090,16 @@ def sample_layer_neighbors( random_seed, seed2_contribution, ) - return self._convert_to_sampled_subgraph( - C_sampled_subgraph, seed_offsets - ) + if async_op: + return _SampleNeighborsWaiter( + self._convert_to_sampled_subgraph, + C_sampled_subgraph, + seed_offsets, + ) + else: + return self._convert_to_sampled_subgraph( + C_sampled_subgraph, seed_offsets + ) def temporal_sample_neighbors( self, diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 07c891ae545a..b11f79ad5ae3 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -229,6 +229,7 @@ def __init__( replace, prob_name, overlap_fetch, + asynchronous=False, ): graph = sampler.__self__ self.returning_indices_is_optional = False @@ -239,6 +240,9 @@ def __init__( and graph._gpu_graph_cache is None ): datapipe = datapipe.transform(self._sample_per_layer) + if asynchronous: + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._wait_subgraph_future) datapipe = ( datapipe.transform(partial(self._fetch_indices, graph.indices)) .buffer() @@ -253,7 +257,6 @@ def __init__( graph.node_type_to_id, ) ) - super().__init__(datapipe) self.returning_indices_is_optional = True elif overlap_fetch: datapipe = datapipe.fetch_insubgraph_data(graph, prob_name) @@ -262,16 +265,24 @@ def __init__( datapipe = datapipe.combine_cached_and_fetched_insubgraph( prob_name ) - super().__init__( - datapipe, self._sample_per_layer_from_fetched_subgraph + datapipe = datapipe.transform( + self._sample_per_layer_from_fetched_subgraph ) + if asynchronous: + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._wait_subgraph_future) else: - super().__init__(datapipe, self._sample_per_layer) + datapipe = datapipe.transform(self._sample_per_layer) + if asynchronous: + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._wait_subgraph_future) + super().__init__(datapipe) self.sampler = sampler self.fanout = fanout self.replace = replace 
self.prob_name = prob_name self.overlap_fetch = overlap_fetch + self.asynchronous = asynchronous def _sample_per_layer(self, minibatch): kwargs = { @@ -285,6 +296,7 @@ def _sample_per_layer(self, minibatch): self.replace, self.prob_name, self.returning_indices_is_optional, + async_op=self.asynchronous, **kwargs, ) minibatch.sampled_subgraphs.insert(0, subgraph) @@ -303,10 +315,15 @@ def _sample_per_layer_from_fetched_subgraph(self, minibatch): self.fanout, self.replace, self.prob_name, + async_op=self.asynchronous, **kwargs, ) minibatch.sampled_subgraphs.insert(0, sampled_subgraph) + return minibatch + @staticmethod + def _wait_subgraph_future(minibatch): + minibatch.sampled_subgraphs[0] = minibatch.sampled_subgraphs[0].wait() return minibatch @staticmethod @@ -363,9 +380,14 @@ def _subtract_hetero_indices_offset( class CompactPerLayer(MiniBatchTransformer): """Compact the sampled edges for a single layer.""" - def __init__(self, datapipe, deduplicate): - super().__init__(datapipe, self._compact_per_layer) + def __init__(self, datapipe, deduplicate, asynchronous=False): self.deduplicate = deduplicate + if asynchronous and deduplicate: + datapipe = datapipe.transform(self._compact_per_layer_async) + datapipe = datapipe.buffer() + super().__init__(datapipe, self._compact_per_layer_wait_future) + else: + super().__init__(datapipe, self._compact_per_layer) def _compact_per_layer(self, minibatch): subgraph = minibatch.sampled_subgraphs[0] @@ -396,6 +418,31 @@ def _compact_per_layer(self, minibatch): minibatch.sampled_subgraphs[0] = subgraph return minibatch + def _compact_per_layer_async(self, minibatch): + subgraph = minibatch.sampled_subgraphs[0] + seeds = minibatch._seed_nodes + assert self.deduplicate + minibatch._future = unique_and_compact_csc_formats( + subgraph.sampled_csc, seeds, async_op=True + ) + return minibatch + + @staticmethod + def _compact_per_layer_wait_future(minibatch): + subgraph = minibatch.sampled_subgraphs[0] + seeds = minibatch._seed_nodes + original_row_node_ids, compacted_csc_format = minibatch._future.wait() + delattr(minibatch, "_future") + subgraph = SampledSubgraphImpl( + sampled_csc=compacted_csc_format, + original_column_node_ids=seeds, + original_row_node_ids=original_row_node_ids, + original_edge_ids=subgraph.original_edge_ids, + ) + minibatch._seed_nodes = original_row_node_ids + minibatch.sampled_subgraphs[0] = subgraph + return minibatch + class NeighborSamplerImpl(SubgraphSampler): # pylint: disable=abstract-method @@ -414,6 +461,7 @@ def __init__( overlap_fetch, num_gpu_cached_edges, gpu_cache_threshold, + asynchronous, layer_dependency=None, batch_dependency=None, ): @@ -433,6 +481,7 @@ def __init__( deduplicate, sampler, overlap_fetch, + asynchronous, layer_dependency, ) @@ -508,6 +557,7 @@ def sampling_stages( deduplicate, sampler, overlap_fetch, + asynchronous, layer_dependency, ): datapipe = datapipe.transform( @@ -521,9 +571,9 @@ def sampling_stages( if not isinstance(fanout, torch.Tensor): fanout = torch.LongTensor([int(fanout)]) datapipe = datapipe.sample_per_layer( - sampler, fanout, replace, prob_name, overlap_fetch + sampler, fanout, replace, prob_name, overlap_fetch, asynchronous ) - datapipe = datapipe.compact_per_layer(deduplicate) + datapipe = datapipe.compact_per_layer(deduplicate, asynchronous) if is_labor and not layer_dependency: datapipe = datapipe.transform(self._increment_seed) if is_labor: @@ -589,6 +639,10 @@ class NeighborSampler(NeighborSamplerImpl): gpu_cache_threshold : int, optional Determines how many times a vertex needs to 
be accessed before its neighborhood ends up being cached on the GPU. + asynchronous: bool + Boolean indicating whether sampling and compaction stages should run + in background threads to hide the latency of CPU GPU synchronization. + Should be enabled only when sampling on the GPU. Examples ------- @@ -641,6 +695,7 @@ def __init__( overlap_fetch=False, num_gpu_cached_edges=0, gpu_cache_threshold=1, + asynchronous=False, ): super().__init__( datapipe, @@ -653,6 +708,7 @@ def __init__( overlap_fetch, num_gpu_cached_edges, gpu_cache_threshold, + asynchronous, ) @@ -736,6 +792,10 @@ class LayerNeighborSampler(NeighborSamplerImpl): gpu_cache_threshold : int, optional Determines how many times a vertex needs to be accessed before its neighborhood ends up being cached on the GPU. + asynchronous: bool + Boolean indicating whether sampling and compaction stages should run + in background threads to hide the latency of CPU GPU synchronization. + Should be enabled only when sampling on the GPU. Examples ------- @@ -797,6 +857,7 @@ def __init__( overlap_fetch=False, num_gpu_cached_edges=0, gpu_cache_threshold=1, + asynchronous=False, ): super().__init__( datapipe, @@ -809,6 +870,7 @@ def __init__( overlap_fetch, num_gpu_cached_edges, gpu_cache_threshold, + asynchronous, layer_dependency, batch_dependency, ) diff --git a/python/dgl/graphbolt/internal/sample_utils.py b/python/dgl/graphbolt/internal/sample_utils.py index aaeb4a3e8312..ba732ff20810 100644 --- a/python/dgl/graphbolt/internal/sample_utils.py +++ b/python/dgl/graphbolt/internal/sample_utils.py @@ -124,6 +124,7 @@ def unique_and_compact_csc_formats( torch.Tensor, Dict[str, torch.Tensor], ], + async_op: bool = False, ): """ Compact csc formats and return unique nodes (per type). @@ -144,6 +145,9 @@ def unique_and_compact_csc_formats( - If `unique_dst_nodes` is a tensor: It means the graph is homogeneous. - If `csc_formats` is a dictionary: The keys are node type and the values are corresponding nodes. And IDs inside are heterogeneous ids. + async_op: bool + Boolean indicating whether the call is asynchronous. If so, the result + can be obtained by calling wait on the returned future. Returns ------- @@ -199,8 +203,6 @@ def unique_and_compact_csc_formats( indices = {ntype: torch.cat(nodes) for ntype, nodes in indices.items()} ntypes = set(indices.keys()) - unique_nodes = {} - compacted_indices = {} dtype = list(indices.values())[0].dtype default_tensor = torch.tensor([], dtype=dtype, device=device) indice_list = [] @@ -211,30 +213,56 @@ def unique_and_compact_csc_formats( dst_list = [torch.tensor([], dtype=dtype, device=device)] * len( unique_dst_list ) - results = torch.ops.graphbolt.unique_and_compact_batched( - indice_list, dst_list, unique_dst_list + unique_fn = ( + torch.ops.graphbolt.unique_and_compact_batched_async + if async_op + else torch.ops.graphbolt.unique_and_compact_batched ) - for i, ntype in enumerate(ntypes): - unique_nodes[ntype], compacted_indices[ntype], _ = results[i] + results = unique_fn(indice_list, dst_list, unique_dst_list) + + class _Waiter: + def __init__(self, future, csc_formats): + self.future = future + self.csc_formats = csc_formats + + def wait(self): + """Returns the stored value when invoked.""" + results = self.future.wait() if async_op else self.future + csc_formats = self.csc_formats + # Ensure there is no memory leak. 
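The asynchronous compaction above is what the new `asynchronous` flag on the samplers drives: the sampling and compaction futures are buffered and waited on in later pipeline stages, hiding the CPU-GPU synchronization latency. A minimal end-to-end sketch for GPU sampling, assuming the same `gb` namespace, `itemset`, pinned `graph` and `device` as in the earlier examples:

    datapipe = gb.ItemSampler(itemset, batch_size=1024)
    datapipe = datapipe.copy_to(device)
    datapipe = datapipe.sample_layer_neighbor(
        graph,
        [10, 10],
        overlap_fetch=True,   # fetch the pinned graph on a separate CUDA stream
        asynchronous=True,    # run sampling/compaction stages in the background
    )
    dataloader = gb.DataLoader(datapipe)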
+ self.future = self.csc_formats = None + + unique_nodes = {} + compacted_indices = {} + for i, ntype in enumerate(ntypes): + unique_nodes[ntype], compacted_indices[ntype], _ = results[i] + + compacted_csc_formats = {} + # Map back with the same order. + for etype, csc_format in csc_formats.items(): + num_elem = csc_format.indices.size(0) + src_type, _, _ = etype_str_to_tuple(etype) + indice = compacted_indices[src_type][:num_elem] + indptr = csc_format.indptr + compacted_csc_formats[etype] = CSCFormatBase( + indptr=indptr, indices=indice + ) + compacted_indices[src_type] = compacted_indices[src_type][ + num_elem: + ] - compacted_csc_formats = {} - # Map back with the same order. - for etype, csc_format in csc_formats.items(): - num_elem = csc_format.indices.size(0) - src_type, _, _ = etype_str_to_tuple(etype) - indice = compacted_indices[src_type][:num_elem] - indptr = csc_format.indptr - compacted_csc_formats[etype] = CSCFormatBase( - indptr=indptr, indices=indice - ) - compacted_indices[src_type] = compacted_indices[src_type][num_elem:] + # Return singleton for a homogeneous graph. + if is_homogeneous: + compacted_csc_formats = list(compacted_csc_formats.values())[0] + unique_nodes = list(unique_nodes.values())[0] - # Return singleton for a homogeneous graph. - if is_homogeneous: - compacted_csc_formats = list(compacted_csc_formats.values())[0] - unique_nodes = list(unique_nodes.values())[0] + return unique_nodes, compacted_csc_formats - return unique_nodes, compacted_csc_formats + post_processer = _Waiter(results, csc_formats) + if async_op: + return post_processer + else: + return post_processer.wait() def _broadcast_timestamps(csc, dst_timestamps): diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index 93045c0113e9..d727fc2300fe 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -63,6 +63,7 @@ def test_DataLoader(overlap_feature_fetch): @pytest.mark.parametrize("enable_feature_fetch", [True, False]) @pytest.mark.parametrize("overlap_feature_fetch", [True, False]) @pytest.mark.parametrize("overlap_graph_fetch", [True, False]) +@pytest.mark.parametrize("asynchronous", [True, False]) @pytest.mark.parametrize("num_gpu_cached_edges", [0, 1024]) @pytest.mark.parametrize("gpu_cache_threshold", [1, 3]) def test_gpu_sampling_DataLoader( @@ -70,6 +71,7 @@ def test_gpu_sampling_DataLoader( enable_feature_fetch, overlap_feature_fetch, overlap_graph_fetch, + asynchronous, num_gpu_cached_edges, gpu_cache_threshold, ): @@ -108,6 +110,7 @@ def test_gpu_sampling_DataLoader( "overlap_fetch": overlap_graph_fetch, "num_gpu_cached_edges": num_gpu_cached_edges, "gpu_cache_threshold": gpu_cache_threshold, + "asynchronous": asynchronous, } if i != 0: kwargs = {} @@ -133,6 +136,8 @@ def test_gpu_sampling_DataLoader( if overlap_graph_fetch: bufferer_cnt += num_layers awaiter_cnt += num_layers + if asynchronous: + bufferer_cnt += 2 * num_layers datapipe = dataloader.dataset datapipe_graph = traverse_dps(datapipe) awaiters = find_dps( From 0d68130f92ed97b2f7bdd24c874c7ed51f80395f Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 14 Aug 2024 20:44:16 -0400 Subject: [PATCH 18/78] [CUDA] Remove unused gpu_cache files and update. 
(#7699) --- CMakeLists.txt | 9 - graphbolt/CMakeLists.txt | 11 +- .../src/cuda/{ => extension}/gpu_cache.cu | 2 +- .../src/cuda/{ => extension}/gpu_cache.h | 0 graphbolt/src/python_binding.cc | 2 +- .../gpu_cache/include/gpu_cache_api.hpp | 3 + .../gpu_cache/include/hash_functions.cuh | 2 + .../gpu_cache/include/nv_gpu_cache.hpp | 6 +- .../HugeCTR/gpu_cache/include/nv_util.h | 11 + .../gpu_cache/include/static_hash_table.hpp | 77 -- .../gpu_cache/include/static_table.hpp | 54 -- .../HugeCTR/gpu_cache/include/uvm_table.hpp | 175 ----- .../HugeCTR/gpu_cache/src/CMakeLists.txt | 29 - .../HugeCTR/gpu_cache/src/nv_gpu_cache.cu | 2 +- .../gpu_cache/src/static_hash_table.cu | 373 --------- .../HugeCTR/gpu_cache/src/static_table.cu | 59 -- .../HugeCTR/gpu_cache/src/uvm_table.cu | 607 --------------- .../HugeCTR/gpu_cache/test/CMakeLists.txt | 28 - .../gpu_cache/test/cache_op_sol_test.cu | 707 ------------------ 19 files changed, 29 insertions(+), 2128 deletions(-) rename graphbolt/src/cuda/{ => extension}/gpu_cache.cu (99%) rename graphbolt/src/cuda/{ => extension}/gpu_cache.h (100%) delete mode 100644 third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp delete mode 100644 third_party/HugeCTR/gpu_cache/include/static_table.hpp delete mode 100644 third_party/HugeCTR/gpu_cache/include/uvm_table.hpp delete mode 100644 third_party/HugeCTR/gpu_cache/src/CMakeLists.txt delete mode 100644 third_party/HugeCTR/gpu_cache/src/static_hash_table.cu delete mode 100644 third_party/HugeCTR/gpu_cache/src/static_table.cu delete mode 100644 third_party/HugeCTR/gpu_cache/src/uvm_table.cu delete mode 100644 third_party/HugeCTR/gpu_cache/test/CMakeLists.txt delete mode 100644 third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 563551c4098f..688f78218b5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -512,10 +512,6 @@ if(BUILD_GRAPHBOLT) string(REPLACE ";" "\\;" CUDA_ARCHITECTURES_ESCAPED "${CUDA_ARCHITECTURES}") file(TO_NATIVE_PATH ${CMAKE_CURRENT_BINARY_DIR} BINDIR) file(TO_NATIVE_PATH ${CMAKE_COMMAND} CMAKE_CMD) - if(USE_CUDA) - get_target_property(GPU_CACHE_INCLUDE_DIRS gpu_cache INCLUDE_DIRECTORIES) - endif(USE_CUDA) - string(REPLACE ";" "\\;" GPU_CACHE_INCLUDE_DIRS_ESCAPED "${GPU_CACHE_INCLUDE_DIRS}") if(MSVC) file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/graphbolt/build.bat BUILD_SCRIPT) add_custom_target( @@ -526,7 +522,6 @@ if(BUILD_GRAPHBOLT) CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR} USE_CUDA=${USE_CUDA} BINDIR=${BINDIR} - GPU_CACHE_INCLUDE_DIRS="${GPU_CACHE_INCLUDE_DIRS_ESCAPED}" CFLAGS=${CMAKE_C_FLAGS} CXXFLAGS=${CMAKE_CXX_FLAGS} CUDAARCHS="${CUDA_ARCHITECTURES_ESCAPED}" @@ -545,7 +540,6 @@ if(BUILD_GRAPHBOLT) USE_CUDA=${USE_CUDA} USE_LIBURING=${USE_LIBURING} BINDIR=${CMAKE_CURRENT_BINARY_DIR} - GPU_CACHE_INCLUDE_DIRS="${GPU_CACHE_INCLUDE_DIRS_ESCAPED}" CFLAGS=${CMAKE_C_FLAGS} CXXFLAGS=${CMAKE_CXX_FLAGS} CUDAARCHS="${CUDA_ARCHITECTURES_ESCAPED}" @@ -554,7 +548,4 @@ if(BUILD_GRAPHBOLT) DEPENDS ${BUILD_SCRIPT} WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/graphbolt) endif(MSVC) - if(USE_CUDA) - add_dependencies(graphbolt gpu_cache) - endif(USE_CUDA) endif(BUILD_GRAPHBOLT) diff --git a/graphbolt/CMakeLists.txt b/graphbolt/CMakeLists.txt index afac8b8e9ba4..60b1038a1259 100644 --- a/graphbolt/CMakeLists.txt +++ b/graphbolt/CMakeLists.txt @@ -118,6 +118,7 @@ if(USE_CUDA) file(GLOB BOLT_CUDA_EXTENSION_SRC ${BOLT_DIR}/cuda/extension/*.cu ${BOLT_DIR}/cuda/extension/*.cc + ../third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu ) # Until 
https://github.com/NVIDIA/cccl/issues/1083 is resolved, we need to # compile the cuda/extension folder with Volta+ CUDA architectures. @@ -128,16 +129,16 @@ if(USE_CUDA) set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES CUDA_STANDARD 17) set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES_FILTERED}") set_target_properties(${LIB_GRAPHBOLT_CUDA_NAME} PROPERTIES POSITION_INDEPENDENT_CODE TRUE) + # Enables libcudacxx for gpu_cache. + target_compile_definitions(${LIB_GRAPHBOLT_CUDA_NAME} PRIVATE LIBCUDACXX_VERSION) + include_directories(AFTER "../third_party/HugeCTR/gpu_cache/include") + message(STATUS "Build graphbolt extension with HugeCTR GPU embedding cache.") + message(STATUS "Use external CCCL library for a consistent API and performance for graphbolt.") include_directories(BEFORE "../third_party/cccl/thrust" "../third_party/cccl/cub" "../third_party/cuco/include") - - message(STATUS "Use HugeCTR gpu_cache for graphbolt with INCLUDE_DIRS $ENV{GPU_CACHE_INCLUDE_DIRS}.") - target_include_directories(${LIB_GRAPHBOLT_NAME} PRIVATE $ENV{GPU_CACHE_INCLUDE_DIRS}) - target_link_directories(${LIB_GRAPHBOLT_NAME} PRIVATE ${GPU_CACHE_BUILD_DIR}) - target_link_libraries(${LIB_GRAPHBOLT_NAME} gpu_cache) get_property(archs TARGET ${LIB_GRAPHBOLT_NAME} PROPERTY CUDA_ARCHITECTURES) message(STATUS "CUDA_ARCHITECTURES for graphbolt: ${archs}") diff --git a/graphbolt/src/cuda/gpu_cache.cu b/graphbolt/src/cuda/extension/gpu_cache.cu similarity index 99% rename from graphbolt/src/cuda/gpu_cache.cu rename to graphbolt/src/cuda/extension/gpu_cache.cu index 710aadf90ba7..8abe5eec71f5 100644 --- a/graphbolt/src/cuda/gpu_cache.cu +++ b/graphbolt/src/cuda/extension/gpu_cache.cu @@ -19,7 +19,7 @@ */ #include -#include "./common.h" +#include "../common.h" #include "./gpu_cache.h" namespace graphbolt { diff --git a/graphbolt/src/cuda/gpu_cache.h b/graphbolt/src/cuda/extension/gpu_cache.h similarity index 100% rename from graphbolt/src/cuda/gpu_cache.h rename to graphbolt/src/cuda/extension/gpu_cache.h diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index aa7101b70df2..9e017dd1df3d 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -22,8 +22,8 @@ #include "./utils.h" #ifdef GRAPHBOLT_USE_CUDA +#include "./cuda/extension/gpu_cache.h" #include "./cuda/extension/gpu_graph_cache.h" -#include "./cuda/gpu_cache.h" #endif namespace graphbolt { diff --git a/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp b/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp index 594b1895a12e..563aa5b5b75f 100644 --- a/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp +++ b/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp @@ -47,6 +47,9 @@ class gpu_cache_api { // Dump API, i.e. 
dump some slabsets' keys from the cache virtual void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, const size_t end_set_index, cudaStream_t stream) = 0; + + // Record all the lookup stream of a specific cache for Update/Replace sync + virtual void Record(cudaStream_t stream) = 0; }; } // namespace gpu_cache diff --git a/third_party/HugeCTR/gpu_cache/include/hash_functions.cuh b/third_party/HugeCTR/gpu_cache/include/hash_functions.cuh index 7facc897a8e1..95d56cccb07f 100644 --- a/third_party/HugeCTR/gpu_cache/include/hash_functions.cuh +++ b/third_party/HugeCTR/gpu_cache/include/hash_functions.cuh @@ -15,6 +15,8 @@ */ #pragma once +#include + // MurmurHash3_32 implementation from // https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp //----------------------------------------------------------------------------- diff --git a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp index 65e34a5f3b94..7cc61b58d78f 100644 --- a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp +++ b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp @@ -23,8 +23,8 @@ #include "gpu_cache_api.hpp" #ifdef LIBCUDACXX_VERSION -#include -#include +#include +#include #endif #define SET_ASSOCIATIVITY 2 @@ -76,6 +76,8 @@ class gpu_cache : public gpu_cache_api { void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, const size_t end_set_index, cudaStream_t stream) override; + void Record(cudaStream_t stream) override {} + public: using slabset = slab_set; #ifdef LIBCUDACXX_VERSION diff --git a/third_party/HugeCTR/gpu_cache/include/nv_util.h b/third_party/HugeCTR/gpu_cache/include/nv_util.h index 315210bbb39b..f67ad6be2daf 100644 --- a/third_party/HugeCTR/gpu_cache/include/nv_util.h +++ b/third_party/HugeCTR/gpu_cache/include/nv_util.h @@ -15,6 +15,8 @@ */ #pragma once +#include +#include #include #include @@ -25,6 +27,15 @@ namespace nv { +template +struct is_fp8 : std::false_type {}; + +template <> +struct is_fp8<__nv_fp8_e4m3> : std::true_type {}; + +template <> +struct is_fp8<__nv_fp8_e5m2> : std::true_type {}; + class CudaException : public std::runtime_error { public: CudaException(const std::string& what) : runtime_error(what) {} diff --git a/third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp b/third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp deleted file mode 100644 index c34a4d8b4e62..000000000000 --- a/third_party/HugeCTR/gpu_cache/include/static_hash_table.hpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
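As a quick compile-time illustration of the nv::is_fp8 trait added to nv_util.h above (the quoted include path is an assumption for the sketch; cuda_fp8.h is the CUDA 11.8+ header that declares the __nv_fp8 types):

    #include <cuda_fp8.h>   // declares __nv_fp8_e4m3 / __nv_fp8_e5m2
    #include "nv_util.h"    // assumed include path for the patched header

    static_assert(nv::is_fp8<__nv_fp8_e4m3>::value, "e4m3 is recognized as FP8");
    static_assert(nv::is_fp8<__nv_fp8_e5m2>::value, "e5m2 is recognized as FP8");
    static_assert(!nv::is_fp8<float>::value, "float is not an FP8 type");

The new Record(cudaStream_t) virtual on gpu_cache_api pairs with the empty override added to gpu_cache above, so the change stays additive; cache implementations that need Update/Replace synchronization can record their lookup streams there.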
- */ -#pragma once - -#include - -#include - -namespace gpu_cache { -template > -class StaticHashTable { - public: - using size_type = uint32_t; - static_assert(sizeof(key_type) <= 8, "sizeof(key_type) cannot be larger than 8 bytes"); - static_assert(sizeof(key_type) >= sizeof(size_type), - "sizeof(key_type) cannot be smaller than sizeof(size_type)"); - static_assert((group_size & (group_size - 1)) == 0, "group_size must be a power of 2"); - static_assert(group_size > 1, "group_size must be larger than 1"); - // User can use empty_key as input without affecting correctness, - // since we will handle it inside kernel. - constexpr static key_type empty_key = ~(key_type)0; - constexpr static size_type invalid_slot = ~(size_type)0; - - public: - StaticHashTable(size_type capacity, int value_dim = 1, hasher hash = hasher{}); - ~StaticHashTable(); - - inline size_type size() const { return size_; } - inline size_type capacity() const { return value_capacity_; } - inline size_type key_capacity() const { return key_capacity_; } - - inline size_t memory_usage() const { - size_t keys_bytes = sizeof(key_type) * (key_capacity_ + 1); - size_t indices_bytes = sizeof(size_type) * (key_capacity_ + 1); - size_t values_bytes = sizeof(value_type) * value_capacity_ * value_dim_; - return keys_bytes + indices_bytes + values_bytes; - } - - void clear(cudaStream_t stream = 0); - - // Note: - // 1. Please make sure the key to be inserted is not duplicated. - // 2. Please make sure the key to be inserted does not exist in the table. - // 3. Please make sure (size() + num_keys) <= capacity(). - void insert(const key_type *keys, const value_type *values, size_type num_keys, - cudaStream_t stream = 0); - - void lookup(const key_type *keys, value_type *values, int num_keys, value_type default_value = 0, - cudaStream_t stream = 0); - - private: - key_type *table_keys_; - size_type *table_indices_; - size_type key_capacity_; - - value_type *table_values_; - size_type value_capacity_; - int value_dim_; - - size_type size_; - hasher hash_; -}; -} // namespace gpu_cache \ No newline at end of file diff --git a/third_party/HugeCTR/gpu_cache/include/static_table.hpp b/third_party/HugeCTR/gpu_cache/include/static_table.hpp deleted file mode 100644 index 6e0a4480fd0b..000000000000 --- a/third_party/HugeCTR/gpu_cache/include/static_table.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include - -namespace gpu_cache { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -class static_table { - public: - // Ctor - static_table(const size_t table_size, const size_t embedding_vec_size, - const float default_value = 0); - - // Dtor - ~static_table(){}; - - // Query API, i.e. A single read from the cache - void Query(const key_type* d_keys, const size_t len, float* d_values, cudaStream_t stream); - - // Replace API, i.e. 
Follow the Query API to update the content of the cache to Most Recent - void Init(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream); - - void Clear(cudaStream_t stream); - - private: - StaticHashTable static_hash_table_; - // Embedding vector size - size_t embedding_vec_size_; - size_t table_size_; - float default_value_; -}; - -} // namespace gpu_cache diff --git a/third_party/HugeCTR/gpu_cache/include/uvm_table.hpp b/third_party/HugeCTR/gpu_cache/include/uvm_table.hpp deleted file mode 100644 index 53ce9d52f1f0..000000000000 --- a/third_party/HugeCTR/gpu_cache/include/uvm_table.hpp +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include - -#include -#include -#include - -namespace gpu_cache { - -template -class HashBlock { - public: - key_type* keys; - size_t num_sets; - size_t capacity; - - HashBlock(size_t expected_capacity, int set_size, int batch_size); - ~HashBlock(); - void add(const key_type* new_keys, const size_t num_keys, key_type* missing_keys, - int* num_missing_keys, cudaStream_t stream); - void query(const key_type* query_keys, const size_t num_keys, index_type* output_indices, - key_type* missing_keys, int* missing_positions, int* num_missing_keys, - cudaStream_t stream); - void query(const key_type* query_keys, int* num_keys, index_type* output_indices, - cudaStream_t stream); - void clear(cudaStream_t stream); - - private: - int max_set_size_; - int batch_size_; - int* set_sizes_; -}; - -template -class H2HCopy { - public: - H2HCopy(int num_threads) : num_threads_(num_threads), working_(num_threads) { - for (int i = 0; i < num_threads_; i++) { - threads_.emplace_back( - [&](int idx) { - while (!terminate_) { - if (working_[idx].load(std::memory_order_relaxed)) { - working_[idx].store(false, std::memory_order_relaxed); - if (num_keys_ == 0) continue; - size_t num_keys_this_thread = (num_keys_ - 1) / num_threads_ + 1; - size_t begin = idx * num_keys_this_thread; - if (idx == num_threads_ - 1) { - num_keys_this_thread = num_keys_ - num_keys_this_thread * idx; - } - size_t end = begin + num_keys_this_thread; - - for (size_t i = begin; i < end; i++) { - size_t idx_vec = get_index_(i); - if (idx_vec == std::numeric_limits::max()) { - continue; - } - memcpy(dst_data_ptr_ + i * vec_size_, src_data_ptr_ + idx_vec * vec_size_, - sizeof(vec_type) * vec_size_); - } - num_finished_workers_++; - } - } - std::this_thread::sleep_for(std::chrono::microseconds(1)); - }, - i); - } - }; - - void copy(vec_type* dst_data_ptr, vec_type* src_data_ptr, size_t num_keys, int vec_size, - std::function get_index_func) { - std::lock_guard guard(submit_mutex_); - dst_data_ptr_ = dst_data_ptr; - src_data_ptr_ = src_data_ptr; - get_index_ = get_index_func; - num_keys_ = num_keys; - vec_size_ = vec_size; - num_finished_workers_.store(0, std::memory_order_acquire); - - for (auto& working : working_) { - working.store(true, std::memory_order_relaxed); - } - - 
while (num_finished_workers_ != num_threads_) { - continue; - } - } - - ~H2HCopy() { - terminate_ = true; - for (auto& t : threads_) { - t.join(); - } - } - - private: - vec_type* src_data_ptr_; - vec_type* dst_data_ptr_; - - std::function get_index_; - - size_t num_keys_; - int vec_size_; - - std::mutex submit_mutex_; - const int num_threads_; - std::vector threads_; - std::vector> working_; - volatile bool terminate_{false}; - std::atomic num_finished_workers_{0}; -}; - -template -class UvmTable { - public: - UvmTable(const size_t device_table_capacity, const size_t host_table_capacity, - const int max_batch_size, const int vec_size, - const vec_type default_value = (vec_type)0); - ~UvmTable(); - void query(const key_type* d_keys, const int len, vec_type* d_vectors, cudaStream_t stream = 0); - void add(const key_type* h_keys, const vec_type* h_vectors, const size_t len); - void clear(cudaStream_t stream = 0); - - private: - static constexpr int num_buffers_ = 2; - key_type* d_keys_buffer_; - vec_type* d_vectors_buffer_; - vec_type* d_vectors_; - - index_type* d_output_indices_; - index_type* d_output_host_indices_; - index_type* h_output_host_indices_; - - key_type* d_missing_keys_; - int* d_missing_positions_; - int* d_missing_count_; - - std::vector h_vectors_; - key_type* h_missing_keys_; - - cudaStream_t query_stream_; - cudaEvent_t query_event_; - - vec_type* h_cpy_buffers_[num_buffers_]; - vec_type* d_cpy_buffers_[num_buffers_]; - cudaStream_t cpy_streams_[num_buffers_]; - cudaEvent_t cpy_events_[num_buffers_]; - - std::unordered_map h_final_missing_items_; - - int max_batch_size_; - int vec_size_; - size_t num_set_; - size_t num_host_set_; - size_t table_capacity_; - std::vector default_vector_; - - HashBlock device_table_; - HashBlock host_table_; -}; -} // namespace gpu_cache \ No newline at end of file diff --git a/third_party/HugeCTR/gpu_cache/src/CMakeLists.txt b/third_party/HugeCTR/gpu_cache/src/CMakeLists.txt deleted file mode 100644 index 685daccf0bb0..000000000000 --- a/third_party/HugeCTR/gpu_cache/src/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -# -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -cmake_minimum_required(VERSION 3.8) -file(GLOB gpu_cache_src - nv_gpu_cache.cu - static_table.cu - static_hash_table.cu - uvm_table.cu -) - -add_library(gpu_cache SHARED ${gpu_cache_src}) -target_compile_features(gpu_cache PUBLIC cxx_std_11) -set_target_properties(gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) -set_target_properties(gpu_cache PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) -set_target_properties(gpu_cache PROPERTIES CUDA_ARCHITECTURES OFF) - diff --git a/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu b/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu index 0a7623f6dd42..8dc21395e329 100644 --- a/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu +++ b/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu @@ -20,7 +20,7 @@ namespace cg = cooperative_groups; -// Overload CUDA atomic for other 64bit unsinged/signed integer type +// Overload CUDA atomic for other 64bit unsigned/signed integer type __forceinline__ __device__ long atomicAdd(long* address, long val) { return (long)atomicAdd((unsigned long long*)address, (unsigned long long)val); } diff --git a/third_party/HugeCTR/gpu_cache/src/static_hash_table.cu b/third_party/HugeCTR/gpu_cache/src/static_hash_table.cu deleted file mode 100644 index 9e73e65c5a1a..000000000000 --- a/third_party/HugeCTR/gpu_cache/src/static_hash_table.cu +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include - -namespace gpu_cache { - -template -__device__ __forceinline__ T atomicCASHelper(T *address, T compare, T val) { - return atomicCAS(address, compare, val); -} - -template <> -__device__ __forceinline__ long long atomicCASHelper(long long *address, long long compare, - long long val) { - return (long long)atomicCAS((unsigned long long *)address, (unsigned long long)compare, - (unsigned long long)val); -} - -template <> -__device__ __forceinline__ int64_t atomicCASHelper(int64_t *address, int64_t compare, int64_t val) { - return (int64_t)atomicCAS((unsigned long long *)address, (unsigned long long)compare, - (unsigned long long)val); -} - -template -__device__ size_type insert(key_type *table, size_type capacity, key_type key, const hasher &hash, - const CG &cg, const key_type empty_key, const size_type invalid_slot) { - // If insert successfully, return its position in the table, - // otherwise return invalid_slot. 
- - const size_type num_groups = capacity / group_size; -#if (CUDA_VERSION < 11060) - unsigned long long num_threads_per_group = cg.size(); -#else - unsigned long long num_threads_per_group = cg.num_threads(); -#endif - const unsigned int num_tiles_per_group = group_size / num_threads_per_group; - - // Assuming capacity is a power of 2 - size_type slot = hash(key) & (capacity - 1); - slot = slot - (slot & (size_type)(group_size - 1)) + cg.thread_rank(); - - for (size_type step = 0; step < num_groups; ++step) { - for (unsigned int i = 0; i < num_tiles_per_group; ++i) { - key_type existed_key = table[slot]; - - // Check if key already exists - bool existed = cg.any(existed_key == key); - if (existed) { - return invalid_slot; - } - - // Try to insert the target key into empty slot - while (true) { - int can_insert = cg.ballot(existed_key == empty_key); - - if (!can_insert) { - break; - } - - bool succeed = false; - int src_lane = __ffs(can_insert) - 1; - - if (cg.thread_rank() == src_lane) { - key_type old = atomicCASHelper(table + slot, empty_key, key); - if (old == empty_key) { - // Insert key successfully - succeed = true; - } else if (old == key) { - // The target key was inserted by another thread - succeed = true; - slot = invalid_slot; - } else { - // The empty slot was occupied by another key, - // update the existed_key for next loop. - existed_key = old; - } - } - - succeed = cg.shfl(succeed, src_lane); - if (succeed) { - slot = cg.shfl(slot, src_lane); - return slot; - } - } - - slot += num_threads_per_group; - } - slot = (slot + group_size * step) & (capacity - 1); - } - - return invalid_slot; -} - -template -__global__ void InsertKeyKernel(key_type *table_keys, size_type *table_indices, size_type capacity, - const key_type *keys, size_type num_keys, size_type offset, - hasher hash, const key_type empty_key, - const size_type invalid_slot) { - static_assert(tile_size <= group_size, "tile_size cannot be larger than group_size"); - - auto block = cooperative_groups::this_thread_block(); - auto tile = cooperative_groups::tiled_partition(block); - - int tile_idx = tile.meta_group_size() * block.group_index().x + tile.meta_group_rank(); - int tile_cnt = tile.meta_group_size() * gridDim.x; - - for (size_type i = tile_idx; i < num_keys; i += tile_cnt) { - key_type key = keys[i]; - if (key == empty_key) { - if (tile.thread_rank() == 0 && table_keys[capacity] != empty_key) { - table_keys[capacity] = empty_key; - table_indices[capacity] = i + offset; - } - continue; - } - size_type slot = - insert(table_keys, capacity, key, hash, tile, empty_key, invalid_slot); - if (tile.thread_rank() == 0 && slot != invalid_slot) { - table_indices[slot] = i + offset; - } - } -} - -template -__device__ size_type lookup(key_type *table, size_type capacity, key_type key, const hasher &hash, - const CG &cg, const key_type empty_key, const size_type invalid_slot) { - // If lookup successfully, return the target key's position in the table, - // otherwise return invalid_slot. 
- - const size_type num_groups = capacity / group_size; - -#if (CUDA_VERSION < 11060) - unsigned long long num_threads_per_group = cg.size(); -#else - unsigned long long num_threads_per_group = cg.num_threads(); -#endif - - const unsigned int num_tiles_per_group = group_size / num_threads_per_group; - - // Assuming capacity is a power of 2 - size_type slot = hash(key) & (capacity - 1); - slot = slot - (slot & (size_type)(group_size - 1)) + cg.thread_rank(); - - for (size_type step = 0; step < num_groups; ++step) { - for (unsigned int i = 0; i < num_tiles_per_group; ++i) { - key_type existed_key = table[slot]; - - // Check if key exists - int existed = cg.ballot(existed_key == key); - if (existed) { - int src_lane = __ffs(existed) - 1; - slot = cg.shfl(slot, src_lane); - return slot; - } - - // The target key doesn't exist - bool contain_empty = cg.any(existed_key == empty_key); - if (contain_empty) { - return invalid_slot; - } - - slot += num_threads_per_group; - } - slot = (slot + group_size * step) & (capacity - 1); - } - - return invalid_slot; -} - -template -__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx, - const size_t emb_vec_size_in_float, - volatile float *d_dst, const float *d_src) { - // 16 bytes align - if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) { -#pragma unroll - for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) { - d_dst[i] = d_src[i]; - } - } else { -#pragma unroll - for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) { - *(float4 *)(d_dst + i * 4) = __ldg((const float4 *)(d_src + i * 4)); - } - } -} - -template -__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx, - const size_t emb_vec_size_in_float, - volatile float *d_dst, const float default_value) { -#pragma unroll - for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) { - d_dst[i] = default_value; - } -} - -template -__global__ void LookupKernel(key_type *table_keys, size_type *table_indices, size_type capacity, - const key_type *keys, int num_keys, const value_type *values, - int value_dim, value_type *output, hasher hash, - const key_type empty_key, const value_type default_value, - const size_type invalid_slot) { - static_assert(tile_size <= group_size, "tile_size cannot be larger than group_size"); - constexpr int WARP_SIZE = 32; - static_assert(WARP_SIZE % tile_size == 0, "tile_size must be divisible by warp_size"); - - auto grid = cooperative_groups::this_grid(); - auto block = cooperative_groups::this_thread_block(); - auto tile = cooperative_groups::tiled_partition(block); - auto warp_tile = cooperative_groups::tiled_partition(block); - - int tile_idx = tile.meta_group_size() * block.group_index().x + tile.meta_group_rank(); - int tile_cnt = tile.meta_group_size() * gridDim.x; - - for (int it = 0; it < (num_keys - 1) / tile_cnt + 1; it++) { - size_type slot = invalid_slot; - int key_num = it * tile_cnt + tile_idx; - if (key_num < num_keys) { - key_type key = keys[key_num]; - if (key == empty_key) { - if (tile.thread_rank() == 0 && table_keys[capacity] == key) { - slot = capacity; - } - } else { - slot = lookup(table_keys, capacity, key, hash, tile, empty_key, invalid_slot); - } - } - for (int i = 0; i < WARP_SIZE / tile_size; i++) { - auto slot_to_read = warp_tile.shfl(slot, i * tile_size); - int idx_to_write = warp_tile.shfl(key_num, 0) + i; - if (idx_to_write >= num_keys) break; - if (slot_to_read == invalid_slot) { - warp_tile_copy(warp_tile.thread_rank(), value_dim, 
- output + (size_t)value_dim * idx_to_write, default_value); - continue; - } - auto index = table_indices[slot_to_read]; - warp_tile_copy(warp_tile.thread_rank(), value_dim, - output + (size_t)value_dim * idx_to_write, - values + (size_t)value_dim * index); - } - } -} - -template -StaticHashTable::StaticHashTable( - size_type capacity, int value_dim, hasher hash) - : table_keys_(nullptr), - table_indices_(nullptr), - key_capacity_(capacity * 2), - table_values_(nullptr), - value_capacity_(capacity), - value_dim_(value_dim), - size_(0), - hash_(hash) { - // Check parameters - if (capacity <= 0) { - printf("Error: capacity must be larger than 0\n"); - exit(EXIT_FAILURE); - } - if (value_dim <= 0) { - printf("Error: value_dim must be larger than 0\n"); - exit(EXIT_FAILURE); - } - - // Make key_capacity_ be a power of 2 - size_t new_capacity = group_size; - while (new_capacity < key_capacity_) { - new_capacity *= 2; - } - key_capacity_ = new_capacity; - - // Allocate device memory - size_t align_m = 16; - size_t num_keys = key_capacity_ + 1; - size_t num_values = (value_capacity_ * value_dim_ + align_m - 1) / align_m * align_m; - CUDA_CHECK(cudaMalloc(&table_keys_, sizeof(key_type) * num_keys)); - CUDA_CHECK(cudaMalloc(&table_indices_, sizeof(size_type) * num_keys)); - CUDA_CHECK(cudaMalloc(&table_values_, sizeof(value_type) * num_values)); - - // Initialize table_keys_ - CUDA_CHECK(cudaMemset(table_keys_, 0xff, sizeof(key_type) * key_capacity_)); - CUDA_CHECK(cudaMemset(table_keys_ + key_capacity_, 0, sizeof(key_type))); -} - -template -void StaticHashTable::insert( - const key_type *keys, const value_type *values, size_type num_keys, cudaStream_t stream) { - if (num_keys == 0) { - return; - } - if (num_keys <= 0 || (size() + num_keys) > capacity()) { - printf("Error: Invalid num_keys to insert\n"); - exit(EXIT_FAILURE); - } - - // Insert keys - constexpr int block = 256; - int grid = (num_keys - 1) / block + 1; - InsertKeyKernel - <<>>(table_keys_, table_indices_, key_capacity_, keys, num_keys, - size_, hash_, empty_key, invalid_slot); - // Copy values - CUDA_CHECK(cudaMemcpyAsync(table_values_ + size_ * value_dim_, values, - sizeof(value_type) * num_keys * value_dim_, cudaMemcpyDeviceToDevice, - stream)); - size_ += num_keys; -} - -template -void StaticHashTable::clear( - cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(table_keys_, 0xff, sizeof(key_type) * key_capacity_, stream)); - CUDA_CHECK(cudaMemsetAsync(table_keys_ + key_capacity_, 0, sizeof(key_type), stream)); - size_ = 0; -} - -template -StaticHashTable::~StaticHashTable() { - CUDA_CHECK(cudaFree(table_keys_)); - CUDA_CHECK(cudaFree(table_indices_)); - CUDA_CHECK(cudaFree(table_values_)); -} - -template -void StaticHashTable::lookup( - const key_type *keys, value_type *values, int num_keys, value_type default_value, - cudaStream_t stream) { - if (num_keys == 0) { - return; - } - - constexpr int block = 256; - const int grid = (num_keys - 1) / block + 1; - // Lookup keys - LookupKernel<<>>( - table_keys_, table_indices_, key_capacity_, keys, num_keys, table_values_, value_dim_, values, - hash_, empty_key, default_value, invalid_slot); -} - -template class StaticHashTable; -template class StaticHashTable; -} // namespace gpu_cache \ No newline at end of file diff --git a/third_party/HugeCTR/gpu_cache/src/static_table.cu b/third_party/HugeCTR/gpu_cache/src/static_table.cu deleted file mode 100644 index f5fc049856b2..000000000000 --- a/third_party/HugeCTR/gpu_cache/src/static_table.cu +++ /dev/null @@ -1,59 +0,0 @@ -/* - * 
Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include - -namespace gpu_cache { - -template -static_table::static_table(const size_t table_size, const size_t embedding_vec_size, - const float default_value) - : table_size_(table_size), - embedding_vec_size_(embedding_vec_size), - default_value_(default_value), - static_hash_table_(table_size, embedding_vec_size) { - if (embedding_vec_size_ == 0) { - printf("Error: Invalid value for embedding_vec_size.\n"); - return; - } -} - -template -void static_table::Query(const key_type* d_keys, const size_t len, float* d_values, - cudaStream_t stream) { - static_hash_table_.lookup(d_keys, d_values, len, default_value_, stream); -} - -template -void static_table::Init(const key_type* d_keys, const size_t len, const float* d_values, - cudaStream_t stream) { - static_hash_table_.insert(d_keys, d_values, len, stream); -} - -template -void static_table::Clear(cudaStream_t stream) { - static_hash_table_.clear(stream); -} - -template class static_table; -template class static_table; - -} // namespace gpu_cache diff --git a/third_party/HugeCTR/gpu_cache/src/uvm_table.cu b/third_party/HugeCTR/gpu_cache/src/uvm_table.cu deleted file mode 100644 index 641c816b7deb..000000000000 --- a/third_party/HugeCTR/gpu_cache/src/uvm_table.cu +++ /dev/null @@ -1,607 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace cg = cooperative_groups; - -namespace { - -constexpr int set_size = 4; -constexpr int block_size = 256; - -template -__host__ __device__ key_type hash(key_type key) { - return key; -} - -template -__global__ void hash_add_kernel(const key_type* new_keys, const int num_keys, key_type* keys, - const int num_sets, int* set_sizes, const int max_set_size, - key_type* missing_keys, int* num_missing_keys) { - __shared__ key_type s_missing_keys[block_size]; - __shared__ int s_missing_count; - __shared__ size_t s_missing_idx; - - auto grid = cg::this_grid(); - auto block = cg::this_thread_block(); - - if (block.thread_rank() == 0) { - s_missing_count = 0; - } - block.sync(); - - size_t idx = grid.thread_rank(); - if (idx < num_keys) { - auto key = new_keys[idx]; - size_t idx_set = hash(key) % num_sets; - int prev_set_size = atomicAdd(&set_sizes[idx_set], 1); - if (prev_set_size < max_set_size) { - keys[idx_set * max_set_size + prev_set_size] = key; - } else { - int count = atomicAdd(&s_missing_count, 1); - s_missing_keys[count] = key; - } - } - - block.sync(); - if (block.thread_rank() == 0) { - s_missing_idx = atomicAdd(num_missing_keys, s_missing_count); - } - block.sync(); - for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) { - missing_keys[s_missing_idx + i] = s_missing_keys[i]; - } -} - -template -__global__ void hash_query_kernel(const key_type* query_keys, int* num_keys_ptr, - const key_type* keys, const size_t num_sets, - const int max_set_size, index_type* output_indices) { - constexpr int tile_size = set_size; - auto grid = cg::this_grid(); - auto block = cg::this_thread_block(); - auto tile = cg::tiled_partition(block); - int num_keys = *num_keys_ptr; - if (num_keys == 0) return; - -#if (CUDA_VERSION < 11060) - size_t num_threads_per_grid = grid.size(); -#else - size_t num_threads_per_grid = grid.num_threads(); -#endif - - size_t step = (num_keys - 1) / num_threads_per_grid + 1; - for (size_t i = 0; i < step; i++) { - size_t idx = i * num_threads_per_grid + grid.thread_rank(); - key_type query_key = std::numeric_limits::max(); - if (idx < num_keys) { - query_key = query_keys[idx]; - } - auto idx_set = hash(query_key) % num_sets; - for (int j = 0; j < tile_size; j++) { - auto current_idx_set = tile.shfl(idx_set, j); - auto current_query_key = tile.shfl(query_key, j); - if (current_query_key == std::numeric_limits::max()) { - continue; - } - auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()]; - int existed = tile.ballot(current_query_key == candidate_key); - auto current_idx = tile.shfl(idx, 0) + j; - if (existed) { - int src_lane = __ffs(existed) - 1; - size_t found_idx = current_idx_set * set_size + src_lane; - output_indices[current_idx] = num_sets * src_lane + current_idx_set; - } else { - output_indices[current_idx] = std::numeric_limits::max(); - } - } - } -} - -template -__global__ void hash_query_kernel(const key_type* query_keys, const int num_keys, - const key_type* keys, const size_t num_sets, - const int max_set_size, index_type* output_indices, - key_type* missing_keys, int* missing_positions, - int* missing_count) { - __shared__ key_type s_missing_keys[block_size]; - __shared__ key_type s_missing_positions[block_size]; - __shared__ int s_missing_count; - __shared__ int s_missing_idx; - - constexpr int tile_size = set_size; - - auto grid = cg::this_grid(); - auto block = cg::this_thread_block(); - auto tile = 
cg::tiled_partition(block); - - if (block.thread_rank() == 0) { - s_missing_count = 0; - } - block.sync(); - - size_t idx = grid.thread_rank(); - key_type query_key = std::numeric_limits::max(); - if (idx < num_keys) { - query_key = query_keys[idx]; - } - auto idx_set = hash(query_key) % num_sets; - - for (int j = 0; j < tile_size; j++) { - auto current_idx_set = tile.shfl(idx_set, j); - auto current_query_key = tile.shfl(query_key, j); - if (current_query_key == std::numeric_limits::max()) { - continue; - } - auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()]; - int existed = tile.ballot(current_query_key == candidate_key); - if (existed) { - int src_lane = __ffs(existed) - 1; - size_t found_idx = current_idx_set * set_size + src_lane; - output_indices[tile.shfl(idx, 0) + j] = num_sets * src_lane + current_idx_set; - } else { - auto current_idx = tile.shfl(idx, 0) + j; - output_indices[current_idx] = std::numeric_limits::max(); - if (tile.thread_rank() == 0) { - int s_count = atomicAdd(&s_missing_count, 1); - s_missing_keys[s_count] = current_query_key; - s_missing_positions[s_count] = current_idx; - } - } - } - - if (missing_keys == nullptr) { - if (grid.thread_rank() == 0 && missing_count) { - *missing_count = 0; - } - return; - } - block.sync(); - if (block.thread_rank() == 0) { - s_missing_idx = atomicAdd(missing_count, s_missing_count); - } - block.sync(); - for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) { - missing_keys[s_missing_idx + i] = s_missing_keys[i]; - missing_positions[s_missing_idx + i] = s_missing_positions[i]; - } -} - -template -__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx, - const size_t emb_vec_size_in_float, - volatile float* d_dst, const float* d_src) { - // 16 bytes align - if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) { -#pragma unroll - for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) { - d_dst[i] = d_src[i]; - } - } else { -#pragma unroll - for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) { - *(float4*)(d_dst + i * 4) = __ldg((const float4*)(d_src + i * 4)); - } - } -} - -template -__global__ void read_vectors_kernel(const index_type* query_indices, const int num_keys, - const vec_type* vectors, const int vec_size, - vec_type* output_vectors) { - constexpr int warp_size = 32; - - auto grid = cg::this_grid(); - auto block = cg::this_thread_block(); - auto tile = cg::tiled_partition(block); - -#if (CUDA_VERSION < 11060) - auto num_threads_per_grid = grid.size(); -#else - auto num_threads_per_grid = grid.num_threads(); -#endif - - for (int step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) { - int key_num = step * num_threads_per_grid + grid.thread_rank(); - index_type idx = std::numeric_limits::max(); - if (key_num < num_keys) { - idx = query_indices[key_num]; - } -#pragma unroll 4 - for (size_t j = 0; j < warp_size; j++) { - index_type current_idx = tile.shfl(idx, j); - index_type idx_write = tile.shfl(key_num, 0) + j; - if (current_idx == std::numeric_limits::max()) continue; - warp_tile_copy(tile.thread_rank(), vec_size, output_vectors + idx_write * vec_size, - vectors + current_idx * vec_size); - } - } -} - -template -__global__ void distribute_vectors_kernel(const index_type* postions, const size_t num_keys, - const vec_type* vectors, const int vec_size, - vec_type* output_vectors) { - constexpr int warp_size = 32; - - auto grid = cg::this_grid(); - auto block = 
cg::this_thread_block(); - auto tile = cg::tiled_partition(block); - -#if (CUDA_VERSION < 11060) - auto num_threads_per_grid = grid.size(); -#else - auto num_threads_per_grid = grid.num_threads(); -#endif - - for (size_t step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) { - size_t key_num = step * num_threads_per_grid + grid.thread_rank(); - index_type idx = std::numeric_limits::max(); - if (key_num < num_keys) { - idx = postions[key_num]; - } -#pragma unroll 4 - for (size_t j = 0; j < warp_size; j++) { - size_t idx_write = tile.shfl(idx, j); - size_t idx_read = tile.shfl(key_num, 0) + j; - if (idx_write == std::numeric_limits::max()) continue; - warp_tile_copy(tile.thread_rank(), vec_size, - output_vectors + (size_t)idx_write * vec_size, - vectors + (size_t)idx_read * vec_size); - } - } -} - -} // namespace - -namespace gpu_cache { -template -UvmTable::UvmTable(const size_t device_table_capacity, - const size_t host_table_capacity, - const int max_batch_size, const int vec_size, - const vec_type default_value) - : max_batch_size_(std::max(100000, max_batch_size)), - vec_size_(vec_size), - num_set_((device_table_capacity - 1) / set_size + 1), - num_host_set_((host_table_capacity - 1) / set_size + 1), - table_capacity_(num_set_ * set_size), - default_vector_(vec_size, default_value), - device_table_(device_table_capacity, set_size, max_batch_size_), - host_table_(host_table_capacity * 1.3, set_size, max_batch_size_) { - CUDA_CHECK(cudaMalloc(&d_keys_buffer_, sizeof(key_type) * max_batch_size_)); - CUDA_CHECK(cudaMalloc(&d_vectors_buffer_, sizeof(vec_type) * max_batch_size_ * vec_size_)); - CUDA_CHECK(cudaMalloc(&d_vectors_, sizeof(vec_type) * device_table_.capacity * vec_size_)); - - CUDA_CHECK(cudaMalloc(&d_output_indices_, sizeof(index_type) * max_batch_size_)); - CUDA_CHECK(cudaMalloc(&d_output_host_indices_, sizeof(index_type) * max_batch_size_)); - CUDA_CHECK(cudaMallocHost(&h_output_host_indices_, sizeof(index_type) * max_batch_size_)); - CUDA_CHECK(cudaMalloc(&d_missing_keys_, sizeof(key_type) * max_batch_size_)); - CUDA_CHECK(cudaMalloc(&d_missing_positions_, sizeof(int) * max_batch_size_)); - CUDA_CHECK(cudaMalloc(&d_missing_count_, sizeof(int))); - CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(int))); - CUDA_CHECK(cudaStreamCreate(&query_stream_)); - for (int i = 0; i < num_buffers_; i++) { - int batch_size_per_buffer = ceil(1.0 * max_batch_size_ / num_buffers_); - CUDA_CHECK( - cudaMallocHost(&h_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size)); - CUDA_CHECK(cudaMalloc(&d_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size)); - CUDA_CHECK(cudaStreamCreate(&cpy_streams_[i])); - CUDA_CHECK(cudaEventCreate(&cpy_events_[i])); - } - CUDA_CHECK(cudaMallocHost(&h_missing_keys_, sizeof(key_type) * max_batch_size_)); - CUDA_CHECK(cudaEventCreate(&query_event_)); - h_vectors_.resize(host_table_.capacity * vec_size_); -} - -template -void UvmTable::add(const key_type* h_keys, - const vec_type* h_vectors, - const size_t num_keys) { - std::vector h_missing_keys; - size_t num_batches = (num_keys - 1) / max_batch_size_ + 1; - for (size_t i = 0; i < num_batches; i++) { - size_t this_batch_size = - i != num_batches - 1 ? 
max_batch_size_ : num_keys - i * max_batch_size_; - CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_, - sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_))); - device_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0); - CUDA_CHECK(cudaDeviceSynchronize()); - int num_missing_keys; - CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys), - cudaMemcpyDeviceToHost)); - size_t prev_size = h_missing_keys.size(); - h_missing_keys.resize(prev_size + num_missing_keys); - CUDA_CHECK(cudaMemcpy(h_missing_keys.data() + prev_size, d_missing_keys_, - sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost)); - } - - std::vector h_final_missing_keys; - num_batches = h_missing_keys.size() ? (h_missing_keys.size() - 1) / max_batch_size_ + 1 : 0; - for (size_t i = 0; i < num_batches; i++) { - size_t this_batch_size = - i != num_batches - 1 ? max_batch_size_ : h_missing_keys.size() - i * max_batch_size_; - CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_missing_keys.data() + i * max_batch_size_, - sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_))); - host_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0); - CUDA_CHECK(cudaDeviceSynchronize()); - int num_missing_keys; - CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys), - cudaMemcpyDeviceToHost)); - size_t prev_size = h_final_missing_keys.size(); - h_final_missing_keys.resize(prev_size + num_missing_keys); - CUDA_CHECK(cudaMemcpy(h_final_missing_keys.data() + prev_size, d_missing_keys_, - sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost)); - } - - std::vector h_keys_buffer(max_batch_size_); - std::vector h_indices_buffer(max_batch_size_); - std::vector h_positions_buffer(max_batch_size_); - - num_batches = (num_keys - 1) / max_batch_size_ + 1; - - size_t num_hit_keys = 0; - for (size_t i = 0; i < num_batches; i++) { - size_t this_batch_size = - i != num_batches - 1 ? 
max_batch_size_ : num_keys - i * max_batch_size_; - CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_, - sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_))); - device_table_.query(d_keys_buffer_, this_batch_size, d_output_indices_, d_missing_keys_, - d_missing_positions_, d_missing_count_, 0); - CUDA_CHECK(cudaStreamSynchronize(0)); - - CUDA_CHECK(cudaMemcpy(d_vectors_buffer_, h_vectors + i * max_batch_size_ * vec_size_, - sizeof(*d_vectors_) * this_batch_size * vec_size_, - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaStreamSynchronize(0)); - if (num_hit_keys < device_table_.capacity) { - distribute_vectors_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, 0>>>( - d_output_indices_, this_batch_size, d_vectors_buffer_, vec_size_, d_vectors_); - CUDA_CHECK(cudaStreamSynchronize(0)); - } - - int num_missing_keys; - CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys), - cudaMemcpyDeviceToHost)); - num_hit_keys += this_batch_size - num_missing_keys; - host_table_.query(d_missing_keys_, num_missing_keys, d_output_indices_, nullptr, nullptr, - nullptr, 0); - - CUDA_CHECK(cudaMemcpy(h_keys_buffer.data(), d_missing_keys_, - sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost)) - - CUDA_CHECK(cudaMemcpy(h_indices_buffer.data(), d_output_indices_, - sizeof(*d_output_indices_) * num_missing_keys, cudaMemcpyDeviceToHost)) - - CUDA_CHECK(cudaMemcpy(h_positions_buffer.data(), d_missing_positions_, - sizeof(*d_missing_positions_) * num_missing_keys, cudaMemcpyDeviceToHost)) - - for (int j = 0; j < num_missing_keys; j++) { - if (h_indices_buffer[j] != std::numeric_limits::max()) { - memcpy(h_vectors_.data() + h_indices_buffer[j] * vec_size_, - h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_, - sizeof(*h_vectors) * vec_size_); - } else { - size_t prev_idx = h_vectors_.size() / vec_size_; - h_final_missing_items_.emplace(h_keys_buffer[j], prev_idx); - h_vectors_.resize(h_vectors_.size() + vec_size_); - memcpy(h_vectors_.data() + prev_idx * vec_size_, - h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_, - sizeof(*h_vectors) * vec_size_); - } - } - } - CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_))); -} - -template -void UvmTable::query(const key_type* d_keys, const int num_keys, - vec_type* d_vectors, cudaStream_t stream) { - if (!num_keys) return; - CUDA_CHECK(cudaEventRecord(query_event_, stream)); - CUDA_CHECK(cudaStreamWaitEvent(query_stream_, query_event_)); - - static_assert(num_buffers_ >= 2); - device_table_.query(d_keys, num_keys, d_output_indices_, d_missing_keys_, d_missing_positions_, - d_missing_count_, query_stream_); - - CUDA_CHECK(cudaEventRecord(query_event_, query_stream_)); - CUDA_CHECK(cudaStreamWaitEvent(cpy_streams_[0], query_event_)); - - int num_missing_keys; - CUDA_CHECK(cudaMemcpyAsync(&num_missing_keys, d_missing_count_, sizeof(*d_missing_count_), - cudaMemcpyDeviceToHost, cpy_streams_[0])); - - host_table_.query(d_missing_keys_, d_missing_count_, d_output_host_indices_, query_stream_); - CUDA_CHECK(cudaStreamSynchronize(cpy_streams_[0])); - - CUDA_CHECK(cudaMemsetAsync(d_missing_count_, 0, sizeof(*d_missing_count_), query_stream_)); - - CUDA_CHECK(cudaMemcpyAsync(h_output_host_indices_, d_output_host_indices_, - sizeof(index_type) * num_missing_keys, cudaMemcpyDeviceToHost, - query_stream_)); - - CUDA_CHECK(cudaMemcpyAsync(h_missing_keys_, 
d_missing_keys_, sizeof(key_type) * num_missing_keys, - cudaMemcpyDeviceToHost, cpy_streams_[0])); - - read_vectors_kernel<<<(num_keys - 1) / block_size + 1, block_size, 0, cpy_streams_[1]>>>( - d_output_indices_, num_keys, d_vectors_, vec_size_, d_vectors); - - CUDA_CHECK(cudaStreamSynchronize(query_stream_)); - CUDA_CHECK(cudaStreamSynchronize(cpy_streams_[0])); - - int num_keys_per_buffer = ceil(1.0 * num_missing_keys / num_buffers_); - - for (int buffer_num = 0; buffer_num < num_buffers_; buffer_num++) { - int num_keys_this_buffer = buffer_num != num_buffers_ - 1 - ? num_keys_per_buffer - : num_missing_keys - num_keys_per_buffer * buffer_num; - if (!num_keys_this_buffer) break; -#pragma omp parallel for num_threads(8) - for (size_t i = 0; i < static_cast(num_keys_this_buffer); i++) { - size_t idx_key = buffer_num * num_keys_per_buffer + i; - index_type index = h_output_host_indices_[idx_key]; - if (index == std::numeric_limits::max()) { - key_type key = h_missing_keys_[idx_key]; - auto iterator = h_final_missing_items_.find(key); - if (iterator != h_final_missing_items_.end()) { - index = iterator->second; - } - } - if (index != std::numeric_limits::max()) { - memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, h_vectors_.data() + index * vec_size_, - sizeof(vec_type) * vec_size_); - } else { - memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, default_vector_.data(), - sizeof(vec_type) * vec_size_); - } - } - CUDA_CHECK(cudaMemcpyAsync(d_cpy_buffers_[buffer_num], h_cpy_buffers_[buffer_num], - sizeof(vec_type) * num_keys_this_buffer * vec_size_, - cudaMemcpyHostToDevice, cpy_streams_[buffer_num])); - - distribute_vectors_kernel<<<(num_keys_this_buffer - 1) / block_size + 1, block_size, 0, - cpy_streams_[buffer_num]>>>( - d_missing_positions_ + buffer_num * num_keys_per_buffer, num_keys_this_buffer, - d_cpy_buffers_[buffer_num], vec_size_, d_vectors); - } - - for (int i = 0; i < num_buffers_; i++) { - CUDA_CHECK(cudaEventRecord(cpy_events_[i], cpy_streams_[i])); - CUDA_CHECK(cudaStreamWaitEvent(stream, cpy_events_[i])); - } -} - -template -void UvmTable::clear(cudaStream_t stream) { - device_table_.clear(stream); - host_table_.clear(stream); -} - -template -UvmTable::~UvmTable() { - CUDA_CHECK(cudaFree(d_keys_buffer_)); - CUDA_CHECK(cudaFree(d_vectors_buffer_)); - CUDA_CHECK(cudaFree(d_vectors_)); - - CUDA_CHECK(cudaFree(d_output_indices_)); - CUDA_CHECK(cudaFree(d_output_host_indices_)); - CUDA_CHECK(cudaFreeHost(h_output_host_indices_)); - - CUDA_CHECK(cudaFree(d_missing_keys_)); - CUDA_CHECK(cudaFree(d_missing_positions_)); - CUDA_CHECK(cudaFree(d_missing_count_)); - CUDA_CHECK(cudaFreeHost(h_missing_keys_)); - - CUDA_CHECK(cudaStreamDestroy(query_stream_)); - CUDA_CHECK(cudaEventDestroy(query_event_)); - - for (int i = 0; i < num_buffers_; i++) { - CUDA_CHECK(cudaFreeHost(h_cpy_buffers_[i])); - CUDA_CHECK(cudaFree(d_cpy_buffers_[i])); - CUDA_CHECK(cudaStreamDestroy(cpy_streams_[i])); - CUDA_CHECK(cudaEventDestroy(cpy_events_[i])); - } -} - -template -HashBlock::HashBlock(size_t expected_capacity, int set_size, int batch_size) - : max_set_size_(set_size), batch_size_(batch_size) { - if (expected_capacity) { - num_sets = (expected_capacity - 1) / set_size + 1; - } else { - num_sets = 10000; - } - capacity = num_sets * set_size; - CUDA_CHECK(cudaMalloc(&keys, sizeof(*keys) * capacity)); - CUDA_CHECK(cudaMalloc(&set_sizes_, sizeof(*set_sizes_) * num_sets)); - CUDA_CHECK(cudaMemset(set_sizes_, 0, sizeof(*set_sizes_) * num_sets)); -} - -template -HashBlock::~HashBlock() { - 
CUDA_CHECK(cudaFree(keys)); - CUDA_CHECK(cudaFree(set_sizes_)); -} - -template -void HashBlock::query(const key_type* query_keys, const size_t num_keys, - index_type* output_indices, key_type* missing_keys, - int* missing_positions, int* num_missing_keys, - cudaStream_t stream) { - if (num_keys == 0) { - return; - } - size_t num_batches = (num_keys - 1) / batch_size_ + 1; - for (size_t i = 0; i < num_batches; i++) { - size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_; - hash_query_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, stream>>>( - query_keys, this_batch_size, keys, num_sets, max_set_size_, output_indices, missing_keys, - missing_positions, num_missing_keys); - } -} - -template -void HashBlock::query(const key_type* query_keys, int* num_keys, - index_type* output_indices, cudaStream_t stream) { - hash_query_kernel<<<128, 64, 0, stream>>>(query_keys, num_keys, keys, num_sets, max_set_size_, - output_indices); -} - -template -void HashBlock::add(const key_type* new_keys, const size_t num_keys, - key_type* missing_keys, int* num_missing_keys, - cudaStream_t stream) { - if (num_keys == 0) { - return; - } - size_t num_batches = (num_keys - 1) / batch_size_ + 1; - for (size_t i = 0; i < num_batches; i++) { - size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_; - hash_add_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, stream>>>( - new_keys + i * this_batch_size, this_batch_size, keys, num_sets, set_sizes_, max_set_size_, - missing_keys, num_missing_keys); - } -} - -template -void HashBlock::clear(cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(set_sizes_, 0, sizeof(*set_sizes_) * num_sets, stream)); -} - -template class HashBlock; -template class HashBlock; -template class HashBlock; -template class HashBlock; -template class HashBlock; - -template class UvmTable; -template class UvmTable; -template class UvmTable; -template class UvmTable; -template class UvmTable; -} // namespace gpu_cache \ No newline at end of file diff --git a/third_party/HugeCTR/gpu_cache/test/CMakeLists.txt b/third_party/HugeCTR/gpu_cache/test/CMakeLists.txt deleted file mode 100644 index 322798222f0f..000000000000 --- a/third_party/HugeCTR/gpu_cache/test/CMakeLists.txt +++ /dev/null @@ -1,28 +0,0 @@ -# -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -cmake_minimum_required(VERSION 3.8) -file(GLOB gpu_cache_test_src - cache_op_sol_test.cu - ../../HugeCTR/src/hps/embedding_cache_gpu.cu -) - -add_executable(cache_op_sol_test ${gpu_cache_test_src}) -target_compile_features(cache_op_sol_test PUBLIC cxx_std_17) -target_link_libraries(cache_op_sol_test PUBLIC gpu_cache) -target_link_libraries(cache_op_sol_test PUBLIC OpenMP::OpenMP_CXX) -set_target_properties(cache_op_sol_test PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) -set_target_properties(cache_op_sol_test PROPERTIES CUDA_ARCHITECTURES OFF) - diff --git a/third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.cu b/third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.cu deleted file mode 100644 index 3544c694f439..000000000000 --- a/third_party/HugeCTR/gpu_cache/test/cache_op_sol_test.cu +++ /dev/null @@ -1,707 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// The key generator -template > -class KeyGenerator { - public: - KeyGenerator() : gen_(rd_()) {} - KeyGenerator(T min, T max) : gen_(rd_()), distribution_(min, max) {} - - void fill_unique(T* data, size_t keys_per_set, size_t num_of_set, T empty_value) { - if (keys_per_set == 0 || num_of_set == 0) { - return; - } - assert(distribution_.max() - distribution_.min() >= keys_per_set * num_of_set); - - std::unordered_set set; - std::vector set_sz(num_of_set, 0); - size_t sz = 0; - while (sz < keys_per_set * num_of_set) { - T x = distribution_(gen_); - if (x == empty_value) { - continue; - } - auto res = set.insert(x); - if (res.second) { - size_t src_set = set_hasher::hash(x) % num_of_set; - if (set_sz[src_set] < keys_per_set) { - data[src_set * keys_per_set + set_sz[src_set]] = x; - set_sz[src_set]++; - sz++; - } - } - } - assert(sz == keys_per_set * num_of_set); - for (size_t i = 0; i < num_of_set; i++) { - assert(set_sz[i] == keys_per_set); - } - } - - private: - std::random_device rd_; - std::mt19937 gen_; - std::uniform_int_distribution distribution_; -}; -// The random number generator -template -class IntGenerator { - public: - IntGenerator() : gen_(rd_()) {} - IntGenerator(T min, T max) : gen_(rd_()), distribution_(min, max) {} - - void fill_unique(T* data, size_t len, T empty_value) { - if (len == 0) { - return; - } - assert(distribution_.max() - distribution_.min() >= len); - - std::unordered_set set; - size_t sz = 0; - while (sz < len) { - T x = distribution_(gen_); - if (x == empty_value) { - continue; - } - auto res = set.insert(x); - if (res.second) { - data[sz++] = x; - } - } - assert(sz == set.size()); - assert(sz == len); - } - - private: - std::random_device rd_; - std::mt19937 gen_; - std::uniform_int_distribution distribution_; -}; - -template -class IntGenerator_normal { - public: - IntGenerator_normal() : gen_(rd_()) {} - IntGenerator_normal(double mean, double dev) : gen_(rd_()), distribution_(mean, 
dev) {} - - void fill_unique(T* data, size_t len, T min, T max) { - if (len == 0) { - return; - } - - std::unordered_set set; - size_t sz = 0; - while (sz < len) { - T x = (T)(abs(distribution_(gen_))); - if (x < min || x > max) { - continue; - } - auto res = set.insert(x); - if (res.second) { - data[sz++] = x; - } - } - assert(sz == set.size()); - assert(sz == len); - } - - private: - std::random_device rd_; - std::mt19937 gen_; - std::normal_distribution distribution_; -}; - -// Utility to fill len embedding vector -template -void fill_vec(const KeyType* keys, float* vals, size_t embedding_vec_size, size_t len, - float ratio) { - for (size_t i = 0; i < len; ++i) { - for (size_t j = 0; j < embedding_vec_size; ++j) { - vals[i * embedding_vec_size + j] = (float)(ratio * keys[i]); - } - } -} - -// Floating-point compare function -template -bool is_near(T a, T b) { - double diff = abs(a - b); - bool ret = diff <= std::min(a, b) * 1e-6; - if (!ret) { - std::cerr << "error: " << a << " != " << b << "; diff = " << diff << std::endl; - } - return ret; -} - -// Check correctness of result -template -void check_result(const KeyType* keys, const float* vals, size_t embedding_vec_size, size_t len, - float ratio) { - for (size_t i = 0; i < len; ++i) { - for (size_t j = 0; j < embedding_vec_size; ++j) { - assert(is_near(vals[i * embedding_vec_size + j], (float)(ratio * keys[i]))); - } - } -} - -// Compare two sequence of keys and check whether they are the same(but with different order) -template -void compare_key(const KeyType* sequence_a, const KeyType* sequence_b, size_t len) { - // Temp buffers for sorting - KeyType* sequence_a_copy = (KeyType*)malloc(len * sizeof(KeyType)); - KeyType* sequence_b_copy = (KeyType*)malloc(len * sizeof(KeyType)); - // Copy data to temp buffers - memcpy(sequence_a_copy, sequence_a, len * sizeof(KeyType)); - memcpy(sequence_b_copy, sequence_b, len * sizeof(KeyType)); - // Sort both arrays - std::sort(sequence_a_copy, sequence_a_copy + len); - std::sort(sequence_b_copy, sequence_b_copy + len); - - // Linearly compare elements - for (size_t i = 0; i < len; i++) { - assert(sequence_a_copy[i] == sequence_b_copy[i]); - } - // Free temp buffers - free(sequence_a_copy); - free(sequence_b_copy); -} - -/* Timing funtion */ -double W_time() { - timeval marker; - gettimeofday(&marker, NULL); - return ((double)(marker.tv_sec) * 1e6 + (double)(marker.tv_usec)) * 1e-6; -} - -using key_type = uint32_t; -using ref_counter_type = uint64_t; - -int main(int argc, char** argv) { - if (argc != 8) { - std::cerr << "usage: " << argv[0] - << " embedding_table_size cache_capacity_in_set embedding_vec_size query_length " - "iter_round num_threads cache_type" - << std::endl; - return -1; - } - - // Arguments - const size_t emb_size = atoi(argv[1]); - const size_t cache_capacity_in_set = atoi(argv[2]); - const size_t embedding_vec_size = atoi(argv[3]); - const size_t query_length = atoi(argv[4]); - const size_t iter_round = atoi(argv[5]); - const size_t num_threads = atoi(argv[6]); - const size_t cache_type = atoi(argv[7]); - - // Since cache is designed for single-gpu, all threads just use GPU 0 - CUDA_CHECK(cudaSetDevice(0)); - - // Host side buffers shared between threads - key_type* h_keys; // Buffer holding all keys in embedding table - float* h_vals; // Buffer holding all values in embedding table - - // host-only buffers placed in normal host memory - h_keys = (key_type*)malloc(emb_size * sizeof(key_type)); - h_vals = (float*)malloc(emb_size * embedding_vec_size * sizeof(float)); - - // The 
empty key to be used - const key_type empty_key = std::numeric_limits::max(); - gpu_cache::gpu_cache_api* cache = nullptr; - - // The cache to be used, by default the set hasher is based on MurMurHash and slab hasher is based - // on Mod. - - if (cache_type == 0) { - using Cache_ = - gpu_cache::gpu_cache; - cache = new Cache_(cache_capacity_in_set, embedding_vec_size); - } else { - cache = new HugeCTR::EmbeddingCacheWrapper(cache_capacity_in_set, embedding_vec_size); - } - - // For random unique keys generation - IntGenerator gen_key; - float increase = 0.1f; - - // Start 1st test - std::cout << "****************************************" << std::endl; - std::cout << "****************************************" << std::endl; - std::cout << "Start Single-GPU Thread-safe Query and Replace API test " << std::endl; - - // Timimg variables - double time_a; - double time_b; - - time_a = W_time(); - - std::cout << "Filling data" << std::endl; - // Generating random unique keys - gen_key.fill_unique(h_keys, emb_size, empty_key); - // Filling values vector according to the keys - fill_vec(h_keys, h_vals, embedding_vec_size, emb_size, increase); - - // Elapsed wall time - time_b = W_time() - time_a; - std::cout << "The Elapsed time for filling data is: " << time_b << "sec." << std::endl; - - // Insert pairs to embedding table (CPU hashtable) - time_a = W_time(); - - std::cout << "Filling embedding table" << std::endl; - std::unordered_map> h_emb_table; - for (size_t i = 0; i < emb_size; i++) { - std::vector emb_vec(embedding_vec_size); - for (size_t j = 0; j < embedding_vec_size; j++) { - emb_vec[j] = h_vals[i * embedding_vec_size + j]; - } - h_emb_table.emplace(h_keys[i], emb_vec); - } - - // Elapsed wall time - time_b = W_time() - time_a; - std::cout << "The Elapsed time for filling embedding table is: " << time_b << "sec." 
<< std::endl; - - // Free value buffer - free(h_vals); - -#pragma omp parallel default(none) \ - shared(h_keys, cache, h_emb_table, increase, embedding_vec_size, query_length, emb_size, \ - iter_round, std::cout, cache_type) num_threads(num_threads) - { - // The thread ID for this thread - int thread_id = omp_get_thread_num(); - printf("Worker %d starts testing cache.\n", thread_id); - // Since cache is designed for single-gpu, all threads just use GPU 0 - CUDA_CHECK(cudaSetDevice(0)); - - // Thread-private host side buffers - size_t* h_query_keys_index; // Buffer holding index for keys to be queried - key_type* h_query_keys; // Buffer holding keys to be queried - float* h_vals_retrieved; // Buffer holding values retrieved from query - key_type* h_missing_keys; // Buffer holding missing keys from query - float* h_missing_vals; // Buffer holding values for missing keys - uint64_t* h_missing_index; // Buffers holding missing index from query - size_t h_missing_len; // missing length - - // Thread-private device side buffers - key_type* d_query_keys; // Buffer holding keys to be queried - float* d_vals_retrieved; // Buffer holding values retrieved from query - key_type* d_missing_keys; // Buffer holding missing keys from query - float* d_missing_vals; // Buffer holding values for missing keys - uint64_t* d_missing_index; // Buffers holding missing index from query - size_t* d_missing_len; // missing length - - // host-only buffers placed in normal host memory - h_query_keys_index = (size_t*)malloc(query_length * sizeof(size_t)); - // host-device interactive buffers placed in pinned memory - CUDA_CHECK(cudaHostAlloc((void**)&h_query_keys, query_length * sizeof(key_type), - cudaHostAllocPortable)); - CUDA_CHECK(cudaHostAlloc((void**)&h_vals_retrieved, - query_length * embedding_vec_size * sizeof(float), - cudaHostAllocPortable)); - CUDA_CHECK(cudaHostAlloc((void**)&h_missing_keys, query_length * sizeof(key_type), - cudaHostAllocPortable)); - CUDA_CHECK(cudaHostAlloc((void**)&h_missing_vals, - query_length * embedding_vec_size * sizeof(float), - cudaHostAllocPortable)); - CUDA_CHECK(cudaHostAlloc((void**)&h_missing_index, query_length * sizeof(uint64_t), - cudaHostAllocPortable)); - - // Allocate device side buffers - CUDA_CHECK(cudaMalloc((void**)&d_query_keys, query_length * sizeof(key_type))); - CUDA_CHECK( - cudaMalloc((void**)&d_vals_retrieved, query_length * embedding_vec_size * sizeof(float))); - CUDA_CHECK(cudaMalloc((void**)&d_missing_keys, query_length * sizeof(key_type))); - CUDA_CHECK( - cudaMalloc((void**)&d_missing_vals, query_length * embedding_vec_size * sizeof(float))); - CUDA_CHECK(cudaMalloc((void**)&d_missing_index, query_length * sizeof(uint64_t))); - CUDA_CHECK(cudaMalloc((void**)&d_missing_len, sizeof(size_t))); - - // Thread-private CUDA stream, all threads just use the #0 device - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - // Timimg variables - double time_1; - double time_2; - - /****************************************************************************** - ******************************************************************************* - ********************************Test start************************************* - ******************************************************************************* - *******************************************************************************/ - - // Normal-distribution random number generator - size_t foot_print = emb_size - 1; // Memory footprint for access the keys within the key buffer - double mean = 
(double)(emb_size / 2); // mean for normal distribution - double dev = (double)(2 * query_length); // dev for normal distribution - // IntGenerator uni_gen(0, foot_print); - // Normal-distribution random number generator - IntGenerator_normal normal_gen(mean, dev); - - // Start normal distribution cache test - printf("Worker %d : normal distribution test start.\n", thread_id); - for (size_t i = 0; i < iter_round; i++) { - // Generate random index with normal-distribution - normal_gen.fill_unique(h_query_keys_index, query_length, 0, foot_print); - // Select keys from total keys buffer with the index - for (size_t j = 0; j < query_length; j++) { - h_query_keys[j] = h_keys[h_query_keys_index[j]]; - // std::cout << h_query_keys[j] << " "; - } - std::cout << std::endl; - - // Copy the keys to GPU memory - CUDA_CHECK(cudaMemcpyAsync(d_query_keys, h_query_keys, query_length * sizeof(key_type), - cudaMemcpyHostToDevice, stream)); - // Wait for stream to complete - CUDA_CHECK(cudaStreamSynchronize(stream)); - // Record time - time_1 = W_time(); - // Get pairs from hashtable - cache->Query(d_query_keys, query_length, d_vals_retrieved, d_missing_index, d_missing_keys, - d_missing_len, stream); - // Wait for stream to complete - CUDA_CHECK(cudaStreamSynchronize(stream)); - // Elapsed wall time - time_2 = W_time() - time_1; - printf("Worker %d : The Elapsed time for %zu round normal-distribution query is: %f sec.\n", - thread_id, i, time_2); - - // Copy the data back to host - CUDA_CHECK(cudaMemcpyAsync(h_vals_retrieved, d_vals_retrieved, - query_length * embedding_vec_size * sizeof(float), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(h_missing_index, d_missing_index, query_length * sizeof(uint64_t), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(h_missing_keys, d_missing_keys, query_length * sizeof(key_type), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - printf("Worker %d : %zu round : Missing key: %zu. 
Hit rate: %f %%.\n", thread_id, i, - h_missing_len, 100.0f - (((float)h_missing_len / (float)query_length) * 100.0f)); - - // Get missing key from embedding table - // Insert missing values into the retrieved value buffer - // Record time - time_1 = W_time(); - for (size_t missing_idx = 0; missing_idx < h_missing_len; missing_idx++) { - auto result = h_emb_table.find(h_missing_keys[missing_idx]); - for (size_t emb_vec_idx = 0; emb_vec_idx < embedding_vec_size; emb_vec_idx++) { - h_missing_vals[missing_idx * embedding_vec_size + emb_vec_idx] = - (result->second)[emb_vec_idx]; - h_vals_retrieved[h_missing_index[missing_idx] * embedding_vec_size + emb_vec_idx] = - (result->second)[emb_vec_idx]; - } - } - // Elapsed wall time - time_2 = W_time() - time_1; - printf( - "Worker %d : The Elapsed time for %zu round normal-distribution fetching missing data " - "is: %f sec.\n", - thread_id, i, time_2); - - // Copy the missing value to device - CUDA_CHECK(cudaMemcpyAsync(d_missing_vals, h_missing_vals, - query_length * embedding_vec_size * sizeof(float), - cudaMemcpyHostToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(d_vals_retrieved, h_vals_retrieved, - query_length * embedding_vec_size * sizeof(float), - cudaMemcpyHostToDevice, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - // Record time - time_1 = W_time(); - // Replace the missing pairs into the cache - if (cache_type == 0) - cache->Replace(d_missing_keys, h_missing_len, d_missing_vals, stream); - else - cache->Replace(d_query_keys, query_length, d_vals_retrieved, stream); - // Wait for stream to complete - CUDA_CHECK(cudaStreamSynchronize(stream)); - // Elapsed wall time - time_2 = W_time() - time_1; - printf("Worker %d : The Elapsed time for %zu round normal-distribution replace is: %f sec.\n", - thread_id, i, time_2); - - // Verification: Check for correctness for retrieved pairs - check_result(h_query_keys, h_vals_retrieved, embedding_vec_size, query_length, increase); - printf( - "Worker %d : Result check for %zu round normal-distribution query+replace " - "successfully!\n", - thread_id, i); - } - - printf("Worker %d : All Finished!\n", thread_id); - - // Clean-up - cudaStreamDestroy(stream); - free(h_query_keys_index); - CUDA_CHECK(cudaFreeHost(h_query_keys)); - CUDA_CHECK(cudaFreeHost(h_vals_retrieved)); - CUDA_CHECK(cudaFreeHost(h_missing_keys)); - CUDA_CHECK(cudaFreeHost(h_missing_vals)); - CUDA_CHECK(cudaFreeHost(h_missing_index)); - - CUDA_CHECK(cudaFree(d_query_keys)); - CUDA_CHECK(cudaFree(d_vals_retrieved)); - CUDA_CHECK(cudaFree(d_missing_keys)); - CUDA_CHECK(cudaFree(d_missing_vals)); - CUDA_CHECK(cudaFree(d_missing_index)); - CUDA_CHECK(cudaFree(d_missing_len)); - } - - // 1st test Clean-up - free(h_keys); - delete cache; - - // Start 2nd test - std::cout << "****************************************" << std::endl; - std::cout << "****************************************" << std::endl; - std::cout << "Start Single-GPU Thread-safe Update and Dump API test " << std::endl; - - // The key and value buffer that contains all the keys and values to be inserted into the cache - h_keys = - (key_type*)malloc(SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type)); - h_vals = (float*)malloc(SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * - embedding_vec_size * sizeof(float)); - float* h_new_vals = (float*)malloc(SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * - embedding_vec_size * sizeof(float)); - - // The cache object to be tested - if (cache_type == 0) { - using Cache_ = - 
gpu_cache::gpu_cache; - cache = new Cache_(cache_capacity_in_set, embedding_vec_size); - } else { - cache = new HugeCTR::EmbeddingCacheWrapper(cache_capacity_in_set, embedding_vec_size); - } - - // Key generator - KeyGenerator cache_key_gen; - // Generating random unique keys - cache_key_gen.fill_unique(h_keys, SLAB_SIZE * SET_ASSOCIATIVITY, cache_capacity_in_set, - empty_key); - // Filling values vector according to the keys - fill_vec(h_keys, h_vals, embedding_vec_size, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, increase); - // Set new value - increase = 1.0f; - // Filling values vector according to the keys - fill_vec(h_keys, h_new_vals, embedding_vec_size, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, increase); - - // Host-side buffers - // Buffers holding keys and values to be inserted, each time insert 1 slab to every slabset - key_type* h_insert_keys; - float* h_insert_vals; - // Buffers holding keys and values dumped and retrieved from the cache - key_type* h_dump_keys; - float* h_vals_retrieved; - size_t h_dump_counter; - size_t h_missing_len; - key_type* h_acc_keys; - - // Device-side buffers - key_type* d_keys; - float* d_vals; - // Buffers holding keys and values to be inserted, each time insert 1 slab to every slabset - key_type* d_insert_keys; - float* d_insert_vals; - // Buffers holding keys and values dumped and retrieved from the cache - key_type* d_dump_keys; - float* d_vals_retrieved; - size_t* d_dump_counter; - uint64_t* d_missing_index; - key_type* d_missing_keys; - size_t* d_missing_len; - - CUDA_CHECK(cudaHostAlloc((void**)&h_insert_keys, - SLAB_SIZE * cache_capacity_in_set * sizeof(key_type), - cudaHostAllocPortable)); - CUDA_CHECK(cudaHostAlloc((void**)&h_insert_vals, - SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float), - cudaHostAllocPortable)); - CUDA_CHECK(cudaHostAlloc((void**)&h_dump_keys, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type), - cudaHostAllocPortable)); - CUDA_CHECK(cudaHostAlloc( - (void**)&h_vals_retrieved, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float), - cudaHostAllocPortable)); - CUDA_CHECK(cudaHostAlloc((void**)&h_acc_keys, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type), - cudaHostAllocPortable)); - - CUDA_CHECK(cudaMalloc((void**)&d_keys, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type))); - CUDA_CHECK(cudaMalloc((void**)&d_vals, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * - embedding_vec_size * sizeof(float))); - CUDA_CHECK( - cudaMalloc((void**)&d_insert_keys, SLAB_SIZE * cache_capacity_in_set * sizeof(key_type))); - CUDA_CHECK(cudaMalloc((void**)&d_insert_vals, - SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float))); - CUDA_CHECK(cudaMalloc((void**)&d_dump_keys, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type))); - CUDA_CHECK(cudaMalloc( - (void**)&d_vals_retrieved, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float))); - CUDA_CHECK(cudaMalloc((void**)&d_dump_counter, sizeof(size_t))); - CUDA_CHECK(cudaMalloc((void**)&d_missing_index, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(uint64_t))); - CUDA_CHECK(cudaMalloc((void**)&d_missing_keys, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type))); - CUDA_CHECK(cudaMalloc((void**)&d_missing_len, sizeof(size_t))); - - // CUDA stream - cudaStream_t stream; - 
CUDA_CHECK(cudaStreamCreate(&stream)); - - // Copy all keys and values from host to device - CUDA_CHECK(cudaMemcpyAsync( - d_keys, h_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * sizeof(key_type), - cudaMemcpyHostToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync( - d_vals, h_new_vals, - SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set * embedding_vec_size * sizeof(float), - cudaMemcpyHostToDevice, stream)); - // Wait for stream to complete - CUDA_CHECK(cudaStreamSynchronize(stream)); - - // Each time insert 1 slab per slabset into the cache and check result - for (size_t i = 0; i < SET_ASSOCIATIVITY; i++) { - // Prepare the keys and values to be inserted - for (size_t j = 0; j < cache_capacity_in_set; j++) { - memcpy(h_insert_keys + j * SLAB_SIZE, - h_keys + j * SLAB_SIZE * SET_ASSOCIATIVITY + i * SLAB_SIZE, - SLAB_SIZE * sizeof(key_type)); - memcpy(h_insert_vals + j * SLAB_SIZE * embedding_vec_size, - h_vals + (j * SLAB_SIZE * SET_ASSOCIATIVITY + i * SLAB_SIZE) * embedding_vec_size, - SLAB_SIZE * embedding_vec_size * sizeof(float)); - } - // Copy the selected keys to accumulate buffer - memcpy(h_acc_keys + SLAB_SIZE * cache_capacity_in_set * i, h_insert_keys, - SLAB_SIZE * cache_capacity_in_set * sizeof(key_type)); - - // Copy the pairs from host to device - CUDA_CHECK(cudaMemcpyAsync(d_insert_keys, h_insert_keys, - SLAB_SIZE * cache_capacity_in_set * sizeof(key_type), - cudaMemcpyHostToDevice, stream)); - CUDA_CHECK( - cudaMemcpyAsync(d_insert_vals, h_insert_vals, - SLAB_SIZE * cache_capacity_in_set * embedding_vec_size * sizeof(float), - cudaMemcpyHostToDevice, stream)); - // Insert the pairs into the cache - cache->Replace(d_insert_keys, SLAB_SIZE * cache_capacity_in_set, d_insert_vals, stream); - // Wait for stream to complete - CUDA_CHECK(cudaStreamSynchronize(stream)); - - // Record time - time_a = W_time(); - // Update the new values to the cache(including missing keys) - cache->Update(d_keys, SLAB_SIZE * SET_ASSOCIATIVITY * cache_capacity_in_set, d_vals, stream, - SLAB_SIZE); - // Wait for stream to complete - CUDA_CHECK(cudaStreamSynchronize(stream)); - // Elapsed wall time - time_b = W_time() - time_a; - printf("The Elapsed time for %zu round update is: %f sec.\n", i, time_b); - bool check_dump = false; - if (check_dump) { - // Record time - time_a = W_time(); - // Dump the keys from the cache - cache->Dump(d_dump_keys, d_dump_counter, 0, cache_capacity_in_set, stream); - // Wait for stream to complete - CUDA_CHECK(cudaStreamSynchronize(stream)); - // Elapsed wall time - time_b = W_time() - time_a; - printf("The Elapsed time for %zu round dump is: %f sec.\n", i, time_b); - - // Copy the dump counter from device to host - CUDA_CHECK(cudaMemcpyAsync(&h_dump_counter, d_dump_counter, sizeof(size_t), - cudaMemcpyDeviceToHost, stream)); - // Wait for stream to complete - CUDA_CHECK(cudaStreamSynchronize(stream)); - // Check the dump counter - assert(h_dump_counter == SLAB_SIZE * cache_capacity_in_set * (i + 1)); - // Query all the dumped keys from the cache - cache->Query(d_dump_keys, h_dump_counter, d_vals_retrieved, d_missing_index, d_missing_keys, - d_missing_len, stream); - // Copy result from device to host - CUDA_CHECK(cudaMemcpyAsync(h_dump_keys, d_dump_keys, h_dump_counter * sizeof(key_type), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(h_vals_retrieved, d_vals_retrieved, - h_dump_counter * embedding_vec_size * sizeof(float), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(&h_missing_len, d_missing_len, sizeof(size_t), - 
cudaMemcpyDeviceToHost, stream)); - // Wait for stream to complete - CUDA_CHECK(cudaStreamSynchronize(stream)); - // Check result - assert(h_missing_len == 0); - compare_key(h_dump_keys, h_acc_keys, h_dump_counter); - check_result(h_dump_keys, h_vals_retrieved, embedding_vec_size, h_dump_counter, increase); - } - } - - printf("Update and Dump API test all finished!\n"); - - // 2nd test clean-up - CUDA_CHECK(cudaStreamDestroy(stream)); - free(h_keys); - free(h_vals); - free(h_new_vals); - - CUDA_CHECK(cudaFreeHost(h_insert_keys)); - CUDA_CHECK(cudaFreeHost(h_insert_vals)); - CUDA_CHECK(cudaFreeHost(h_dump_keys)); - CUDA_CHECK(cudaFreeHost(h_vals_retrieved)); - CUDA_CHECK(cudaFreeHost(h_acc_keys)); - - CUDA_CHECK(cudaFree(d_keys)); - CUDA_CHECK(cudaFree(d_vals)); - CUDA_CHECK(cudaFree(d_insert_keys)); - CUDA_CHECK(cudaFree(d_insert_vals)); - CUDA_CHECK(cudaFree(d_dump_keys)); - CUDA_CHECK(cudaFree(d_vals_retrieved)); - CUDA_CHECK(cudaFree(d_dump_counter)); - CUDA_CHECK(cudaFree(d_missing_index)); - CUDA_CHECK(cudaFree(d_missing_keys)); - CUDA_CHECK(cudaFree(d_missing_len)); - - delete cache; - - return 0; -} From 1c0ff2c924be85104a54f5cc501bda7bc031f461 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 14 Aug 2024 22:26:01 -0400 Subject: [PATCH 19/78] [GraphBolt][CUDA] Keep CUDAStream only if cuda is available. (#7701) --- graphbolt/include/graphbolt/async.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/graphbolt/include/graphbolt/async.h b/graphbolt/include/graphbolt/async.h index 3e9480fd2516..0d8f42470068 100644 --- a/graphbolt/include/graphbolt/async.h +++ b/graphbolt/include/graphbolt/async.h @@ -39,6 +39,7 @@ #ifdef GRAPHBOLT_USE_CUDA #include #include +#include #endif namespace graphbolt { @@ -111,13 +112,23 @@ template inline auto async(F&& function) { using T = decltype(function()); #ifdef GRAPHBOLT_USE_CUDA - auto stream = c10::cuda::getCurrentCUDAStream(); + const auto is_cuda_available = torch::cuda::is_available(); + struct c10::StreamData3 stream_data; + if (is_cuda_available) { + stream_data = c10::cuda::getCurrentCUDAStream().pack3(); + } #endif auto fn = [=, func = std::move(function)] { #ifdef GRAPHBOLT_USE_CUDA // We make sure to use the same CUDA stream as the thread launching the // async operation. - c10::cuda::CUDAStreamGuard guard(stream); + if (is_cuda_available) { + auto stream = c10::cuda::CUDAStream::unpack3( + stream_data.stream_id, stream_data.device_index, + stream_data.device_type); + c10::cuda::CUDAStreamGuard guard(stream); + return func(); + } #endif return func(); }; From 8a38fc9799a8075df04c8e443610b1481fa39602 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 14 Aug 2024 22:28:27 -0400 Subject: [PATCH 20/78] [GraphBolt][CUDA] Enable recent optimizations in the examples. (#7702) --- examples/graphbolt/link_prediction.py | 5 ++++- examples/graphbolt/node_classification.py | 1 + examples/graphbolt/pyg/labor/node_classification.py | 1 + .../graphbolt/pyg/node_classification_advanced.py | 1 + examples/graphbolt/rgcn/hetero_rgcn.py | 12 +++++++++--- 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/examples/graphbolt/link_prediction.py b/examples/graphbolt/link_prediction.py index a48fbc25f115..60d48b57fc2d 100644 --- a/examples/graphbolt/link_prediction.py +++ b/examples/graphbolt/link_prediction.py @@ -182,7 +182,10 @@ def create_dataloader(args, graph, features, itemset, is_train=True): # Initialize a neighbor sampler for sampling the neighborhoods of nodes. 
############################################################################ datapipe = datapipe.sample_neighbor( - graph, args.fanout if is_train else [-1] + graph, + args.fanout if is_train else [-1], + overlap_fetch=args.storage_device == "pinned", + asynchronous=args.storage_device != "cpu", ) ############################################################################ diff --git a/examples/graphbolt/node_classification.py b/examples/graphbolt/node_classification.py index e5e17de88bde..a4a8be298d2c 100644 --- a/examples/graphbolt/node_classification.py +++ b/examples/graphbolt/node_classification.py @@ -120,6 +120,7 @@ def create_dataloader( graph, fanout if job != "infer" else [-1], overlap_fetch=args.storage_device == "pinned", + asynchronous=args.storage_device != "cpu", ) ############################################################################ diff --git a/examples/graphbolt/pyg/labor/node_classification.py b/examples/graphbolt/pyg/labor/node_classification.py index 09f8cb3cf050..ab0c843fcec8 100644 --- a/examples/graphbolt/pyg/labor/node_classification.py +++ b/examples/graphbolt/pyg/labor/node_classification.py @@ -150,6 +150,7 @@ def create_dataloader( graph, fanout if job != "infer" else [-1], overlap_fetch=args.overlap_graph_fetch, + asynchronous=args.graph_device != "cpu", **kwargs, ) # Copy the data to the specified device. diff --git a/examples/graphbolt/pyg/node_classification_advanced.py b/examples/graphbolt/pyg/node_classification_advanced.py index 3b066a511b32..27bc82275d77 100644 --- a/examples/graphbolt/pyg/node_classification_advanced.py +++ b/examples/graphbolt/pyg/node_classification_advanced.py @@ -200,6 +200,7 @@ def create_dataloader( overlap_fetch=args.overlap_graph_fetch, num_gpu_cached_edges=args.num_gpu_cached_edges, gpu_cache_threshold=args.gpu_graph_caching_threshold, + asynchronous=args.graph_device != "cpu", ) # Copy the data to the specified device. if args.feature_device != "cpu" and need_copy: diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index eec00e12f11f..60ab51602ca1 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -125,7 +125,10 @@ def create_dataloader( # `fanouts`: # The number of neighbors to sample for each node in each layer. datapipe = datapipe.sample_neighbor( - graph, fanouts=fanouts, overlap_fetch=args.overlap_graph_fetch + graph, + fanouts=fanouts, + overlap_fetch=args.overlap_graph_fetch, + asynchronous=args.asynchronous, ) # Fetch the features for each node in the mini-batch. @@ -571,10 +574,13 @@ def main(args): # Move the dataset to the pinned memory to enable GPU access. args.overlap_graph_fetch = False + args.asynchronous = False if device == torch.device("cuda"): - g.pin_memory_() - features.pin_memory_() + g = g.pin_memory_() + features = features.pin_memory_() + # Enable optimizations for sampling on the GPU. 
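The pattern these example updates share is worth spelling out once: `overlap_fetch` is turned on when the graph lives in pinned memory, and the `asynchronous` flag whenever the graph is not on plain CPU storage. The snippet below is only an illustrative sketch of that flag logic, not part of the patches; `graph`, `features`, `itemset`, `device`, and `storage_device` are placeholder names assumed to be set up the same way as in the examples above.

from dgl import graphbolt as gb

# Sketch only: mirrors the overlap_fetch/asynchronous logic added above.
datapipe = gb.ItemSampler(itemset, batch_size=1024, shuffle=True)
if storage_device != "cpu":
    datapipe = datapipe.copy_to(device)
datapipe = datapipe.sample_neighbor(
    graph,
    [10, 10],                                  # assumed fanout per layer
    overlap_fetch=storage_device == "pinned",  # overlap graph structure fetches
    asynchronous=storage_device != "cpu",      # flag enabled by these patches
)
datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"])
if storage_device == "cpu":
    datapipe = datapipe.copy_to(device)
dataloader = gb.DataLoader(datapipe)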
args.overlap_graph_fetch = True + args.asynchronous = True feat_size = features.size("node", "paper", "feat")[0] From c6fa8f446a123a93b2f78f3718ea062055ca755d Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 15 Aug 2024 16:19:13 +0800 Subject: [PATCH 21/78] [dev] add more info about load failure of GraphBolt (#7684) --- python/dgl/graphbolt/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/dgl/graphbolt/__init__.py b/python/dgl/graphbolt/__init__.py index 2c980dd532da..7506e24c6f54 100644 --- a/python/dgl/graphbolt/__init__.py +++ b/python/dgl/graphbolt/__init__.py @@ -64,7 +64,12 @@ def load_graphbolt(): path = os.path.join(dirname, "graphbolt", basename) if not os.path.exists(path): raise FileNotFoundError( - f"Cannot find DGL C++ graphbolt library at {path}" + f"Unable to locate the DGL C++ GraphBolt library at {path}. This " + "error typically occurs due to a version mismatch between the " + "installed DGL and the PyTorch version you are currently using. " + "Please ensure that your DGL installation is compatible with your " + "PyTorch version. For more information, refer to the installation " + "guide at https://www.dgl.ai/pages/start.html." ) try: From fcbdcceb9e209463a35d24013ea15aac4e5567f6 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 15 Aug 2024 13:56:48 -0400 Subject: [PATCH 22/78] [GraphBolt][CUDA] Eliminate GPUCache synchronization. (#7705) --- graphbolt/src/cuda/extension/gpu_cache.cu | 8 +++++ graphbolt/src/cuda/extension/gpu_cache.h | 4 +++ graphbolt/src/python_binding.cc | 1 + python/dgl/graphbolt/impl/gpu_cache.py | 32 ++++++++++++++++--- .../dgl/graphbolt/impl/gpu_cached_feature.py | 8 +++-- 5 files changed, 46 insertions(+), 7 deletions(-) diff --git a/graphbolt/src/cuda/extension/gpu_cache.cu b/graphbolt/src/cuda/extension/gpu_cache.cu index 8abe5eec71f5..7e280187976a 100644 --- a/graphbolt/src/cuda/extension/gpu_cache.cu +++ b/graphbolt/src/cuda/extension/gpu_cache.cu @@ -76,6 +76,14 @@ std::tuple GpuCache::Query( return std::make_tuple(values, missing_index, missing_keys); } +c10::intrusive_ptr>> GpuCache::QueryAsync( + torch::Tensor keys) { + return async([=] { + auto [values, missing_index, missing_keys] = Query(keys); + return std::vector{values, missing_index, missing_keys}; + }); +} + void GpuCache::Replace(torch::Tensor keys, torch::Tensor values) { TORCH_CHECK(keys.device().is_cuda(), "Keys should be on a CUDA device."); TORCH_CHECK( diff --git a/graphbolt/src/cuda/extension/gpu_cache.h b/graphbolt/src/cuda/extension/gpu_cache.h index 556bdaa5b5bd..6ca2b12995a7 100644 --- a/graphbolt/src/cuda/extension/gpu_cache.h +++ b/graphbolt/src/cuda/extension/gpu_cache.h @@ -21,6 +21,7 @@ #ifndef GRAPHBOLT_GPU_CACHE_H_ #define GRAPHBOLT_GPU_CACHE_H_ +#include #include #include @@ -53,6 +54,9 @@ class GpuCache : public torch::CustomClassHolder { std::tuple Query( torch::Tensor keys); + c10::intrusive_ptr>> QueryAsync( + torch::Tensor keys); + void Replace(torch::Tensor keys, torch::Tensor values); static c10::intrusive_ptr Create( diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index 9e017dd1df3d..62822c28a478 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -109,6 +109,7 @@ TORCH_LIBRARY(graphbolt, m) { #ifdef GRAPHBOLT_USE_CUDA m.class_("GpuCache") .def("query", &cuda::GpuCache::Query) + .def("query_async", &cuda::GpuCache::QueryAsync) .def("replace", &cuda::GpuCache::Replace); m.def("gpu_cache", 
&cuda::GpuCache::Create); m.class_("GpuGraphCache") diff --git a/python/dgl/graphbolt/impl/gpu_cache.py b/python/dgl/graphbolt/impl/gpu_cache.py index 7c07e7c52a0b..413fa5527a7a 100644 --- a/python/dgl/graphbolt/impl/gpu_cache.py +++ b/python/dgl/graphbolt/impl/gpu_cache.py @@ -14,13 +14,16 @@ def __init__(self, cache_shape, dtype): self.total_miss = 0 self.total_queries = 0 - def query(self, keys): + def query(self, keys, async_op=False): """Queries the GPU cache. Parameters ---------- keys : Tensor The keys to query the GPU cache with. + async_op: bool + Boolean indicating whether the call is asynchronous. If so, the + result can be obtained by calling wait on the returned future. Returns ------- @@ -29,10 +32,29 @@ def query(self, keys): values[missing_indices] corresponds to cache misses that should be filled by quering another source with missing_keys. """ - self.total_queries += keys.shape[0] - values, missing_index, missing_keys = self._cache.query(keys) - self.total_miss += missing_keys.shape[0] - return values, missing_index, missing_keys + + class _Waiter: + def __init__(self, gpu_cache, future): + self.gpu_cache = gpu_cache + self.future = future + + def wait(self): + """Returns the stored value when invoked.""" + gpu_cache = self.gpu_cache + values, missing_index, missing_keys = ( + self.future.wait() if async_op else self.future + ) + # Ensure there is no leak. + self.gpu_cache = self.future = None + + gpu_cache.total_queries += values.shape[0] + gpu_cache.total_miss += missing_keys.shape[0] + return values, missing_index, missing_keys + + if async_op: + return _Waiter(self, self._cache.query_async(keys)) + else: + return _Waiter(self, self._cache.query(keys)).wait() def replace(self, keys, values): """Inserts key-value pairs into the GPU cache using the Least-Recently diff --git a/python/dgl/graphbolt/impl/gpu_cached_feature.py b/python/dgl/graphbolt/impl/gpu_cached_feature.py index d8fb36add1da..e19c8752fa2a 100644 --- a/python/dgl/graphbolt/impl/gpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/gpu_cached_feature.py @@ -114,7 +114,11 @@ def read_async(self, ids: torch.Tensor): >>> assert stage + 1 == feature.read_async_num_stages(ids.device) >>> result = future.wait() # result contains the read values. """ - values, missing_index, missing_keys = self._feature.query(ids) + future = self._feature.query(ids, async_op=True) + + yield + + values, missing_index, missing_keys = future.wait() fallback_reader = self._fallback_feature.read_async(missing_keys) fallback_num_stages = self._fallback_feature.read_async_num_stages( @@ -175,7 +179,7 @@ def read_async_num_stages(self, ids_device: torch.device): The number of stages of the read_async operation. """ assert ids_device.type == "cuda" - return self._fallback_feature.read_async_num_stages(ids_device) + return 1 + self._fallback_feature.read_async_num_stages(ids_device) def size(self): """Get the size of the feature. From db574f5b0b4a606539b0b1437d0e53c4a124b1d1 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 15 Aug 2024 14:48:15 -0400 Subject: [PATCH 23/78] [GraphBolt][CUDA] Enable asynchronous sampling for multigpu example. 
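For reference, the new `async_op` path of `GPUCache.query` shown above can also be driven by hand. This is a rough sketch rather than part of the patch: `cache` is assumed to be an already populated `GPUCache` from `impl/gpu_cache.py`, `ids` a key tensor on the GPU, and `read_fallback` a hypothetical lookup into slower storage.

# Sketch only: overlap the cache lookup with other host-side work.
future = cache.query(ids, async_op=True)      # returns a waiter, does not block
# ... unrelated work can run here while the query is in flight ...
values, missing_index, missing_keys = future.wait()
missing_values = read_fallback(missing_keys)  # hypothetical fallback read
values[missing_index] = missing_values        # patch the cache misses
cache.replace(missing_keys, missing_values)   # insert them for future queries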
(#7706) --- examples/multigpu/graphbolt/node_classification.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py index 35ae6fcc38d4..6c5ce52942dc 100644 --- a/examples/multigpu/graphbolt/node_classification.py +++ b/examples/multigpu/graphbolt/node_classification.py @@ -135,7 +135,10 @@ def create_dataloader( if args.storage_device != "cpu": datapipe = datapipe.copy_to(device) datapipe = datapipe.sample_neighbor( - graph, args.fanout, overlap_fetch=args.storage_device == "pinned" + graph, + args.fanout, + overlap_fetch=args.storage_device == "pinned", + asynchronous=args.storage_device != "cpu", ) datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"]) if args.storage_device == "cpu": From 9c874d0219bf02561340032836ccfd806cf5048a Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 15 Aug 2024 20:39:58 -0400 Subject: [PATCH 24/78] [GraphBolt][CUDA] Use async for GPUGraphCache. (#7707) --- .../src/cuda/extension/gpu_graph_cache.cu | 22 +++++++++ .../src/cuda/extension/gpu_graph_cache.h | 16 +++++- graphbolt/src/python_binding.cc | 15 +++++- python/dgl/graphbolt/impl/gpu_graph_cache.py | 39 +++++++++++++++ python/dgl/graphbolt/impl/neighbor_sampler.py | 49 ++++++++++++------- .../pytorch/graphbolt/test_dataloader.py | 4 ++ 6 files changed, 125 insertions(+), 20 deletions(-) diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.cu b/graphbolt/src/cuda/extension/gpu_graph_cache.cu index 3d3e403b1365..0aec5949343d 100644 --- a/graphbolt/src/cuda/extension/gpu_graph_cache.cu +++ b/graphbolt/src/cuda/extension/gpu_graph_cache.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -168,6 +169,7 @@ std::tuple GpuGraphCache::Query( seeds.device().index() == device_id_, "Seeds should be on the correct CUDA device."); TORCH_CHECK(seeds.sizes().size() == 1, "Keys should be a 1D tensor."); + std::lock_guard lock(mtx_); auto allocator = cuda::GetAllocator(); auto index_dtype = cached_edge_tensors_.at(0).scalar_type(); const dim3 block(kIntBlockSize); @@ -237,6 +239,12 @@ std::tuple GpuGraphCache::Query( })); } +c10::intrusive_ptr< + Future>> +GpuGraphCache::QueryAsync(torch::Tensor seeds) { + return async([=] { return Query(seeds); }); +} + std::tuple> GpuGraphCache::Replace( torch::Tensor seeds, torch::Tensor indices, torch::Tensor positions, int64_t num_hit, int64_t num_threshold, torch::Tensor indptr, @@ -250,6 +258,7 @@ std::tuple> GpuGraphCache::Replace( TORCH_CHECK( indptr.size(0) == num_nodes - num_hit + 1, "(indptr.size(0) == seeds.size(0) - num_hit + 1) failed."); + std::lock_guard lock(mtx_); const int64_t num_buffers = num_nodes * num_tensors; auto allocator = cuda::GetAllocator(); auto index_dtype = cached_edge_tensors_.at(0).scalar_type(); @@ -490,5 +499,18 @@ std::tuple> GpuGraphCache::Replace( })); } +c10::intrusive_ptr< + Future>>> +GpuGraphCache::ReplaceAsync( + torch::Tensor seeds, torch::Tensor indices, torch::Tensor positions, + int64_t num_hit, int64_t num_threshold, torch::Tensor indptr, + std::vector edge_tensors) { + return async([=] { + return Replace( + seeds, indices, positions, num_hit, num_threshold, indptr, + edge_tensors); + }); +} + } // namespace cuda } // namespace graphbolt diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.h b/graphbolt/src/cuda/extension/gpu_graph_cache.h index 74309cb4ab32..0708f5d00917 100644 --- a/graphbolt/src/cuda/extension/gpu_graph_cache.h +++ 
b/graphbolt/src/cuda/extension/gpu_graph_cache.h @@ -21,11 +21,11 @@ #ifndef GRAPHBOLT_GPU_GRAPH_CACHE_H_ #define GRAPHBOLT_GPU_GRAPH_CACHE_H_ +#include #include #include -#include -#include +#include namespace graphbolt { namespace cuda { @@ -69,6 +69,10 @@ class GpuGraphCache : public torch::CustomClassHolder { std::tuple Query( torch::Tensor seeds); + c10::intrusive_ptr< + Future>> + QueryAsync(torch::Tensor seeds); + /** * @brief After the graph structure for the missing node ids are fetched, it * inserts the node ids which passes the threshold and returns the final @@ -96,6 +100,13 @@ class GpuGraphCache : public torch::CustomClassHolder { int64_t num_hit, int64_t num_threshold, torch::Tensor indptr, std::vector edge_tensors); + c10::intrusive_ptr< + Future>>> + ReplaceAsync( + torch::Tensor seeds, torch::Tensor indices, torch::Tensor positions, + int64_t num_hit, int64_t num_threshold, torch::Tensor indptr, + std::vector edge_tensors); + static c10::intrusive_ptr Create( const int64_t num_edges, const int64_t threshold, torch::ScalarType indptr_dtype, std::vector dtypes); @@ -111,6 +122,7 @@ class GpuGraphCache : public torch::CustomClassHolder { torch::Tensor offset_; // The original graph's sliced_indptr tensor. std::vector cached_edge_tensors_; // The cached graph // structure edge tensors. + std::mutex mtx_; // Protects the data structure and makes it threadsafe. }; } // namespace cuda diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index 62822c28a478..df295bd718b7 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -58,6 +58,17 @@ TORCH_LIBRARY(graphbolt, m) { "wait", &Future>>::Wait); + m.class_>>( + "GpuGraphCacheQueryFuture") + .def( + "wait", + &Future>:: + Wait); + m.class_>>>( + "GpuGraphCacheReplaceFuture") + .def( + "wait", + &Future>>::Wait); m.class_("OnDiskNpyArray") .def("index_select", &storage::OnDiskNpyArray::IndexSelect); m.class_("FusedCSCSamplingGraph") @@ -114,7 +125,9 @@ TORCH_LIBRARY(graphbolt, m) { m.def("gpu_cache", &cuda::GpuCache::Create); m.class_("GpuGraphCache") .def("query", &cuda::GpuGraphCache::Query) - .def("replace", &cuda::GpuGraphCache::Replace); + .def("query_async", &cuda::GpuGraphCache::QueryAsync) + .def("replace", &cuda::GpuGraphCache::Replace) + .def("replace_async", &cuda::GpuGraphCache::ReplaceAsync); m.def("gpu_graph_cache", &cuda::GpuGraphCache::Create); #endif m.def("fused_csc_sampling_graph", &FusedCSCSamplingGraph::Create); diff --git a/python/dgl/graphbolt/impl/gpu_graph_cache.py b/python/dgl/graphbolt/impl/gpu_graph_cache.py index 502e72a11071..e4cf78b589af 100644 --- a/python/dgl/graphbolt/impl/gpu_graph_cache.py +++ b/python/dgl/graphbolt/impl/gpu_graph_cache.py @@ -68,6 +68,45 @@ def replace_functional(missing_indptr, missing_edge_tensors): return keys[index[num_hit:]], replace_functional + def query_async(self, keys): + """Queries the GPU cache asynchronously. + + Parameters + ---------- + keys : Tensor + The keys to query the GPU graph cache with. + + Returns + ------- + A generator object. + The returned generator object returns the missing keys on the second + invocation and expects the fetched indptr and edge tensors on the + next invocation. The third and last invocation returns a future + object and the return result can be accessed by calling `.wait()` + on the returned future object. It is undefined behavior to call + `.wait()` more than once. 
+ """ + future = self._cache.query_async(keys) + + yield + + index, position, num_hit, num_threshold = future.wait() + + self.total_queries += keys.shape[0] + self.total_miss += keys.shape[0] - num_hit + + missing_indptr, missing_edge_tensors = yield keys[index[num_hit:]] + + yield self._cache.replace_async( + keys, + index, + position, + num_hit, + num_threshold, + missing_indptr, + missing_edge_tensors, + ) + @property def miss_rate(self): """Returns the cache miss rate since creation.""" diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index b11f79ad5ae3..7b12b5b2d16f 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -32,18 +32,25 @@ @functional_datapipe("fetch_cached_insubgraph_data") class FetchCachedInsubgraphData(Mapper): - """Queries the GPUGraphCache and returns the missing seeds and a lambda - function that can be called with the fetched graph structure. + """Queries the GPUGraphCache and returns the missing seeds and a generator + handle that can be called with the fetched graph structure. """ def __init__(self, datapipe, gpu_graph_cache): - super().__init__(datapipe, self._fetch_per_layer) + datapipe = datapipe.transform(self._fetch_per_layer).buffer() + super().__init__(datapipe, self._wait_query_future) self.cache = gpu_graph_cache def _fetch_per_layer(self, minibatch): - minibatch._seeds, minibatch._replace = self.cache.query( - minibatch._seeds - ) + minibatch._async_handle = self.cache.query_async(minibatch._seeds) + # Start first stage + next(minibatch._async_handle) + + return minibatch + + @staticmethod + def _wait_query_future(minibatch): + minibatch._seeds = next(minibatch._async_handle) return minibatch @@ -55,7 +62,8 @@ class CombineCachedAndFetchedInSubgraph(Mapper): """ def __init__(self, datapipe, prob_name): - super().__init__(datapipe, self._combine_per_layer) + datapipe = datapipe.transform(self._combine_per_layer).buffer() + super().__init__(datapipe, self._wait_replace_future) self.prob_name = prob_name def _combine_per_layer(self, minibatch): @@ -69,16 +77,24 @@ def _combine_per_layer(self, minibatch): edge_tensors.append(probs_or_mask) edge_tensors.append(subgraph.edge_attribute(ORIGINAL_EDGE_ID)) - subgraph.csc_indptr, edge_tensors = minibatch._replace( - subgraph.csc_indptr, edge_tensors + minibatch._future = minibatch._async_handle.send( + (subgraph.csc_indptr, edge_tensors) ) - delattr(minibatch, "_replace") + delattr(minibatch, "_async_handle") + + return minibatch + + def _wait_replace_future(self, minibatch): + subgraph = minibatch._sliced_sampling_graph + subgraph.csc_indptr, edge_tensors = minibatch._future.wait() + delattr(minibatch, "_future") subgraph.indices = edge_tensors[0] edge_tensors = edge_tensors[1:] if subgraph.type_per_edge is not None: subgraph.type_per_edge = edge_tensors[0] edge_tensors = edge_tensors[1:] + probs_or_mask = subgraph.edge_attribute(self.prob_name) if probs_or_mask is not None: subgraph.add_edge_attribute(self.prob_name, edge_tensors[0]) edge_tensors = edge_tensors[1:] @@ -113,7 +129,7 @@ def _concat(self, minibatch): @functional_datapipe("fetch_insubgraph_data") -class FetchInsubgraphData(Mapper): +class FetchInsubgraphData(MiniBatchTransformer): """Fetches the insubgraph and wraps it in a FusedCSCSamplingGraph object. If the provided sample_per_layer_obj has a valid prob_name, then it reads the probabilies of all the fetched edges. 
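Driving the staged generator above end to end makes its contract easier to follow. The sketch below condenses what `FetchCachedInsubgraphData` and `CombineCachedAndFetchedInSubgraph` do across the pipeline; it is illustrative only, with `cache`, `seeds`, and `fetch_missing` standing in for the caller's own objects.

handle = cache.query_async(seeds)
next(handle)                          # stage 1: launch the cache query
missing_seeds = next(handle)          # stage 2: seeds absent from the cache
# The caller fetches the graph structure for the missing seeds on its own.
indptr, edge_tensors = fetch_missing(missing_seeds)
future = handle.send((indptr, edge_tensors))            # stage 3: schedule replace
combined_indptr, combined_edge_tensors = future.wait()  # cached + fetched result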
Furthermore, if type_per_array tensor @@ -131,9 +147,13 @@ def __init__( datapipe = datapipe.fetch_cached_insubgraph_data( graph._gpu_graph_cache ) + datapipe = datapipe.transform(self._fetch_per_layer) + datapipe = datapipe.buffer().wait() + if graph._gpu_graph_cache is not None: + datapipe = datapipe.combine_cached_and_fetched_insubgraph(prob_name) + super().__init__(datapipe) self.graph = graph self.prob_name = prob_name - super().__init__(datapipe, self._fetch_per_layer) def _fetch_per_layer(self, minibatch): stream = torch.cuda.current_stream() @@ -260,11 +280,6 @@ def __init__( self.returning_indices_is_optional = True elif overlap_fetch: datapipe = datapipe.fetch_insubgraph_data(graph, prob_name) - datapipe = datapipe.buffer().wait() - if graph._gpu_graph_cache is not None: - datapipe = datapipe.combine_cached_and_fetched_insubgraph( - prob_name - ) datapipe = datapipe.transform( self._sample_per_layer_from_fetched_subgraph ) diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index d727fc2300fe..4c8cc5b3f4f6 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -138,6 +138,10 @@ def test_gpu_sampling_DataLoader( awaiter_cnt += num_layers if asynchronous: bufferer_cnt += 2 * num_layers + if overlap_graph_fetch: + bufferer_cnt += 0 * num_layers + if num_gpu_cached_edges > 0: + bufferer_cnt += 2 * num_layers datapipe = dataloader.dataset datapipe_graph = traverse_dps(datapipe) awaiters = find_dps( From 1eb0f9c116a30fd0b2f408343519087e106924b6 Mon Sep 17 00:00:00 2001 From: "Hongzhi (Steve), Chen" Date: Fri, 16 Aug 2024 12:09:27 +0800 Subject: [PATCH 25/78] Clarify the docstring for original_edge_ids in fused_sampled_subgraph. (#7703) Co-authored-by: Ubuntu --- .../include/graphbolt/fused_sampled_subgraph.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/graphbolt/include/graphbolt/fused_sampled_subgraph.h b/graphbolt/include/graphbolt/fused_sampled_subgraph.h index 13f6eb452c44..98117777bbd4 100644 --- a/graphbolt/include/graphbolt/fused_sampled_subgraph.h +++ b/graphbolt/include/graphbolt/fused_sampled_subgraph.h @@ -49,7 +49,8 @@ struct FusedSampledSubgraph : torch::CustomClassHolder { * graph. * @param original_row_node_ids Column's reverse node ids in the original * graph. - * @param original_edge_ids Reverse edge ids in the original graph. + * @param original_edge_ids Mapping of subgraph edge IDs to original + * FusedCSCSamplingGraph edge IDs. * @param type_per_edge Type id of each edge. * @param etype_offsets Edge offsets for the sampled edges for the sampled * edges that are sorted w.r.t. edge types. @@ -91,10 +92,17 @@ struct FusedSampledSubgraph : torch::CustomClassHolder { torch::optional indices; /** - * @brief Reverse edge ids in the original graph, the edge with id - * `original_edge_ids[i]` in the original graph is mapped to `i` in this - * subgraph. This is useful when edge features are needed. The edges are - * sorted w.r.t. their edge types for the heterogenous case. + * @brief Mapping of subgraph edge IDs to original FusedCSCSamplingGraph + * edge IDs. + * + * In this subgraph, the edge at index i corresponds to the edge with ID + * original_edge_ids[i] in the original FusedCSCSamplingGraph. Edges are + * sorted by type for heterogeneous graphs. 
+ * + * Note: To retrieve the actual original edge IDs for feature fetching, use + * the `_ORIGINAL_EDGE_ID` edge attribute in FusedCSCSamplingGraph to map the + * `original_edge_ids` agin, as IDs may have been remapped during conversion + * to FusedCSCSamplingGraph. */ torch::Tensor original_edge_ids; From bca59242960bb8c731a0eec68a640eb9142f0c83 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 16 Aug 2024 00:10:25 -0400 Subject: [PATCH 26/78] [GraphBolt] Rename `sampled_edge_ids` for clarity. (#7704) --- .../impl/fused_csc_sampling_graph.py | 25 ++++++++++++------- python/dgl/graphbolt/impl/neighbor_sampler.py | 14 ++++++++--- .../graphbolt/impl/sampled_subgraph_impl.py | 13 +++++++--- 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index 79c28853d5d0..75f87d19aa2b 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -599,7 +599,9 @@ def _convert_to_sampled_subgraph( indices = C_sampled_subgraph.indices type_per_edge = C_sampled_subgraph.type_per_edge column = C_sampled_subgraph.original_column_node_ids - sampled_edge_ids = C_sampled_subgraph.original_edge_ids + edge_ids_in_fused_csc_sampling_graph = ( + C_sampled_subgraph.original_edge_ids + ) etype_offsets = C_sampled_subgraph.etype_offsets if etype_offsets is not None: etype_offsets = etype_offsets.tolist() @@ -610,17 +612,18 @@ def _convert_to_sampled_subgraph( ) original_edge_ids = ( torch.ops.graphbolt.index_select( - self.edge_attributes[ORIGINAL_EDGE_ID], sampled_edge_ids + self.edge_attributes[ORIGINAL_EDGE_ID], + edge_ids_in_fused_csc_sampling_graph, ) if has_original_eids - else sampled_edge_ids + else edge_ids_in_fused_csc_sampling_graph ) if type_per_edge is None and etype_offsets is None: # The sampled graph is already a homogeneous graph. sampled_csc = CSCFormatBase(indptr=indptr, indices=indices) if indices is not None: # Only needed to fetch indices. - sampled_edge_ids = None + edge_ids_in_fused_csc_sampling_graph = None else: offset = self._node_type_offset_list @@ -660,9 +663,9 @@ def _convert_to_sampled_subgraph( original_hetero_edge_ids[etype] = original_edge_ids[ eids ] - sampled_hetero_edge_ids = None + sampled_hetero_edge_ids_in_fused_csc_sampling_graph = None else: - sampled_hetero_edge_ids = {} + sampled_hetero_edge_ids_in_fused_csc_sampling_graph = {} edge_offsets = [0] for etype, etype_id in self.edge_type_to_id.items(): src_ntype, _, dst_ntype = etype_str_to_tuple(etype) @@ -693,14 +696,18 @@ def _convert_to_sampled_subgraph( ] if indices is None: # Only needed to fetch indices. 
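To make the note above concrete on the Python side: the subgraph's edge IDs index into the `FusedCSCSamplingGraph`, and a second lookup through the `_ORIGINAL_EDGE_ID` edge attribute recovers the dataset-level IDs, exactly as `_convert_to_sampled_subgraph` does in the diff below. A minimal sketch, assuming `graph` is a `FusedCSCSamplingGraph` that stores the attribute, `subgraph_edge_ids` holds the IDs reported by a sampled subgraph, and `ORIGINAL_EDGE_ID` is the constant used in the Python code (its import path here is an assumption):

import torch
from dgl.graphbolt.base import ORIGINAL_EDGE_ID  # assumed import location

eid_attr = graph.edge_attributes[ORIGINAL_EDGE_ID]
dataset_edge_ids = torch.ops.graphbolt.index_select(
    eid_attr, subgraph_edge_ids  # IDs local to the FusedCSCSamplingGraph
)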
- sampled_hetero_edge_ids[etype] = sampled_edge_ids[ + sampled_hetero_edge_ids_in_fused_csc_sampling_graph[ + etype + ] = edge_ids_in_fused_csc_sampling_graph[ etype_offsets[etype_id] : etype_offsets[ etype_id + 1 ] ] original_edge_ids = original_hetero_edge_ids - sampled_edge_ids = sampled_hetero_edge_ids + edge_ids_in_fused_csc_sampling_graph = ( + sampled_hetero_edge_ids_in_fused_csc_sampling_graph + ) sampled_csc = { etype: CSCFormatBase( indptr=sub_indptr[etype], @@ -711,7 +718,7 @@ def _convert_to_sampled_subgraph( return SampledSubgraphImpl( sampled_csc=sampled_csc, original_edge_ids=original_edge_ids, - _sampled_edge_ids=sampled_edge_ids, + _edge_ids_in_fused_csc_sampling_graph=edge_ids_in_fused_csc_sampling_graph, ) def sample_neighbors( diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 7b12b5b2d16f..f5bea47bea91 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -357,21 +357,27 @@ def record_stream(tensor): if isinstance(subgraph.sampled_csc, dict): for etype, pair in subgraph.sampled_csc.items(): if pair.indices is None: - edge_ids = subgraph._sampled_edge_ids[etype] + edge_ids = ( + subgraph._edge_ids_in_fused_csc_sampling_graph[ + etype + ] + ) edge_ids.record_stream(torch.cuda.current_stream()) pair.indices = record_stream( index_select(indices, edge_ids) ) minibatch._indices_needs_offset_subtraction = True elif subgraph.sampled_csc.indices is None: - subgraph._sampled_edge_ids.record_stream( + subgraph._edge_ids_in_fused_csc_sampling_graph.record_stream( torch.cuda.current_stream() ) subgraph.sampled_csc.indices = record_stream( - index_select(indices, subgraph._sampled_edge_ids) + index_select( + indices, subgraph._edge_ids_in_fused_csc_sampling_graph + ) ) minibatch._indices_needs_offset_subtraction = True - subgraph._sampled_edge_ids = None + subgraph._edge_ids_in_fused_csc_sampling_graph = None minibatch.wait = torch.cuda.current_stream().record_event().wait return minibatch diff --git a/python/dgl/graphbolt/impl/sampled_subgraph_impl.py b/python/dgl/graphbolt/impl/sampled_subgraph_impl.py index 0234bec46b14..47631974c50d 100644 --- a/python/dgl/graphbolt/impl/sampled_subgraph_impl.py +++ b/python/dgl/graphbolt/impl/sampled_subgraph_impl.py @@ -46,7 +46,9 @@ class SampledSubgraphImpl(SampledSubgraph): original_row_node_ids: Union[Dict[str, torch.Tensor], torch.Tensor] = None original_edge_ids: Union[Dict[str, torch.Tensor], torch.Tensor] = None # Used to fetch sampled_csc.indices if it is missing. - _sampled_edge_ids: Union[Dict[str, torch.Tensor], torch.Tensor] = None + _edge_ids_in_fused_csc_sampling_graph: Union[ + Dict[str, torch.Tensor], torch.Tensor + ] = None def __post_init__(self): if isinstance(self.sampled_csc, dict): @@ -65,7 +67,10 @@ def __post_init__(self): ), "Node pair should be have indices of type torch.Tensor." else: assert isinstance( - self._sampled_edge_ids.get(etype, None), torch.Tensor + self._edge_ids_in_fused_csc_sampling_graph.get( + etype, None + ), + torch.Tensor, ), "When indices is missing, sampled edge ids needs to be provided." else: assert self.sampled_csc.indptr is not None and isinstance( @@ -81,7 +86,7 @@ def __post_init__(self): ), "Node pair should have a torch.Tensor indices." else: assert isinstance( - self._sampled_edge_ids, torch.Tensor + self._edge_ids_in_fused_csc_sampling_graph, torch.Tensor ), "When indices is missing, sampled edge ids needs to be provided." 
def __repr__(self) -> str: @@ -95,7 +100,7 @@ def _sampled_subgraph_str(sampled_subgraph: SampledSubgraph, classname) -> str: attributes.reverse() for name in attributes: - if name in "_sampled_edge_ids": + if name in "_edge_ids_in_fused_csc_sampling_graph": continue val = getattr(sampled_subgraph, name) From 921fb15f14b46886b1a265131f76abf27f4aa25f Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 16 Aug 2024 00:26:32 -0400 Subject: [PATCH 27/78] [GraphBolt] Refactor datapipes folder and add `to_graph`. (#7688) --- python/dgl/graphbolt/__init__.py | 1 + python/dgl/graphbolt/dataloader.py | 6 +- python/dgl/graphbolt/datapipes/__init__.py | 3 + .../datapipe_utils.py => datapipes/utils.py} | 0 .../dgl/graphbolt/datapipes/visualization.py | 206 ++++++++++++++++++ python/dgl/graphbolt/internal/__init__.py | 1 - .../pytorch/graphbolt/test_dataloader.py | 2 +- 7 files changed, 214 insertions(+), 5 deletions(-) create mode 100644 python/dgl/graphbolt/datapipes/__init__.py rename python/dgl/graphbolt/{internal/datapipe_utils.py => datapipes/utils.py} (100%) create mode 100644 python/dgl/graphbolt/datapipes/visualization.py diff --git a/python/dgl/graphbolt/__init__.py b/python/dgl/graphbolt/__init__.py index 7506e24c6f54..398e1367acb4 100644 --- a/python/dgl/graphbolt/__init__.py +++ b/python/dgl/graphbolt/__init__.py @@ -84,6 +84,7 @@ def load_graphbolt(): from .base import * from .minibatch import * from .dataloader import * +from .datapipes import * from .dataset import * from .feature_fetcher import * from .feature_store import * diff --git a/python/dgl/graphbolt/dataloader.py b/python/dgl/graphbolt/dataloader.py index d76cb48fa0db..8b00faba2cca 100644 --- a/python/dgl/graphbolt/dataloader.py +++ b/python/dgl/graphbolt/dataloader.py @@ -4,15 +4,15 @@ import torch.utils.data as torch_data from .base import CopyTo -from .feature_fetcher import FeatureFetcher, FeatureFetcherStartMarker -from .impl.neighbor_sampler import SamplePerLayer -from .internal import ( +from .datapipes import ( datapipe_graph_to_adjlist, find_dps, replace_dp, traverse_dps, ) +from .feature_fetcher import FeatureFetcher, FeatureFetcherStartMarker +from .impl.neighbor_sampler import SamplePerLayer from .internal_utils import gb_warning from .item_sampler import ItemSampler diff --git a/python/dgl/graphbolt/datapipes/__init__.py b/python/dgl/graphbolt/datapipes/__init__.py new file mode 100644 index 000000000000..e29d6b6a5532 --- /dev/null +++ b/python/dgl/graphbolt/datapipes/__init__.py @@ -0,0 +1,3 @@ +"""GraphBolt's datapipes, mostly copied from "torchdata==0.7.1".""" +from .utils import * +from .visualization import * diff --git a/python/dgl/graphbolt/internal/datapipe_utils.py b/python/dgl/graphbolt/datapipes/utils.py similarity index 100% rename from python/dgl/graphbolt/internal/datapipe_utils.py rename to python/dgl/graphbolt/datapipes/utils.py diff --git a/python/dgl/graphbolt/datapipes/visualization.py b/python/dgl/graphbolt/datapipes/visualization.py new file mode 100644 index 000000000000..014109efa87b --- /dev/null +++ b/python/dgl/graphbolt/datapipes/visualization.py @@ -0,0 +1,206 @@ +# pylint: disable=W,C,R +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+# Original source: +# https://github.com/pytorch/data/blob/v0.7.1/torchdata/datapipes/utils/_visualization.py + +import itertools +from collections import defaultdict + +from typing import Optional, Set, TYPE_CHECKING + +from torch.utils.data.datapipes.iter.combining import _ChildDataPipe + +from .utils import IterDataPipe, traverse_dps + +if TYPE_CHECKING: + import graphviz + + +__all__ = [ + "to_graph", +] + + +class Node: + def __init__(self, dp, *, name=None): + self.dp = dp + self.name = name or type(dp).__name__.replace("IterDataPipe", "") + self.childs = set() + self.parents = set() + + def add_child(self, child): + self.childs.add(child) + child.parents.add(self) + + def remove_child(self, child): + self.childs.remove(child) + child.parents.remove(self) + + def add_parent(self, parent): + self.parents.add(parent) + parent.childs.add(self) + + def remove_parent(self, parent): + self.parents.remove(parent) + parent.childs.remove(self) + + def __eq__(self, other): + if not isinstance(other, Node): + return NotImplemented + + return hash(self) == hash(other) + + def __hash__(self): + return hash(self.dp) + + def __str__(self): + return self.name + + def __repr__(self): + return f"{self}-{hash(self)}" + + +def to_nodes(dp, *, debug: bool) -> Set[Node]: + def recurse(dp_graph, child=None): + for _dp_id, (dp_node, dp_parents) in dp_graph.items(): + node = Node(dp_node) + if child is not None: + node.add_child(child) + yield node + yield from recurse(dp_parents, child=node) + + def aggregate(nodes): + groups = defaultdict(list) + for node in nodes: + groups[node].append(node) + + nodes = set() + for node, group in groups.items(): + if len(group) == 1: + nodes.add(node) + continue + + aggregated_node = Node(node.dp) + + for duplicate_node in group: + for child in duplicate_node.childs.copy(): + duplicate_node.remove_child(child) + aggregated_node.add_child(child) + + for parent in duplicate_node.parents.copy(): + duplicate_node.remove_parent(parent) + aggregated_node.add_parent(parent) + + nodes.add(aggregated_node) + + if debug: + return nodes + + child_dp_nodes = set( + itertools.chain.from_iterable( + node.parents + for node in nodes + if isinstance(node.dp, _ChildDataPipe) + ) + ) + + if not child_dp_nodes: + return nodes + + for node in child_dp_nodes: + fixed_parent_node = Node( + type( + str(node).lstrip("_"), + (IterDataPipe,), + dict(dp=node.dp, childs=node.childs), + )() + ) + nodes.remove(node) + nodes.add(fixed_parent_node) + + for parent in node.parents.copy(): + node.remove_parent(parent) + fixed_parent_node.add_parent(parent) + + for child in node.childs: + nodes.remove(child) + for actual_child in child.childs.copy(): + actual_child.remove_parent(child) + actual_child.add_parent(fixed_parent_node) + + return nodes + + return aggregate(recurse(traverse_dps(dp))) + + +def to_graph(dp, *, debug: bool = False) -> "graphviz.Digraph": + """Visualizes a DataPipe by returning a :class:`graphviz.Digraph`, which is a graph of the data pipeline. + This allows you to visually inspect all the transformation that takes place in your DataPipes. + + .. note:: + + The package :mod:`graphviz` is required to use this function. + + .. note:: + + The most common interfaces for the returned graph object are: + + - :meth:`~graphviz.Digraph.render`: Save the graph to a file. + - :meth:`~graphviz.Digraph.view`: Open the graph in a viewer. + + Args: + dp: DataPipe that you would like to visualize (generally the last one in a chain of DataPipes). 
+ debug (bool): If ``True``, renders internal datapipes that are usually hidden from the user + (such as ``ChildDataPipe`` of `demux` and `fork`). Defaults to ``False``. + + Example: + >>> from torchdata.datapipes.iter import IterableWrapper + >>> from torchdata.datapipes.utils import to_graph + >>> dp = IterableWrapper(range(10)) + >>> dp1, dp2 = dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) + >>> dp1 = dp1.map(lambda x: x + 1) + >>> dp2 = dp2.filter(lambda _: True) + >>> dp3 = dp1.zip(dp2).map(lambda t: t[0] + t[1]) + >>> g = to_graph(dp3) + >>> g.view() # This will open the graph in a viewer + """ + try: + import graphviz + except ModuleNotFoundError: + raise ModuleNotFoundError( + "The package `graphviz` is required to be installed to use this function. " + "Please `pip install graphviz` or `conda install -c conda-forge graphviz`." + ) from None + + # The graph style as well as the color scheme below was copied from https://github.com/szagoruyko/pytorchviz/ + # https://github.com/szagoruyko/pytorchviz/blob/0adcd83af8aa7ab36d6afd139cabbd9df598edb7/torchviz/dot.py#L78-L85 + node_attr = dict( + style="filled", + shape="box", + align="left", + fontsize="10", + ranksep="0.1", + height="0.2", + fontname="monospace", + ) + graph = graphviz.Digraph(node_attr=node_attr, graph_attr=dict(size="12,12")) + + for node in to_nodes(dp, debug=debug): + fillcolor: Optional[str] + if not node.parents: + fillcolor = "lightblue" + elif not node.childs: + fillcolor = "darkolivegreen1" + else: + fillcolor = None + + graph.node(name=repr(node), label=str(node), fillcolor=fillcolor) + + for child in node.childs: + graph.edge(repr(node), repr(child)) + + return graph diff --git a/python/dgl/graphbolt/internal/__init__.py b/python/dgl/graphbolt/internal/__init__.py index 8e3ed940797f..4fdd28753bb4 100644 --- a/python/dgl/graphbolt/internal/__init__.py +++ b/python/dgl/graphbolt/internal/__init__.py @@ -1,5 +1,4 @@ """Utility functions for GraphBolt.""" from .utils import * from .sample_utils import * -from .datapipe_utils import * from .item_sampler_utils import * diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index 4c8cc5b3f4f6..85e034b123f5 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -7,7 +7,7 @@ import pytest import torch -from dgl.graphbolt.internal import find_dps, traverse_dps +from dgl.graphbolt.datapipes import find_dps, traverse_dps from . import gb_test_utils From 4910eace8c2076ed81a464052dd045a806b9c680 Mon Sep 17 00:00:00 2001 From: "Hongzhi (Steve), Chen" Date: Fri, 16 Aug 2024 15:29:56 +0800 Subject: [PATCH 28/78] Revert "[Build] Organize cmake file." 
(#7710) --- CMakeLists.txt | 87 +++++++++++++++++++++++----------------- cmake/modules/CUDA.cmake | 4 -- 2 files changed, 51 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 688f78218b5d..ce239dbf0f18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -129,11 +129,11 @@ if (${BUILD_TYPE} STREQUAL "dev") endif() else() if (MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /DNDEBUG") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /DNDEBUG") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2") else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -DNDEBUG") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DNDEBUG") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") endif() endif() @@ -186,11 +186,48 @@ else(MSVC) endif(NOT APPLE) endif(MSVC) +if(USE_OPENMP) + include(FindOpenMP) + if(OPENMP_FOUND) + set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}") + endif(OPENMP_FOUND) + message(STATUS "Build with OpenMP.") +endif(USE_OPENMP) + if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") message(STATUS "Disabling LIBXSMM on ${CMAKE_SYSTEM_PROCESSOR}.") set(USE_LIBXSMM OFF) endif() +if(USE_LIBXSMM) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000 -D__BLAS=0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000 -D__BLAS=0") + message(STATUS "Build with LIBXSMM optimization.") +endif(USE_LIBXSMM) + +if ((NOT MSVC) AND USE_EPOLL) + INCLUDE(CheckIncludeFile) + check_include_file("sys/epoll.h" EPOLL_AVAILABLE) + if (EPOLL_AVAILABLE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_EPOLL") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_EPOLL") + else() + message(WARNING "EPOLL is not available on this platform...") + endif() +endif () + +# To compile METIS correct for DGL. 
+if(MSVC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") +else(MSVC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") +endif(MSVC) + +# configure minigun +add_definitions(-DENABLE_PARTIAL_FRONTIER=0) # disable minigun partial frontier compile # Source file lists file(GLOB DGL_SRC src/*.cc @@ -219,12 +256,6 @@ else() endif() list(APPEND DGL_SRC ${DGL_RPC_SRC}) -if(USE_OPENMP) - find_package(OpenMP REQUIRED) - list(APPEND DGL_LINKER_LIBS OpenMP::OpenMP_CXX) - message(STATUS "Build with OpenMP.") -endif(USE_OPENMP) - # Configure cuda if(USE_CUDA) file(GLOB_RECURSE DGL_CUDA_SRC @@ -248,16 +279,6 @@ else(USE_CUDA) add_library(dgl SHARED ${DGL_SRC}) endif(USE_CUDA) -if ((NOT MSVC) AND USE_EPOLL) - INCLUDE(CheckIncludeFile) - check_include_file("sys/epoll.h" EPOLL_AVAILABLE) - if (EPOLL_AVAILABLE) - target_compile_definitions(dgl PRIVATE USE_EPOLL) - else() - message(WARNING "EPOLL is not available on this platform...") - endif() -endif () - # include directories target_include_directories(dgl PRIVATE "include") # check for conda includes @@ -330,26 +351,18 @@ else(EXTERNAL_NANOFLANN_PATH) endif(EXTERNAL_NANOFLANN_PATH) if (USE_LIBXSMM) - target_compile_definitions(dgl PRIVATE USE_LIBXSMM DGL_CPU_LLC_SIZE=40000000 __BLAS=0) target_include_directories(dgl PRIVATE "third_party/libxsmm/include") - message(STATUS "Build with LIBXSMM optimization.") endif() -# To compile METIS correct for DGL. -add_compile_definitions(IDXTYPEWIDTH=64 REALTYPEWIDTH=32) if (EXTERNAL_METIS_PATH) - # To compile METIS correct for DGL. - if(MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") - else(MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") - endif(MSVC) find_package(METIS REQUIRED) - message(STATUS "Found METIS library") - target_include_directories(dgl SYSTEM PUBLIC ${METIS_INCLUDE_DIR}) - list(APPEND DGL_LINKER_LIBS ${METIS_LIBRARIES}) + if (NOT METIS_FOUND) + message(FATAL_ERROR "Failed to find METIS library") + else() + message(STATUS "Found METIS library") + target_include_directories(dgl SYSTEM PUBLIC ${METIS_INCLUDE_DIR}) + list(APPEND DGL_LINKER_LIBS ${METIS_LIBRARIES}) + endif() else(EXTERNAL_METIS_PATH) target_include_directories(dgl PRIVATE "third_party/METIS/include") # Compile METIS @@ -378,6 +391,8 @@ endif() # Compile gpu_cache if(USE_CUDA) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GPU_CACHE") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU_CACHE") # Manually build gpu_cache because CMake always builds it as shared file(GLOB gpu_cache_src third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu @@ -391,7 +406,7 @@ endif(USE_CUDA) # support PARALLEL_ALGORITHMS if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS) - target_compile_definitions(dgl PRIVATE PARALLEL_ALGORITHMS) + add_definitions(-DPARALLEL_ALGORITHMS) endif(LIBCXX_ENABLE_PARALLEL_ALGORITHMS) target_link_libraries(dgl ${DGL_LINKER_LIBS} ${DGL_RUNTIME_LINKER_LIBS}) diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake index 2028399f8b66..4d96aab63844 100644 --- a/cmake/modules/CUDA.cmake +++ b/cmake/modules/CUDA.cmake @@ -230,10 +230,6 @@ macro(dgl_config_cuda linker_libs) string(CONCAT CXX_HOST_FLAGS 
${CXX_HOST_FLAGS} ",/MD") endif() list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "${CXX_HOST_FLAGS}") - if(USE_OPENMP) - # Needed by CUDA disjoint union source file. - list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "${OpenMP_CXX_FLAGS}") - endif(USE_OPENMP) # 1. Add arch flags dgl_select_nvcc_arch_flags(NVCC_FLAGS_ARCH) From 396f5f1c5d94b0c52e2db6a8b026dd8b3af8a6f6 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 16 Aug 2024 17:04:05 -0400 Subject: [PATCH 29/78] [GraphBolt][CUDA] `gb::async` supports alternative streams now. (#7712) --- graphbolt/include/graphbolt/async.h | 78 ++++++++++++++++--- graphbolt/src/cuda/extension/gpu_cache.cu | 10 ++- .../src/cuda/extension/gpu_graph_cache.cu | 14 ++-- graphbolt/src/fused_csc_sampling_graph.cc | 15 ++-- graphbolt/src/unique_and_compact.cc | 6 +- 5 files changed, 93 insertions(+), 30 deletions(-) diff --git a/graphbolt/include/graphbolt/async.h b/graphbolt/include/graphbolt/async.h index 0d8f42470068..5aad2a9d14aa 100644 --- a/graphbolt/include/graphbolt/async.h +++ b/graphbolt/include/graphbolt/async.h @@ -26,6 +26,7 @@ #include #include #include +#include #ifdef BUILD_WITH_TASKFLOW #include @@ -37,6 +38,7 @@ #endif #ifdef GRAPHBOLT_USE_CUDA +#include #include #include #include @@ -92,15 +94,50 @@ inline int get_num_interop_threads() { template class Future : public torch::CustomClassHolder { +#ifdef GRAPHBOLT_USE_CUDA + using T_no_event = std::conditional_t, std::monostate, T>; + using T_with_event = std::conditional_t< + std::is_void_v, at::cuda::CUDAEvent, + std::pair>; + using future_type = std::future>; +#else + using future_type = std::future; +#endif + public: - Future(std::future&& future) : future_(std::move(future)) {} +#ifdef GRAPHBOLT_USE_CUDA + using return_type = std::variant; +#else + using return_type = T; +#endif + + Future(future_type&& future) : future_(std::move(future)) {} Future() = default; - T Wait() { return future_.get(); } + T Wait() { +#ifdef GRAPHBOLT_USE_CUDA + auto result = future_.get(); + if constexpr (std::is_void_v) { + if (std::holds_alternative(result)) { + auto&& event = std::get(result); + event.block(c10::cuda::getCurrentCUDAStream()); + } + return; + } else if (std::holds_alternative(result)) { + auto&& [value, event] = std::get(result); + event.block(c10::cuda::getCurrentCUDAStream()); + return value; + } else { + return std::get(result); + } +#else + return future_.get(); +#endif + } private: - std::future future_; + future_type future_; }; /** @@ -109,36 +146,55 @@ class Future : public torch::CustomClassHolder { * task to avoid spawning a new OpenMP threadpool on each interop thread. */ template -inline auto async(F&& function) { +inline auto async(F&& function, bool is_cuda = false) { using T = decltype(function()); #ifdef GRAPHBOLT_USE_CUDA - const auto is_cuda_available = torch::cuda::is_available(); struct c10::StreamData3 stream_data; - if (is_cuda_available) { + if (is_cuda) { stream_data = c10::cuda::getCurrentCUDAStream().pack3(); } #endif - auto fn = [=, func = std::move(function)] { + using return_type = typename Future::return_type; + auto fn = [=, func = std::move(function)]() -> return_type { #ifdef GRAPHBOLT_USE_CUDA // We make sure to use the same CUDA stream as the thread launching the // async operation. 
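`Future::Wait` above applies a record-then-block pattern: the asynchronous task records a CUDA event on whatever stream it ran on, and the waiter only makes its own current stream wait on that event instead of synchronizing the host thread. A rough PyTorch-level sketch of the same idea, with illustrative stream and tensor names and ignoring the caching-allocator subtleties of sharing tensors across streams:

    import torch

    def produce(side_stream, x):
        # Let the side stream wait for work already queued by the caller,
        # run the producer kernel there, and record a completion event.
        # Nothing blocks on the host.
        side_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(side_stream):
            y = x * 2
            done = torch.cuda.Event()
            done.record()
        return y, done

    def consume(y, done):
        # Make only the consumer's current stream wait for the event.
        torch.cuda.current_stream().wait_event(done)
        return y + 1

    if torch.cuda.is_available():
        side = torch.cuda.Stream()
        x = torch.ones(4, device="cuda")
        y, done = produce(side, x)
        out = consume(y, done)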
- if (is_cuda_available) { + if (is_cuda) { auto stream = c10::cuda::CUDAStream::unpack3( stream_data.stream_id, stream_data.device_index, stream_data.device_type); c10::cuda::CUDAStreamGuard guard(stream); + at::cuda::CUDAEvent event; + // Might be executed on the GPU so we record an event to be able to + // synchronize with it later, in case it is executed on an alternative + // CUDA stream. + if constexpr (std::is_void_v) { + func(); + event.record(); + return event; + } else { + auto result = func(); + event.record(); + return std::make_pair(std::move(result), std::move(event)); + } + } + if constexpr (std::is_void_v) { + func(); + return std::monostate{}; + } else { return func(); } -#endif +#else return func(); +#endif }; #ifdef BUILD_WITH_TASKFLOW auto future = interop_pool().async(std::move(fn)); #else - auto promise = std::make_shared>(); + auto promise = std::make_shared>(); auto future = promise->get_future(); at::launch([promise, func = std::move(fn)]() { - if constexpr (std::is_void_v) { + if constexpr (std::is_void_v) { func(); promise->set_value(); } else diff --git a/graphbolt/src/cuda/extension/gpu_cache.cu b/graphbolt/src/cuda/extension/gpu_cache.cu index 7e280187976a..fdb0b57f7451 100644 --- a/graphbolt/src/cuda/extension/gpu_cache.cu +++ b/graphbolt/src/cuda/extension/gpu_cache.cu @@ -78,10 +78,12 @@ std::tuple GpuCache::Query( c10::intrusive_ptr>> GpuCache::QueryAsync( torch::Tensor keys) { - return async([=] { - auto [values, missing_index, missing_keys] = Query(keys); - return std::vector{values, missing_index, missing_keys}; - }); + return async( + [=] { + auto [values, missing_index, missing_keys] = Query(keys); + return std::vector{values, missing_index, missing_keys}; + }, + true); } void GpuCache::Replace(torch::Tensor keys, torch::Tensor values) { diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.cu b/graphbolt/src/cuda/extension/gpu_graph_cache.cu index 0aec5949343d..80a4bcfd7171 100644 --- a/graphbolt/src/cuda/extension/gpu_graph_cache.cu +++ b/graphbolt/src/cuda/extension/gpu_graph_cache.cu @@ -242,7 +242,7 @@ std::tuple GpuGraphCache::Query( c10::intrusive_ptr< Future>> GpuGraphCache::QueryAsync(torch::Tensor seeds) { - return async([=] { return Query(seeds); }); + return async([=] { return Query(seeds); }, true); } std::tuple> GpuGraphCache::Replace( @@ -505,11 +505,13 @@ GpuGraphCache::ReplaceAsync( torch::Tensor seeds, torch::Tensor indices, torch::Tensor positions, int64_t num_hit, int64_t num_threshold, torch::Tensor indptr, std::vector edge_tensors) { - return async([=] { - return Replace( - seeds, indices, positions, num_hit, num_threshold, indptr, - edge_tensors); - }); + return async( + [=] { + return Replace( + seeds, indices, positions, num_hit, num_threshold, indptr, + edge_tensors); + }, + true); } } // namespace cuda diff --git a/graphbolt/src/fused_csc_sampling_graph.cc b/graphbolt/src/fused_csc_sampling_graph.cc index a2a4778422f7..1e5904bd782d 100644 --- a/graphbolt/src/fused_csc_sampling_graph.cc +++ b/graphbolt/src/fused_csc_sampling_graph.cc @@ -880,12 +880,15 @@ FusedCSCSamplingGraph::SampleNeighborsAsync( torch::optional probs_or_mask, torch::optional random_seed, double seed2_contribution) const { - return async([=] { - return this->SampleNeighbors( - seeds, seed_offsets, fanouts, replace, layer, - returning_indices_is_optional, probs_or_mask, random_seed, - seed2_contribution); - }); + return async( + [=] { + return this->SampleNeighbors( + seeds, seed_offsets, fanouts, replace, layer, + returning_indices_is_optional, 
probs_or_mask, random_seed, + seed2_contribution); + }, + (seeds.has_value() && utils::is_on_gpu(*seeds)) || + utils::is_on_gpu(indptr_)); } c10::intrusive_ptr diff --git a/graphbolt/src/unique_and_compact.cc b/graphbolt/src/unique_and_compact.cc index 7dd4a007b4b4..ba07ede8fc71 100644 --- a/graphbolt/src/unique_and_compact.cc +++ b/graphbolt/src/unique_and_compact.cc @@ -71,9 +71,9 @@ UniqueAndCompactBatchedAsync( const std::vector& src_ids, const std::vector& dst_ids, const std::vector unique_dst_ids) { - return async([=] { - return UniqueAndCompactBatched(src_ids, dst_ids, unique_dst_ids); - }); + return async( + [=] { return UniqueAndCompactBatched(src_ids, dst_ids, unique_dst_ids); }, + utils::is_on_gpu(src_ids.at(0))); } } // namespace sampling From 25210816ffbf18788f06b8aaa02fc390c398097e Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 16 Aug 2024 17:42:14 -0400 Subject: [PATCH 30/78] [GraphBolt][CUDA] Eliminate synchronization for `overlap_graph_fetch`. (#7709) --- graphbolt/src/index_select.cc | 14 ++ graphbolt/src/index_select.h | 7 + graphbolt/src/python_binding.cc | 1 + python/dgl/graphbolt/impl/neighbor_sampler.py | 135 +++++++++--------- .../pytorch/graphbolt/test_dataloader.py | 13 +- 5 files changed, 91 insertions(+), 79 deletions(-) diff --git a/graphbolt/src/index_select.cc b/graphbolt/src/index_select.cc index 114fd6019101..8fdc6a49870e 100644 --- a/graphbolt/src/index_select.cc +++ b/graphbolt/src/index_select.cc @@ -207,5 +207,19 @@ std::tuple> IndexSelectCSCBatched( return std::make_tuple(output_indptr, results); } +c10::intrusive_ptr< + Future>>> +IndexSelectCSCBatchedAsync( + torch::Tensor indptr, std::vector indices_list, + torch::Tensor nodes, bool with_edge_ids, + torch::optional output_size) { + return async( + [=] { + return IndexSelectCSCBatched( + indptr, indices_list, nodes, with_edge_ids, output_size); + }, + utils::is_on_gpu(nodes)); +} + } // namespace ops } // namespace graphbolt diff --git a/graphbolt/src/index_select.h b/graphbolt/src/index_select.h index f78ad98fe078..2522df6523a1 100644 --- a/graphbolt/src/index_select.h +++ b/graphbolt/src/index_select.h @@ -92,6 +92,13 @@ std::tuple> IndexSelectCSCBatched( torch::Tensor nodes, bool with_edge_ids, torch::optional output_size); +c10::intrusive_ptr< + Future>>> +IndexSelectCSCBatchedAsync( + torch::Tensor indptr, std::vector indices_list, + torch::Tensor nodes, bool with_edge_ids, + torch::optional output_size); + } // namespace ops } // namespace graphbolt diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index df295bd718b7..e8d54f9f9a47 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -184,6 +184,7 @@ TORCH_LIBRARY(graphbolt, m) { m.def("scatter_async", &ops::ScatterAsync); m.def("index_select_csc", &ops::IndexSelectCSC); m.def("index_select_csc_batched", &ops::IndexSelectCSCBatched); + m.def("index_select_csc_batched_async", &ops::IndexSelectCSCBatchedAsync); m.def("ondisk_npy_array", &storage::OnDiskNpyArray::Create); m.def("detect_io_uring", &io_uring::IsAvailable); m.def("set_num_io_uring_threads", &io_uring::SetNumThreads); diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index f5bea47bea91..fc834718ef4d 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -25,7 +25,6 @@ "LayerNeighborSampler", "SamplePerLayer", "FetchInsubgraphData", - "ConcatHeteroSeeds", "CombineCachedAndFetchedInSubgraph", ] @@ 
-105,29 +104,6 @@ def _wait_replace_future(self, minibatch): return minibatch -@functional_datapipe("concat_hetero_seeds") -class ConcatHeteroSeeds(Mapper): - """Concatenates the seeds into a single tensor in the hetero case.""" - - def __init__(self, datapipe, graph): - super().__init__(datapipe, self._concat) - self.graph = graph - - def _concat(self, minibatch): - seeds = minibatch._seed_nodes - if isinstance(seeds, dict): - ( - seeds, - seed_offsets, - ) = self.graph._convert_to_homogeneous_nodes(seeds) - else: - seed_offsets = None - minibatch._seeds = seeds - minibatch._seed_offsets = seed_offsets - - return minibatch - - @functional_datapipe("fetch_insubgraph_data") class FetchInsubgraphData(MiniBatchTransformer): """Fetches the insubgraph and wraps it in a FusedCSCSamplingGraph object. If @@ -142,20 +118,46 @@ def __init__( graph, prob_name, ): - datapipe = datapipe.concat_hetero_seeds(graph) + datapipe = datapipe.transform(self._concat_hetero_seeds) if graph._gpu_graph_cache is not None: datapipe = datapipe.fetch_cached_insubgraph_data( graph._gpu_graph_cache ) - datapipe = datapipe.transform(self._fetch_per_layer) - datapipe = datapipe.buffer().wait() + datapipe = datapipe.transform(self._fetch_per_layer_stage_1) + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._fetch_per_layer_stage_2) if graph._gpu_graph_cache is not None: datapipe = datapipe.combine_cached_and_fetched_insubgraph(prob_name) super().__init__(datapipe) self.graph = graph self.prob_name = prob_name - def _fetch_per_layer(self, minibatch): + def _concat_hetero_seeds(self, minibatch): + """Concatenates the seeds into a single tensor in the hetero case.""" + seeds = minibatch._seed_nodes + if isinstance(seeds, dict): + ( + seeds, + seed_offsets, + ) = self.graph._convert_to_homogeneous_nodes(seeds) + else: + seed_offsets = None + minibatch._seeds = seeds + minibatch._seed_offsets = seed_offsets + + return minibatch + + def _fetch_per_layer_stage_1(self, minibatch): + minibatch._async_handle_fetch = self._fetch_per_layer_async(minibatch) + next(minibatch._async_handle_fetch) + return minibatch + + def _fetch_per_layer_stage_2(self, minibatch): + minibatch = next(minibatch._async_handle_fetch) + delattr(minibatch, "_async_handle_fetch") + return minibatch + + def _fetch_per_layer_async(self, minibatch): stream = torch.cuda.current_stream() uva_stream = get_host_to_device_uva_stream() uva_stream.wait_stream(stream) @@ -167,11 +169,6 @@ def _fetch_per_layer(self, minibatch): seeds.record_stream(torch.cuda.current_stream()) - def record_stream(tensor): - if tensor.is_cuda: - tensor.record_stream(stream) - return tensor - # Packs tensors for batch slicing. tensors_to_be_sliced = [self.graph.indices] @@ -190,51 +187,53 @@ def record_stream(tensor): has_probs_or_mask = True # Slices the batched tensors. - ( - indptr, - sliced_tensors, - ) = torch.ops.graphbolt.index_select_csc_batched( + future = torch.ops.graphbolt.index_select_csc_batched_async( self.graph.csc_indptr, tensors_to_be_sliced, seeds, True, None ) - for tensor in [indptr] + sliced_tensors: - record_stream(tensor) - # Unpacks the sliced tensors. - indices = sliced_tensors[0] - sliced_tensors = sliced_tensors[1:] + yield + + # graphbolt::async has already recorded a CUDAEvent for us and + # called CUDAStreamWaitEvent for us on the current stream. 
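The two-stage split above (stage 1 issues the asynchronous batched slice and yields, stage 2 resumes after a buffer and only then waits) is what lets the index-select overlap with the rest of the pipeline. A stripped-down sketch of the same staging idea, using plain generators and a thread-pool future in place of the GraphBolt datapipes and CUDA streams; every name here is illustrative:

    from concurrent.futures import ThreadPoolExecutor

    pool = ThreadPoolExecutor(max_workers=1)

    def slow_fetch(item):
        # Stand-in for the asynchronous graph slice.
        return item * 2

    def stage_1(items):
        # Issue the work and pass the future downstream without waiting.
        for item in items:
            yield item, pool.submit(slow_fetch, item)

    def buffered(stream, size=2):
        # Plays the role of the .buffer() datapipe between the stages.
        pending = []
        for element in stream:
            pending.append(element)
            if len(pending) > size:
                yield pending.pop(0)
        yield from pending

    def stage_2(stream):
        # Only now wait; the fetch has had time to run concurrently with
        # whatever executed in between.
        for item, future in stream:
            yield item, future.result()

    results = list(stage_2(buffered(stage_1(range(5)))))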
+ indptr, sliced_tensors = future.wait() - type_per_edge = None - if has_type_per_edge: - type_per_edge = sliced_tensors[0] - sliced_tensors = sliced_tensors[1:] + for tensor in [indptr] + sliced_tensors: + tensor.record_stream(stream) - probs_or_mask = None - if has_probs_or_mask: - probs_or_mask = sliced_tensors[0] - sliced_tensors = sliced_tensors[1:] + # Unpacks the sliced tensors. + indices = sliced_tensors[0] + sliced_tensors = sliced_tensors[1:] - edge_ids = sliced_tensors[0] + type_per_edge = None + if has_type_per_edge: + type_per_edge = sliced_tensors[0] sliced_tensors = sliced_tensors[1:] - assert len(sliced_tensors) == 0 - - subgraph = fused_csc_sampling_graph( - indptr, - indices, - node_type_offset=self.graph.node_type_offset, - type_per_edge=type_per_edge, - node_type_to_id=self.graph.node_type_to_id, - edge_type_to_id=self.graph.edge_type_to_id, - ) - if self.prob_name is not None and probs_or_mask is not None: - subgraph.add_edge_attribute(self.prob_name, probs_or_mask) - subgraph.add_edge_attribute(ORIGINAL_EDGE_ID, edge_ids) - subgraph._indptr_node_type_offset_list = seed_offsets - minibatch._sliced_sampling_graph = subgraph + probs_or_mask = None + if has_probs_or_mask: + probs_or_mask = sliced_tensors[0] + sliced_tensors = sliced_tensors[1:] - minibatch.wait = torch.cuda.current_stream().record_event().wait + edge_ids = sliced_tensors[0] + sliced_tensors = sliced_tensors[1:] + assert len(sliced_tensors) == 0 + + subgraph = fused_csc_sampling_graph( + indptr, + indices, + node_type_offset=self.graph.node_type_offset, + type_per_edge=type_per_edge, + node_type_to_id=self.graph.node_type_to_id, + edge_type_to_id=self.graph.edge_type_to_id, + ) + if self.prob_name is not None and probs_or_mask is not None: + subgraph.add_edge_attribute(self.prob_name, probs_or_mask) + subgraph.add_edge_attribute(ORIGINAL_EDGE_ID, edge_ids) + + subgraph._indptr_node_type_offset_list = seed_offsets + minibatch._sliced_sampling_graph = subgraph - return minibatch + yield minibatch @functional_datapipe("sample_per_layer") diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index 85e034b123f5..92c670055361 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -132,23 +132,14 @@ def test_gpu_sampling_DataLoader( dataloader, dataloader2 = dataloaders bufferer_cnt = int(enable_feature_fetch and overlap_feature_fetch) - awaiter_cnt = 0 if overlap_graph_fetch: bufferer_cnt += num_layers - awaiter_cnt += num_layers - if asynchronous: - bufferer_cnt += 2 * num_layers - if overlap_graph_fetch: - bufferer_cnt += 0 * num_layers if num_gpu_cached_edges > 0: bufferer_cnt += 2 * num_layers + if asynchronous: + bufferer_cnt += 2 * num_layers datapipe = dataloader.dataset datapipe_graph = traverse_dps(datapipe) - awaiters = find_dps( - datapipe_graph, - dgl.graphbolt.Waiter, - ) - assert len(awaiters) == awaiter_cnt bufferers = find_dps( datapipe_graph, dgl.graphbolt.Bufferer, From 09ea3196e6c5daabf56dccecb7503aef0c3f5e69 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sat, 17 Aug 2024 00:42:19 -0400 Subject: [PATCH 31/78] [GraphBolt][CUDA] Update CCCL to get backport fix. 
(#7716) --- third_party/cccl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/cccl b/third_party/cccl index 1251f54a402d..c67b1c3257be 160000 --- a/third_party/cccl +++ b/third_party/cccl @@ -1 +1 @@ -Subproject commit 1251f54a402da44083e22dd75835cfc13eba8d10 +Subproject commit c67b1c3257be5115253f06d45a2d607b54234db4 From b1e39432bf200a9fcd913a71ef565e8730a9e01e Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sun, 18 Aug 2024 10:15:01 -0400 Subject: [PATCH 32/78] [Build] Organize cmake file (Fixed) (#7715) --- CMakeLists.txt | 88 ++++++++++++++++------------------------ cmake/modules/CUDA.cmake | 4 ++ graphbolt/CMakeLists.txt | 8 ++++ graphbolt/build.sh | 2 +- 4 files changed, 49 insertions(+), 53 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ce239dbf0f18..ce6b348c3ea3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,6 @@ dgl_option(BUILD_TYPE "Type of the build: dev, dogfood or release" "dev") message(STATUS "Build for ${BUILD_TYPE}") dgl_option(USE_CUDA "Build with CUDA" OFF) -dgl_option(USE_LIBURING "Build with liburing" ON) dgl_option(TORCH_PYTHON_INTERPS "Python interpreter for building sub-components" python3) # Conda build related options. @@ -129,11 +128,11 @@ if (${BUILD_TYPE} STREQUAL "dev") endif() else() if (MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /DNDEBUG") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /DNDEBUG") else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -DNDEBUG") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DNDEBUG") endif() endif() @@ -186,48 +185,11 @@ else(MSVC) endif(NOT APPLE) endif(MSVC) -if(USE_OPENMP) - include(FindOpenMP) - if(OPENMP_FOUND) - set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}") - endif(OPENMP_FOUND) - message(STATUS "Build with OpenMP.") -endif(USE_OPENMP) - if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") message(STATUS "Disabling LIBXSMM on ${CMAKE_SYSTEM_PROCESSOR}.") set(USE_LIBXSMM OFF) endif() -if(USE_LIBXSMM) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000 -D__BLAS=0") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_LIBXSMM -DDGL_CPU_LLC_SIZE=40000000 -D__BLAS=0") - message(STATUS "Build with LIBXSMM optimization.") -endif(USE_LIBXSMM) - -if ((NOT MSVC) AND USE_EPOLL) - INCLUDE(CheckIncludeFile) - check_include_file("sys/epoll.h" EPOLL_AVAILABLE) - if (EPOLL_AVAILABLE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_EPOLL") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_EPOLL") - else() - message(WARNING "EPOLL is not available on this platform...") - endif() -endif () - -# To compile METIS correct for DGL. 
-if(MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") -else(MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") -endif(MSVC) - -# configure minigun -add_definitions(-DENABLE_PARTIAL_FRONTIER=0) # disable minigun partial frontier compile # Source file lists file(GLOB DGL_SRC src/*.cc @@ -256,6 +218,12 @@ else() endif() list(APPEND DGL_SRC ${DGL_RPC_SRC}) +if(USE_OPENMP) + find_package(OpenMP REQUIRED) + list(APPEND DGL_LINKER_LIBS OpenMP::OpenMP_CXX) + message(STATUS "Build with OpenMP.") +endif(USE_OPENMP) + # Configure cuda if(USE_CUDA) file(GLOB_RECURSE DGL_CUDA_SRC @@ -279,6 +247,16 @@ else(USE_CUDA) add_library(dgl SHARED ${DGL_SRC}) endif(USE_CUDA) +if ((NOT MSVC) AND USE_EPOLL) + INCLUDE(CheckIncludeFile) + check_include_file("sys/epoll.h" EPOLL_AVAILABLE) + if (EPOLL_AVAILABLE) + target_compile_definitions(dgl PRIVATE USE_EPOLL) + else() + message(WARNING "EPOLL is not available on this platform...") + endif() +endif () + # include directories target_include_directories(dgl PRIVATE "include") # check for conda includes @@ -351,18 +329,26 @@ else(EXTERNAL_NANOFLANN_PATH) endif(EXTERNAL_NANOFLANN_PATH) if (USE_LIBXSMM) + target_compile_definitions(dgl PRIVATE USE_LIBXSMM DGL_CPU_LLC_SIZE=40000000 __BLAS=0) target_include_directories(dgl PRIVATE "third_party/libxsmm/include") + message(STATUS "Build with LIBXSMM optimization.") endif() +# To compile METIS correct for DGL. +add_compile_definitions(IDXTYPEWIDTH=64 REALTYPEWIDTH=32) if (EXTERNAL_METIS_PATH) + # To compile METIS correct for DGL. + if(MSVC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") + else(MSVC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") + endif(MSVC) find_package(METIS REQUIRED) - if (NOT METIS_FOUND) - message(FATAL_ERROR "Failed to find METIS library") - else() - message(STATUS "Found METIS library") - target_include_directories(dgl SYSTEM PUBLIC ${METIS_INCLUDE_DIR}) - list(APPEND DGL_LINKER_LIBS ${METIS_LIBRARIES}) - endif() + message(STATUS "Found METIS library") + target_include_directories(dgl SYSTEM PUBLIC ${METIS_INCLUDE_DIR}) + list(APPEND DGL_LINKER_LIBS ${METIS_LIBRARIES}) else(EXTERNAL_METIS_PATH) target_include_directories(dgl PRIVATE "third_party/METIS/include") # Compile METIS @@ -391,8 +377,6 @@ endif() # Compile gpu_cache if(USE_CUDA) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_GPU_CACHE") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU_CACHE") # Manually build gpu_cache because CMake always builds it as shared file(GLOB gpu_cache_src third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu @@ -406,7 +390,7 @@ endif(USE_CUDA) # support PARALLEL_ALGORITHMS if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS) - add_definitions(-DPARALLEL_ALGORITHMS) + target_compile_definitions(dgl PRIVATE PARALLEL_ALGORITHMS) endif(LIBCXX_ENABLE_PARALLEL_ALGORITHMS) target_link_libraries(dgl ${DGL_LINKER_LIBS} ${DGL_RUNTIME_LINKER_LIBS}) diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake index 4d96aab63844..2028399f8b66 100644 --- a/cmake/modules/CUDA.cmake +++ b/cmake/modules/CUDA.cmake @@ -230,6 +230,10 @@ macro(dgl_config_cuda linker_libs) string(CONCAT CXX_HOST_FLAGS 
${CXX_HOST_FLAGS} ",/MD") endif() list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "${CXX_HOST_FLAGS}") + if(USE_OPENMP) + # Needed by CUDA disjoint union source file. + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "${OpenMP_CXX_FLAGS}") + endif(USE_OPENMP) # 1. Add arch flags dgl_select_nvcc_arch_flags(NVCC_FLAGS_ARCH) diff --git a/graphbolt/CMakeLists.txt b/graphbolt/CMakeLists.txt index 60b1038a1259..8fdcbd5078af 100644 --- a/graphbolt/CMakeLists.txt +++ b/graphbolt/CMakeLists.txt @@ -46,6 +46,8 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb") set(LIB_GRAPHBOLT_NAME "graphbolt_pytorch_${TORCH_VER}") option(BUILD_WITH_TASKFLOW "Use taskflow as parallel backend" ON) +option(USE_OPENMP "Use OpenMP for graphbolt" ON) +option(USE_LIBURING "Build graphbolt with liburing support" ON) set(BOLT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src") set(BOLT_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/include") @@ -85,6 +87,12 @@ if(BUILD_WITH_TASKFLOW) target_compile_definitions(${LIB_GRAPHBOLT_NAME} PRIVATE BUILD_WITH_TASKFLOW=1) endif() +if(USE_OPENMP) + find_package(OpenMP REQUIRED) + target_link_libraries(${LIB_GRAPHBOLT_NAME} OpenMP::OpenMP_CXX) + message(STATUS "Build graphbolt with OpenMP.") +endif(USE_OPENMP) + if(CMAKE_SYSTEM_NAME MATCHES "Linux") if(USE_LIBURING) add_definitions(-DHAVE_LIBRARY_LIBURING) diff --git a/graphbolt/build.sh b/graphbolt/build.sh index 73389e0c0c1e..7c71f7553a66 100755 --- a/graphbolt/build.sh +++ b/graphbolt/build.sh @@ -28,7 +28,7 @@ if ! [[ -z "${CUDAARCHS}" ]]; then TORCH_CUDA_ARCH_LIST=${LAST_ARCHITECTURE:0:-1}'.'${LAST_ARCHITECTURE: -1} fi fi -CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DUSE_CUDA=$USE_CUDA -DGPU_CACHE_BUILD_DIR=$BINDIR -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST -DUSE_LIBURING=$USE_LIBURING" +CMAKE_FLAGS="-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_TOOLKIT_ROOT_DIR -DUSE_CUDA=$USE_CUDA -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST" echo "graphbolt cmake flags: $CMAKE_FLAGS" if [ $# -eq 0 ]; then From fc29d0eb02e91cf825e7a9ec9fd5d608d866fda9 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sun, 18 Aug 2024 10:24:55 -0400 Subject: [PATCH 33/78] [GraphBolt][CUDA] Overlap original edge ids fetch. 
(#7714) --- .../src/cuda/extension/gpu_graph_cache.cu | 101 +++++++++++------- .../src/cuda/extension/gpu_graph_cache.h | 11 +- .../impl/fused_csc_sampling_graph.py | 90 +++++++++++----- python/dgl/graphbolt/impl/gpu_graph_cache.py | 11 +- python/dgl/graphbolt/impl/neighbor_sampler.py | 89 ++++++++++++--- .../graphbolt/impl/test_gpu_graph_cache.py | 8 +- .../graphbolt/impl/test_neighbor_sampler.py | 12 ++- 7 files changed, 231 insertions(+), 91 deletions(-) diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.cu b/graphbolt/src/cuda/extension/gpu_graph_cache.cu index 80a4bcfd7171..c0e70421bf44 100644 --- a/graphbolt/src/cuda/extension/gpu_graph_cache.cu +++ b/graphbolt/src/cuda/extension/gpu_graph_cache.cu @@ -115,14 +115,16 @@ constexpr int kIntBlockSize = 512; c10::intrusive_ptr GpuGraphCache::Create( const int64_t num_edges, const int64_t threshold, - torch::ScalarType indptr_dtype, std::vector dtypes) { + torch::ScalarType indptr_dtype, std::vector dtypes, + bool has_original_edge_ids) { return c10::make_intrusive( - num_edges, threshold, indptr_dtype, dtypes); + num_edges, threshold, indptr_dtype, dtypes, has_original_edge_ids); } GpuGraphCache::GpuGraphCache( const int64_t num_edges, const int64_t threshold, - torch::ScalarType indptr_dtype, std::vector dtypes) { + torch::ScalarType indptr_dtype, std::vector dtypes, + bool has_original_edge_ids) { const int64_t initial_node_capacity = 1024; AT_DISPATCH_INDEX_TYPES( dtypes.at(0), "GpuGraphCache::GpuGraphCache", ([&] { @@ -149,7 +151,9 @@ GpuGraphCache::GpuGraphCache( num_edges_ = 0; indptr_ = torch::zeros(initial_node_capacity + 1, options.dtype(indptr_dtype)); - offset_ = torch::empty(indptr_.size(0) - 1, indptr_.options()); + if (!has_original_edge_ids) { + offset_ = torch::empty(indptr_.size(0) - 1, indptr_.options()); + } for (auto dtype : dtypes) { cached_edge_tensors_.push_back( torch::empty(num_edges, options.dtype(dtype))); @@ -249,8 +253,9 @@ std::tuple> GpuGraphCache::Replace( torch::Tensor seeds, torch::Tensor indices, torch::Tensor positions, int64_t num_hit, int64_t num_threshold, torch::Tensor indptr, std::vector edge_tensors) { + const auto with_edge_ids = offset_.has_value(); // The last element of edge_tensors has the edge ids. - const auto num_tensors = edge_tensors.size() - 1; + const auto num_tensors = edge_tensors.size() - with_edge_ids; TORCH_CHECK( num_tensors == cached_edge_tensors_.size(), "Same number of tensors need to be passed!"); @@ -312,8 +317,12 @@ std::tuple> GpuGraphCache::Replace( auto input = allocator.AllocateStorage(num_buffers); auto input_size = allocator.AllocateStorage(num_buffers + 1); - auto edge_id_offsets = torch::empty( - num_nodes, seeds.options().dtype(offset_.scalar_type())); + torch::optional edge_id_offsets; + if (with_edge_ids) { + edge_id_offsets = torch::empty( + num_nodes, + seeds.options().dtype(offset_.value().scalar_type())); + } const auto cache_missing_dtype_dev_ptr = cache_missing_dtype_dev.get(); const auto indices_ptr = indices.data_ptr(); @@ -321,12 +330,15 @@ std::tuple> GpuGraphCache::Replace( const auto input_ptr = input.get(); const auto input_size_ptr = input_size.get(); const auto edge_id_offsets_ptr = - edge_id_offsets.data_ptr(); + edge_id_offsets ? edge_id_offsets->data_ptr() + : nullptr; const auto cache_indptr = indptr_.data_ptr(); const auto missing_indptr = indptr.data_ptr(); - const auto cache_offset = offset_.data_ptr(); + const auto cache_offset = + offset_ ? 
offset_->data_ptr() : nullptr; const auto missing_edge_ids = - edge_tensors.back().data_ptr(); + edge_id_offsets ? edge_tensors.back().data_ptr() + : nullptr; CUB_CALL(DeviceFor::Bulk, num_buffers, [=] __device__(int64_t i) { const auto tensor_idx = i / num_nodes; const auto idx = i % num_nodes; @@ -340,14 +352,14 @@ std::tuple> GpuGraphCache::Replace( const auto offset_end = is_cached ? cache_indptr[pos + 1] : missing_indptr[idx - num_hit + 1]; - const auto edge_id = - is_cached ? cache_offset[pos] : missing_edge_ids[offset]; const auto out_idx = tensor_idx * num_nodes + original_idx; input_ptr[out_idx] = (is_cached ? cache_ptr : missing_ptr) + offset * size; input_size_ptr[out_idx] = size * (offset_end - offset); - if (i < num_nodes) { + if (edge_id_offsets_ptr && i < num_nodes) { + const auto edge_id = + is_cached ? cache_offset[pos] : missing_edge_ids[offset]; edge_id_offsets_ptr[out_idx] = edge_id; } }); @@ -390,10 +402,12 @@ std::tuple> GpuGraphCache::Replace( indptr_.size(0) * kIntGrowthFactor, indptr_.options()); new_indptr.slice(0, 0, indptr_.size(0)) = indptr_; indptr_ = new_indptr; - auto new_offset = - torch::empty(indptr_.size(0) - 1, offset_.options()); - new_offset.slice(0, 0, offset_.size(0)) = offset_; - offset_ = new_offset; + if (offset_) { + auto new_offset = + torch::empty(indptr_.size(0) - 1, offset_->options()); + new_offset.slice(0, 0, offset_->size(0)) = *offset_; + offset_ = new_offset; + } } torch::Tensor sindptr; bool enough_space; @@ -415,22 +429,32 @@ std::tuple> GpuGraphCache::Replace( } if (enough_space) { auto num_edges = num_edges_; - auto transform_input_it = thrust::make_zip_iterator( - sindptr.data_ptr() + 1, - sliced_indptr.data_ptr()); - auto transform_output_it = thrust::make_zip_iterator( - indptr_.data_ptr() + num_nodes_ + 1, - offset_.data_ptr() + num_nodes_); - THRUST_CALL( - transform, transform_input_it, - transform_input_it + sindptr.size(0) - 1, - transform_output_it, - [=] __host__ __device__( - const thrust::tuple& x) { - return thrust::make_tuple( - thrust::get<0>(x) + num_edges, - missing_edge_ids[thrust::get<1>(x)]); - }); + if (offset_) { + auto transform_input_it = thrust::make_zip_iterator( + sindptr.data_ptr() + 1, + sliced_indptr.data_ptr()); + auto transform_output_it = thrust::make_zip_iterator( + indptr_.data_ptr() + num_nodes_ + 1, + offset_->data_ptr() + num_nodes_); + THRUST_CALL( + transform, transform_input_it, + transform_input_it + sindptr.size(0) - 1, + transform_output_it, + [=] __host__ __device__( + const thrust::tuple& x) { + return thrust::make_tuple( + thrust::get<0>(x) + num_edges, + missing_edge_ids[thrust::get<1>(x)]); + }); + } else { + THRUST_CALL( + transform, sindptr.data_ptr() + 1, + sindptr.data_ptr() + sindptr.size(0), + indptr_.data_ptr() + num_nodes_ + 1, + [=] __host__ __device__(const indptr_t& x) { + return x + num_edges; + }); + } auto map = reinterpret_cast*>(map_); const dim3 block(kIntBlockSize); const dim3 grid( @@ -467,10 +491,13 @@ std::tuple> GpuGraphCache::Replace( .view(edge_tensors[i].scalar_type()) .slice(0, 0, static_cast(output_size))); } - // Append the edge ids as the last element of the output. - output_edge_tensors.push_back(ops::IndptrEdgeIdsImpl( - output_indptr, output_indptr.scalar_type(), edge_id_offsets, - static_cast(static_cast(output_size)))); + if (edge_id_offsets) { + // Append the edge ids as the last element of the output. 
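Making `offset_` optional here is what allows caching a graph that stores no original edge IDs. On the Python side the cache is driven through the `GPUGraphCache` wrapper updated later in this patch; a minimal sketch of its query/replace contract, assuming a CUDA device with compute capability of at least 70 and purely illustrative sizes:

    import torch
    from dgl.graphbolt.impl.gpu_graph_cache import GPUGraphCache

    cache = GPUGraphCache(
        1024,           # capacity, in edges, of the cached tensors
        1,              # threshold: requests before a vertex is admitted
        torch.int64,    # indptr dtype
        [torch.int64],  # dtypes of cached edge tensors (indices only here)
        False,          # has_original_edge_ids: no ORIGINAL_EDGE_ID stored
    )
    seeds = torch.tensor([0, 2, 5], device="cuda")
    missing_keys, replace_fn = cache.query(seeds)
    # The caller slices the graph's CSC structure for missing_keys (the
    # missing indptr plus one tensor per entry in dtypes) and passes the
    # result to replace_fn, which returns the combined indptr and edge
    # tensors covering all of seeds.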
+ output_edge_tensors.push_back(ops::IndptrEdgeIdsImpl( + output_indptr, output_indptr.scalar_type(), + *edge_id_offsets, + static_cast(static_cast(output_size)))); + } { thrust::counting_iterator iota{0}; diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.h b/graphbolt/src/cuda/extension/gpu_graph_cache.h index 0708f5d00917..42324ef33140 100644 --- a/graphbolt/src/cuda/extension/gpu_graph_cache.h +++ b/graphbolt/src/cuda/extension/gpu_graph_cache.h @@ -47,10 +47,13 @@ class GpuGraphCache : public torch::CustomClassHolder { * @param indptr_dtype The node id datatype. * @param dtypes The dtypes of the edge tensors to be cached. dtypes[0] is * reserved for the indices edge tensor holding node ids. + * @param has_original_edge_ids Whether the graph to be cached has original + * edge ids. */ GpuGraphCache( const int64_t num_edges, const int64_t threshold, - torch::ScalarType indptr_dtype, std::vector dtypes); + torch::ScalarType indptr_dtype, std::vector dtypes, + bool has_original_edge_ids); GpuGraphCache() = default; @@ -109,7 +112,8 @@ class GpuGraphCache : public torch::CustomClassHolder { static c10::intrusive_ptr Create( const int64_t num_edges, const int64_t threshold, - torch::ScalarType indptr_dtype, std::vector dtypes); + torch::ScalarType indptr_dtype, std::vector dtypes, + bool has_original_edge_ids); private: void* map_; // pointer to the hash table. @@ -119,7 +123,8 @@ class GpuGraphCache : public torch::CustomClassHolder { int64_t num_nodes_; // The number of cached nodes in the cache. int64_t num_edges_; // The number of cached edges in the cache. torch::Tensor indptr_; // The cached graph structure indptr tensor. - torch::Tensor offset_; // The original graph's sliced_indptr tensor. + torch::optional + offset_; // The original graph's sliced_indptr tensor. std::vector cached_edge_tensors_; // The cached graph // structure edge tensors. std::mutex mtx_; // Protects the data structure and makes it threadsafe. diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index 75f87d19aa2b..cc9137092cf6 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -23,19 +23,32 @@ class _SampleNeighborsWaiter: - def __init__(self, fn, future, seed_offsets): + def __init__( + self, fn, future, seed_offsets, fetching_original_edge_ids_is_optional + ): self.fn = fn self.future = future self.seed_offsets = seed_offsets + self.fetching_original_edge_ids_is_optional = ( + fetching_original_edge_ids_is_optional + ) def wait(self): """Returns the stored value when invoked.""" fn = self.fn C_sampled_subgraph = self.future.wait() seed_offsets = self.seed_offsets + fetching_original_edge_ids_is_optional = ( + self.fetching_original_edge_ids_is_optional + ) # Ensure there is no memory leak. 
self.fn = self.future = self.seed_offsets = None - return fn(C_sampled_subgraph, seed_offsets) + self.fetching_original_edge_ids_is_optional = None + return fn( + C_sampled_subgraph, + seed_offsets, + fetching_original_edge_ids_is_optional, + ) class FusedCSCSamplingGraph(SamplingGraph): @@ -592,6 +605,7 @@ def _convert_to_sampled_subgraph( self, C_sampled_subgraph: torch.ScriptObject, seed_offsets: Optional[list] = None, + fetching_original_edge_ids_is_optional: bool = False, ) -> SampledSubgraphImpl: """An internal function used to convert a fused homogeneous sampled subgraph to general struct 'SampledSubgraphImpl'.""" @@ -611,9 +625,15 @@ def _convert_to_sampled_subgraph( and ORIGINAL_EDGE_ID in self.edge_attributes ) original_edge_ids = ( - torch.ops.graphbolt.index_select( - self.edge_attributes[ORIGINAL_EDGE_ID], - edge_ids_in_fused_csc_sampling_graph, + ( + torch.ops.graphbolt.index_select( + self.edge_attributes[ORIGINAL_EDGE_ID], + edge_ids_in_fused_csc_sampling_graph, + ) + if not fetching_original_edge_ids_is_optional + or not edge_ids_in_fused_csc_sampling_graph.is_cuda + or not self.edge_attributes[ORIGINAL_EDGE_ID].is_pinned() + else None ) if has_original_eids else edge_ids_in_fused_csc_sampling_graph @@ -621,8 +641,8 @@ def _convert_to_sampled_subgraph( if type_per_edge is None and etype_offsets is None: # The sampled graph is already a homogeneous graph. sampled_csc = CSCFormatBase(indptr=indptr, indices=indices) - if indices is not None: - # Only needed to fetch indices. + if indices is not None and original_edge_ids is not None: + # Only needed to fetch indices or original_edge_ids. edge_ids_in_fused_csc_sampling_graph = None else: offset = self._node_type_offset_list @@ -691,11 +711,17 @@ def _convert_to_sampled_subgraph( ] ] ) - original_hetero_edge_ids[etype] = original_edge_ids[ - etype_offsets[etype_id] : etype_offsets[etype_id + 1] - ] - if indices is None: - # Only needed to fetch indices. + original_hetero_edge_ids[etype] = ( + None + if original_edge_ids is None + else original_edge_ids[ + etype_offsets[etype_id] : etype_offsets[ + etype_id + 1 + ] + ] + ) + if indices is None or original_edge_ids is None: + # Only needed to fetch indices or original edge ids. sampled_hetero_edge_ids_in_fused_csc_sampling_graph[ etype ] = edge_ids_in_fused_csc_sampling_graph[ @@ -727,7 +753,7 @@ def sample_neighbors( fanouts: torch.Tensor, replace: bool = False, probs_name: Optional[str] = None, - returning_indices_is_optional: bool = False, + returning_indices_and_original_edge_ids_are_optional: bool = False, async_op: bool = False, ) -> SampledSubgraphImpl: """Sample neighboring edges of the given nodes and return the induced @@ -768,10 +794,12 @@ def sample_neighbors( corresponding to each neighboring edge of a node. It must be a 1D floating-point or boolean tensor, with the number of elements equalling the total number of edges. - returning_indices_is_optional: bool + returning_indices_and_original_edge_ids_are_optional: bool Boolean indicating whether it is okay for the call to this function - to leave the indices tensor uninitialized. In this case, it is the - user's responsibility to gather it using the edge ids. + to leave the indices and the original edge ids tensors + uninitialized. In this case, it is the user's responsibility to + gather them using _edge_ids_in_fused_csc_sampling_graph if either is + missing. async_op: bool Boolean indicating whether the call is asynchronous. If so, the result can be obtained by calling wait on the returned future. 
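A short usage sketch of the `async_op` path documented above, assuming `graph` is a FusedCSCSamplingGraph and `seeds` is a tensor of seed node IDs on the same device; the fanout value is illustrative:

    import torch

    future = graph.sample_neighbors(
        seeds,
        torch.tensor([10]),  # one fanout value, or one per edge type
        async_op=True,
    )
    # ... overlap other work here, for example feature fetching ...
    subgraph = future.wait()  # same SampledSubgraphImpl as the sync path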
@@ -818,7 +846,7 @@ def sample_neighbors( fanouts, replace=replace, probs_or_mask=probs_or_mask, - returning_indices_is_optional=returning_indices_is_optional, + returning_indices_is_optional=returning_indices_and_original_edge_ids_are_optional, async_op=async_op, ) if async_op: @@ -826,10 +854,13 @@ def sample_neighbors( self._convert_to_sampled_subgraph, C_sampled_subgraph, seed_offsets, + returning_indices_and_original_edge_ids_are_optional, ) else: return self._convert_to_sampled_subgraph( - C_sampled_subgraph, seed_offsets + C_sampled_subgraph, + seed_offsets, + returning_indices_and_original_edge_ids_are_optional, ) def _check_sampler_arguments(self, nodes, fanouts, probs_or_mask): @@ -956,7 +987,7 @@ def sample_layer_neighbors( fanouts: torch.Tensor, replace: bool = False, probs_name: Optional[str] = None, - returning_indices_is_optional: bool = False, + returning_indices_and_original_edge_ids_are_optional: bool = False, random_seed: torch.Tensor = None, seed2_contribution: float = 0.0, async_op: bool = False, @@ -1001,10 +1032,12 @@ def sample_layer_neighbors( corresponding to each neighboring edge of a node. It must be a 1D floating-point or boolean tensor, with the number of elements equalling the total number of edges. - returning_indices_is_optional: bool + returning_indices_and_original_edge_ids_are_optional: bool Boolean indicating whether it is okay for the call to this function - to leave the indices tensor uninitialized. In this case, it is the - user's responsibility to gather it using the edge ids. + to leave the indices and the original edge ids tensors + uninitialized. In this case, it is the user's responsibility to + gather them using _edge_ids_in_fused_csc_sampling_graph if either is + missing. random_seed: torch.Tensor, optional An int64 tensor with one or two elements. @@ -1092,7 +1125,7 @@ def sample_layer_neighbors( fanouts.tolist(), replace, True, # is_labor - returning_indices_is_optional, + returning_indices_and_original_edge_ids_are_optional, probs_or_mask, random_seed, seed2_contribution, @@ -1102,10 +1135,13 @@ def sample_layer_neighbors( self._convert_to_sampled_subgraph, C_sampled_subgraph, seed_offsets, + returning_indices_and_original_edge_ids_are_optional, ) else: return self._convert_to_sampled_subgraph( - C_sampled_subgraph, seed_offsets + C_sampled_subgraph, + seed_offsets, + returning_indices_and_original_edge_ids_are_optional, ) def temporal_sample_neighbors( @@ -1512,15 +1548,21 @@ def _initialize_gpu_graph_cache( dtypes = [self.indices.dtype] if self.type_per_edge is not None: dtypes.append(self.type_per_edge.dtype) + has_original_edge_ids = False if self.edge_attributes is not None: probs_or_mask = self.edge_attributes.get(prob_name, None) if probs_or_mask is not None: dtypes.append(probs_or_mask.dtype) + original_edge_ids = self.edge_attributes.get(ORIGINAL_EDGE_ID, None) + if original_edge_ids is not None: + dtypes.append(original_edge_ids.dtype) + has_original_edge_ids = True self._gpu_graph_cache_ = GPUGraphCache( num_gpu_cached_edges, gpu_cache_threshold, self.csc_indptr.dtype, dtypes, + has_original_edge_ids, ) diff --git a/python/dgl/graphbolt/impl/gpu_graph_cache.py b/python/dgl/graphbolt/impl/gpu_graph_cache.py index e4cf78b589af..a6a640dfa6c3 100644 --- a/python/dgl/graphbolt/impl/gpu_graph_cache.py +++ b/python/dgl/graphbolt/impl/gpu_graph_cache.py @@ -17,15 +17,19 @@ class GPUGraphCache(object): The dtype of the indptr tensor of the graph. dtypes : list[torch.dtype] The dtypes of the edge tensors that are going to be cached. 
+ has_original_edge_ids : bool + Whether the graph to be cached has original edge ids. """ - def __init__(self, num_edges, threshold, indptr_dtype, dtypes): + def __init__( + self, num_edges, threshold, indptr_dtype, dtypes, has_original_edge_ids + ): major, _ = torch.cuda.get_device_capability() assert ( major >= 7 ), "GPUGraphCache is supported only on CUDA compute capability >= 70 (Volta)." self._cache = torch.ops.graphbolt.gpu_graph_cache( - num_edges, threshold, indptr_dtype, dtypes + num_edges, threshold, indptr_dtype, dtypes, has_original_edge_ids ) self.total_miss = 0 self.total_queries = 0 @@ -44,7 +48,8 @@ def query(self, keys): A tuple containing (missing_keys, replace_fn) where replace_fn is a function that should be called with the graph structure corresponding to the missing keys. Its arguments are - (Tensor, list(Tensor)). + (Tensor, list(Tensor)), where the first tensor is the missing indptr + and the second list is the missing edge tensors. """ self.total_queries += keys.shape[0] ( diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index fc834718ef4d..4229edb6be00 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -177,7 +177,8 @@ def _fetch_per_layer_async(self, minibatch): tensors_to_be_sliced.append(self.graph.type_per_edge) has_type_per_edge = True - has_probs_or_mask = None + has_probs_or_mask = False + has_original_edge_ids = False if self.graph.edge_attributes is not None: probs_or_mask = self.graph.edge_attributes.get( self.prob_name, None @@ -185,10 +186,21 @@ def _fetch_per_layer_async(self, minibatch): if probs_or_mask is not None: tensors_to_be_sliced.append(probs_or_mask) has_probs_or_mask = True + original_edge_ids = self.graph.edge_attributes.get( + ORIGINAL_EDGE_ID, None + ) + if original_edge_ids is not None: + tensors_to_be_sliced.append(original_edge_ids) + has_original_edge_ids = True # Slices the batched tensors. future = torch.ops.graphbolt.index_select_csc_batched_async( - self.graph.csc_indptr, tensors_to_be_sliced, seeds, True, None + self.graph.csc_indptr, + tensors_to_be_sliced, + seeds, + # When there are no edge ids, we assume it is arange(num_edges). 
+ not has_original_edge_ids, + None, ) yield @@ -251,19 +263,35 @@ def __init__( asynchronous=False, ): graph = sampler.__self__ - self.returning_indices_is_optional = False + self.returning_indices_and_original_edge_ids_are_optional = False + original_edge_ids = ( + None + if graph.edge_attributes is None + else graph.edge_attributes.get(ORIGINAL_EDGE_ID, None) + ) if ( overlap_fetch and sampler.__name__ == "sample_neighbors" - and graph.indices.is_pinned() + and ( + graph.indices.is_pinned() + or ( + original_edge_ids is not None + and original_edge_ids.is_pinned() + ) + ) and graph._gpu_graph_cache is None ): datapipe = datapipe.transform(self._sample_per_layer) if asynchronous: datapipe = datapipe.buffer() datapipe = datapipe.transform(self._wait_subgraph_future) + fetch_indices_and_original_edge_ids_fn = partial( + self._fetch_indices_and_original_edge_ids, + graph.indices, + original_edge_ids, + ) datapipe = ( - datapipe.transform(partial(self._fetch_indices, graph.indices)) + datapipe.transform(fetch_indices_and_original_edge_ids_fn) .buffer() .wait() ) @@ -276,7 +304,7 @@ def __init__( graph.node_type_to_id, ) ) - self.returning_indices_is_optional = True + self.returning_indices_and_original_edge_ids_are_optional = True elif overlap_fetch: datapipe = datapipe.fetch_insubgraph_data(graph, prob_name) datapipe = datapipe.transform( @@ -309,7 +337,7 @@ def _sample_per_layer(self, minibatch): self.fanout, self.replace, self.prob_name, - self.returning_indices_is_optional, + self.returning_indices_and_original_edge_ids_are_optional, async_op=self.asynchronous, **kwargs, ) @@ -341,7 +369,7 @@ def _wait_subgraph_future(minibatch): return minibatch @staticmethod - def _fetch_indices(indices, minibatch): + def _fetch_indices_and_original_edge_ids(indices, orig_edge_ids, minibatch): stream = torch.cuda.current_stream() host_to_device_stream = get_host_to_device_uva_stream() host_to_device_stream.wait_stream(stream) @@ -366,16 +394,43 @@ def record_stream(tensor): index_select(indices, edge_ids) ) minibatch._indices_needs_offset_subtraction = True - elif subgraph.sampled_csc.indices is None: - subgraph._edge_ids_in_fused_csc_sampling_graph.record_stream( - torch.cuda.current_stream() - ) - subgraph.sampled_csc.indices = record_stream( - index_select( - indices, subgraph._edge_ids_in_fused_csc_sampling_graph + if ( + orig_edge_ids is not None + and subgraph.original_edge_ids[etype] is None + ): + edge_ids = ( + subgraph._edge_ids_in_fused_csc_sampling_graph[ + etype + ] + ) + edge_ids.record_stream(torch.cuda.current_stream()) + subgraph.original_edge_ids[etype] = record_stream( + index_select(orig_edge_ids, edge_ids) + ) + else: + if subgraph.sampled_csc.indices is None: + subgraph._edge_ids_in_fused_csc_sampling_graph.record_stream( + torch.cuda.current_stream() + ) + subgraph.sampled_csc.indices = record_stream( + index_select( + indices, + subgraph._edge_ids_in_fused_csc_sampling_graph, + ) + ) + if ( + orig_edge_ids is not None + and subgraph.original_edge_ids is None + ): + subgraph._edge_ids_in_fused_csc_sampling_graph.record_stream( + torch.cuda.current_stream() + ) + subgraph.original_edge_ids = record_stream( + index_select( + orig_edge_ids, + subgraph._edge_ids_in_fused_csc_sampling_graph, + ) ) - ) - minibatch._indices_needs_offset_subtraction = True subgraph._edge_ids_in_fused_csc_sampling_graph = None minibatch.wait = torch.cuda.current_stream().record_event().wait diff --git a/tests/python/pytorch/graphbolt/impl/test_gpu_graph_cache.py 
b/tests/python/pytorch/graphbolt/impl/test_gpu_graph_cache.py index fdd7329cccbf..e6034cf77019 100644 --- a/tests/python/pytorch/graphbolt/impl/test_gpu_graph_cache.py +++ b/tests/python/pytorch/graphbolt/impl/test_gpu_graph_cache.py @@ -36,7 +36,8 @@ ], ) @pytest.mark.parametrize("cache_size", [4, 9, 11]) -def test_gpu_graph_cache(indptr_dtype, dtype, cache_size): +@pytest.mark.parametrize("with_edge_ids", [True, False]) +def test_gpu_graph_cache(indptr_dtype, dtype, cache_size, with_edge_ids): indices_dtype = torch.int32 indptr = torch.tensor([0, 3, 6, 10], dtype=indptr_dtype, pin_memory=True) indices = torch.arange(0, indptr[-1], dtype=indices_dtype, pin_memory=True) @@ -48,6 +49,7 @@ def test_gpu_graph_cache(indptr_dtype, dtype, cache_size): 2, indptr.dtype, [e.dtype for e in edge_tensors], + not with_edge_ids, ) for i in range(10): @@ -59,7 +61,7 @@ def test_gpu_graph_cache(indptr_dtype, dtype, cache_size): missing_indptr, missing_edge_tensors, ) = torch.ops.graphbolt.index_select_csc_batched( - indptr, edge_tensors, missing_keys, True, None + indptr, edge_tensors, missing_keys, with_edge_ids, None ) output_indptr, output_edge_tensors = replace( missing_indptr, missing_edge_tensors @@ -69,7 +71,7 @@ def test_gpu_graph_cache(indptr_dtype, dtype, cache_size): reference_indptr, reference_edge_tensors, ) = torch.ops.graphbolt.index_select_csc_batched( - indptr, edge_tensors, keys, True, None + indptr, edge_tensors, keys, with_edge_ids, None ) assert torch.equal(output_indptr, reference_indptr) diff --git a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py index 3f827923e2f5..547b8867fd41 100644 --- a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py +++ b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py @@ -3,13 +3,12 @@ import backend as F -import dgl import dgl.graphbolt as gb import pytest import torch -def get_hetero_graph(): +def get_hetero_graph(include_original_edge_ids): # COO graph: # [0, 0, 1, 1, 2, 2, 3, 3, 4, 4] # [2, 4, 2, 3, 0, 1, 1, 0, 0, 1] @@ -26,6 +25,10 @@ def get_hetero_graph(): ), "mask": torch.BoolTensor([1, 0, 1, 0, 1, 1, 1, 0, 1, 1]), } + if include_original_edge_ids: + edge_attributes[gb.ORIGINAL_EDGE_ID] = ( + torch.arange(indices.size(0), 0, -1) - 1 + ) node_type_offset = torch.LongTensor([0, 1, 3, 6]) return gb.fused_csc_sampling_graph( indptr, @@ -44,8 +47,9 @@ def get_hetero_graph(): @pytest.mark.parametrize("sorted", [False, True]) @pytest.mark.parametrize("num_cached_edges", [0, 10]) @pytest.mark.parametrize("is_pinned", [False, True]) +@pytest.mark.parametrize("has_orig_edge_ids", [False, True]) def test_NeighborSampler_GraphFetch( - hetero, prob_name, sorted, num_cached_edges, is_pinned + hetero, prob_name, sorted, num_cached_edges, is_pinned, has_orig_edge_ids ): if sorted: items = torch.arange(3) @@ -53,7 +57,7 @@ def test_NeighborSampler_GraphFetch( items = torch.tensor([2, 0, 1]) names = "seeds" itemset = gb.ItemSet(items, names=names) - graph = get_hetero_graph() + graph = get_hetero_graph(has_orig_edge_ids) graph = graph.pin_memory_() if is_pinned else graph.to(F.ctx()) if hetero: itemset = gb.HeteroItemSet({"n3": itemset}) From 2ce0ea0d0d3f01acbada79aac1b2295c7cee90aa Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sun, 18 Aug 2024 19:21:54 -0400 Subject: [PATCH 34/78] [GraphBolt] Fix hetero sampling bug with single fanout. 
(#7719) --- graphbolt/src/fused_csc_sampling_graph.cc | 3 ++- .../impl/test_fused_csc_sampling_graph.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/graphbolt/src/fused_csc_sampling_graph.cc b/graphbolt/src/fused_csc_sampling_graph.cc index 1e5904bd782d..2666e725f2eb 100644 --- a/graphbolt/src/fused_csc_sampling_graph.cc +++ b/graphbolt/src/fused_csc_sampling_graph.cc @@ -430,9 +430,10 @@ auto GetPickFn( type_per_edge.value(), probs_or_mask, args, picked_data_ptr, seed_offset, subgraph_indptr_ptr, etype_id_to_num_picked_offset); } else { + picked_data_ptr += subgraph_indptr_ptr[seed_offset]; int64_t num_sampled = Pick( offset, num_neighbors, fanouts[0], replace, options, probs_or_mask, - args, picked_data_ptr + subgraph_indptr_ptr[seed_offset]); + args, picked_data_ptr); if (type_per_edge) { std::sort(picked_data_ptr, picked_data_ptr + num_sampled); } diff --git a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py index fca1dbfdbcbc..93985441370a 100644 --- a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py +++ b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py @@ -1702,6 +1702,21 @@ def test_sample_neighbors_homo( assert subgraph.original_row_node_ids is None +@pytest.mark.parametrize("labor", [False, True]) +def test_sample_neighbors_hetero_single_fanout(labor): + u, i = torch.randint(20, size=(1000,)), torch.randint(10, size=(1000,)) + graph = dgl.heterograph({("u", "w", "i"): (u, i), ("i", "b", "u"): (i, u)}) + + graph = gb.from_dglgraph(graph).to(F.ctx()) + + sampler = graph.sample_layer_neighbors if labor else graph.sample_neighbors + + for i in range(11): + nodes = {"u": torch.randint(10, (100,), device=F.ctx())} + sampler(nodes, fanouts=torch.tensor([-1])) + # Should reach here without crashing. + + @pytest.mark.parametrize("indptr_dtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("indices_dtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("labor", [False, True]) From 0b2d538c01f49b6ef88cb452c4a641ed33dc9156 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 20 Aug 2024 02:46:07 -0400 Subject: [PATCH 35/78] [GraphBolt][io_uring] Document `QueueAndBufferAcquirer`. (#7713) --- graphbolt/src/cnumpy.cc | 8 ++++---- graphbolt/src/cnumpy.h | 24 ++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/graphbolt/src/cnumpy.cc b/graphbolt/src/cnumpy.cc index 26db0ff1a840..37365e5fc733 100644 --- a/graphbolt/src/cnumpy.cc +++ b/graphbolt/src/cnumpy.cc @@ -179,10 +179,10 @@ torch::Tensor OnDiskNpyArray::IndexSelectIOUringImpl(torch::Tensor index) { CircularQueue read_queue(8 * kGroupSize); int64_t num_submitted = 0; int64_t num_completed = 0; - auto [acquired_queue_handle, my_read_buffer2] = queue_source.get(); + auto [acquired_queue_handle, read_buffer_source2] = queue_source.get(); auto &io_uring_queue = acquired_queue_handle.get(); // Capturing structured binding is available only in C++20, so we rename. - auto my_read_buffer = my_read_buffer2; + auto read_buffer_source = read_buffer_source2; auto submit_fn = [&](int64_t submission_minimum_batch_size) { if (read_queue.Size() < submission_minimum_batch_size) return; TORCH_CHECK( // Check for sqe overflow. 
@@ -200,8 +200,8 @@ torch::Tensor OnDiskNpyArray::IndexSelectIOUringImpl(torch::Tensor index) { }; for (int64_t read_buffer_slot = 0; true;) { auto request_read_buffer = [&]() { - return my_read_buffer + (aligned_length_ + block_size_) * - (read_buffer_slot++ % (8 * kGroupSize)); + return read_buffer_source + (aligned_length_ + block_size_) * + (read_buffer_slot++ % (8 * kGroupSize)); }; const auto num_requested_items = std::max( std::min( diff --git a/graphbolt/src/cnumpy.h b/graphbolt/src/cnumpy.h index f853ab70d9ae..793116580963 100644 --- a/graphbolt/src/cnumpy.h +++ b/graphbolt/src/cnumpy.h @@ -146,12 +146,21 @@ class OnDiskNpyArray : public torch::CustomClassHolder { static inline std::mutex available_queues_mtx_; // available_queues_ mutex. static inline std::vector available_queues_; - struct QueueAndBufferAcquirer { - struct UniqueQueue { + /** + * @brief This class is meant to distribute the available read buffers and the + * statically declared io_uring queues among the worker threads. + */ + class QueueAndBufferAcquirer { + public: + class UniqueQueue { + public: UniqueQueue(int thread_id) : thread_id_(thread_id) {} UniqueQueue(const UniqueQueue&) = delete; UniqueQueue& operator=(const UniqueQueue&) = delete; + /** + * @brief Returns the queue back to the pool. + */ ~UniqueQueue() { { // We give back the slot we used. @@ -161,6 +170,9 @@ class OnDiskNpyArray : public torch::CustomClassHolder { semaphore_.release(); } + /** + * @brief Returns the raw io_uring queue. + */ ::io_uring& get() const { return io_uring_queue_[thread_id_]; } private: @@ -179,6 +191,14 @@ class OnDiskNpyArray : public torch::CustomClassHolder { } } + /** + * @brief Returns the secured io_uring queue and the read buffer as a pair. + * The raw io_uring queue can be accessed by calling `.get()` on the + * returned UniqueQueue object. + * + * @note The returned UniqueQueue object manages the lifetime of the + * io_uring queue. Its destructor returns the queue back to the pool. + */ std::pair get() { // We consume a slot from the semaphore to use a queue. if (entering_first_.test_and_set(std::memory_order_relaxed)) { From 513a50f1896bc0169c123363540f821880e84ef2 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 20 Aug 2024 06:04:12 -0400 Subject: [PATCH 36/78] [GraphBolt][Doc] Improve documentation. 
(#7718) --- .../disk_based_feature/node_classification.py | 4 ++-- .../graphbolt/pyg/labor/node_classification.py | 4 ++-- python/dgl/graphbolt/impl/cpu_cached_feature.py | 15 ++++++++++----- python/dgl/graphbolt/impl/gpu_cached_feature.py | 14 ++++++++++---- .../graphbolt/impl/torch_based_feature_store.py | 14 +++++++------- .../graphbolt/impl/test_cpu_cached_feature.py | 1 + .../graphbolt/impl/test_gpu_cached_feature.py | 1 + 7 files changed, 33 insertions(+), 20 deletions(-) diff --git a/examples/graphbolt/disk_based_feature/node_classification.py b/examples/graphbolt/disk_based_feature/node_classification.py index aaca410947cc..626f4f37e99a 100644 --- a/examples/graphbolt/disk_based_feature/node_classification.py +++ b/examples/graphbolt/disk_based_feature/node_classification.py @@ -464,7 +464,7 @@ def main(): args.feature_device == "pinned", ) cpu_cached_feature = features[("node", None, "feat")] - cpu_cache_miss_rate_fn = lambda: cpu_cached_feature._feature.miss_rate + cpu_cache_miss_rate_fn = lambda: cpu_cached_feature.miss_rate else: cpu_cache_miss_rate_fn = lambda: 1 @@ -479,7 +479,7 @@ def main(): int(args.gpu_cache_size_in_gigabytes * 1024 * 1024 * 1024), ) gpu_cached_feature = features[("node", None, "feat")] - gpu_cache_miss_rate_fn = lambda: gpu_cached_feature._feature.miss_rate + gpu_cache_miss_rate_fn = lambda: gpu_cached_feature.miss_rate else: gpu_cache_miss_rate_fn = lambda: 1 diff --git a/examples/graphbolt/pyg/labor/node_classification.py b/examples/graphbolt/pyg/labor/node_classification.py index ab0c843fcec8..7be4c195182d 100644 --- a/examples/graphbolt/pyg/labor/node_classification.py +++ b/examples/graphbolt/pyg/labor/node_classification.py @@ -501,7 +501,7 @@ def main(): args.feature_device == "pinned", ) cpu_cached_feature = features[("node", None, "feat")] - cpu_cache_miss_rate_fn = lambda: cpu_cached_feature._feature.miss_rate + cpu_cache_miss_rate_fn = lambda: cpu_cached_feature.miss_rate else: cpu_cache_miss_rate_fn = lambda: 1 if args.num_gpu_cached_features > 0 and args.feature_device != "cuda": @@ -510,7 +510,7 @@ def main(): args.num_gpu_cached_features * feature_num_bytes, ) gpu_cached_feature = features[("node", None, "feat")] - gpu_cache_miss_rate_fn = lambda: gpu_cached_feature._feature.miss_rate + gpu_cache_miss_rate_fn = lambda: gpu_cached_feature.miss_rate else: gpu_cache_miss_rate_fn = lambda: 1 diff --git a/python/dgl/graphbolt/impl/cpu_cached_feature.py b/python/dgl/graphbolt/impl/cpu_cached_feature.py index e6da31f87de2..2c845f7ac436 100644 --- a/python/dgl/graphbolt/impl/cpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/cpu_cached_feature.py @@ -30,8 +30,8 @@ class CPUCachedFeature(Feature): will hang due to all cache entries being read and/or write locked, resulting in a deadlock. policy : str - The cache eviction policy algorithm name. See gb.impl.CPUFeatureCache - for the list of available policies. + The cache eviction policy algorithm name. The available policies are + ["s3-fifo", "sieve", "lru", "clock"]. Default is "sieve". pin_memory : bool Whether the cache storage should be allocated on system pinned memory. Default is False. @@ -94,9 +94,9 @@ def read_async(self, ids: torch.Tensor): ------- A generator object. The returned generator object returns a future on - `read_async_num_stages(ids.device)`th invocation. The return result - can be accessed by calling `.wait()`. on the returned future object. - It is undefined behavior to call `.wait()` more than once. + ``read_async_num_stages(ids.device)``th invocation. 
The return result + can be accessed by calling ``.wait()``. on the returned future object. + It is undefined behavior to call ``.wait()`` more than once. Examples -------- @@ -449,3 +449,8 @@ def update(self, value: torch.Tensor, ids: torch.Tensor = None): else: self._fallback_feature.update(value, ids) self._feature.replace(ids, value) + + @property + def miss_rate(self): + """Returns the cache miss rate since creation.""" + return self._feature.miss_rate diff --git a/python/dgl/graphbolt/impl/gpu_cached_feature.py b/python/dgl/graphbolt/impl/gpu_cached_feature.py index e19c8752fa2a..5a92a0175e12 100644 --- a/python/dgl/graphbolt/impl/gpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/gpu_cached_feature.py @@ -17,7 +17,8 @@ def num_cache_items(cache_capacity_in_bytes, single_item): class GPUCachedFeature(Feature): - r"""GPU cached feature wrapping a fallback feature. + r"""GPU cached feature wrapping a fallback feature. It uses the least + recently used (LRU) algorithm as the cache eviction policy. Places the GPU cache to torch.cuda.current_device(). @@ -100,9 +101,9 @@ def read_async(self, ids: torch.Tensor): ------- A generator object. The returned generator object returns a future on - `read_async_num_stages(ids.device)`th invocation. The return result - can be accessed by calling `.wait()`. on the returned future object. - It is undefined behavior to call `.wait()` more than once. + ``read_async_num_stages(ids.device)``th invocation. The return result + can be accessed by calling ``.wait()``. on the returned future object. + It is undefined behavior to call ``.wait()`` more than once. Examples -------- @@ -219,3 +220,8 @@ def update(self, value: torch.Tensor, ids: torch.Tensor = None): else: self._fallback_feature.update(value, ids) self._feature.replace(ids, value) + + @property + def miss_rate(self): + """Returns the cache miss rate since creation.""" + return self._feature.miss_rate diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index 47c1b469b47c..9337c8cb4f0e 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -151,9 +151,9 @@ def read_async(self, ids: torch.Tensor): ------- A generator object. The returned generator object returns a future on - `read_async_num_stages(ids.device)`th invocation. The return result - can be accessed by calling `.wait()`. on the returned future object. - It is undefined behavior to call `.wait()` more than once. + ``read_async_num_stages(ids.device)``th invocation. The return result + can be accessed by calling ``.wait()``. on the returned future object. + It is undefined behavior to call ``.wait()`` more than once. Examples -------- @@ -424,9 +424,9 @@ def read_async(self, ids: torch.Tensor): ------- A generator object. The returned generator object returns a future on - `read_async_num_stages(ids.device)`th invocation. The return result - can be accessed by calling `.wait()`. on the returned future object. - It is undefined behavior to call `.wait()` more than once. + ``read_async_num_stages(ids.device)``th invocation. The return result + can be accessed by calling ``.wait()``. on the returned future object. + It is undefined behavior to call ``.wait()`` more than once. Examples -------- @@ -520,7 +520,7 @@ def to(self, _): # pylint: disable=invalid-name return self def pin_memory_(self): # pylint: disable=invalid-name - """Placeholder `DiskBasedFeature` pin_memory_ implementation. 
It is a no-op.""" + r"""Placeholder `DiskBasedFeature` pin_memory_ implementation. It is a no-op.""" gb_warning( "`DiskBasedFeature.pin_memory_()` is not supported. Leaving unmodified." ) diff --git a/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py b/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py index 47ce0e866f20..582d9fe93908 100644 --- a/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py +++ b/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py @@ -79,6 +79,7 @@ def test_cpu_cached_feature(dtype, policy): total_miss = feat_store_b._feature.total_miss feat_store_b.read(torch.tensor([0, 1])) assert total_miss == feat_store_b._feature.total_miss + assert feat_store_a._feature.miss_rate == feat_store_a.miss_rate # Test get the size of the entire feature with ids. assert feat_store_a.size() == torch.Size([3]) diff --git a/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py b/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py index 05020157a826..9a9019ccab55 100644 --- a/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py +++ b/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py @@ -83,6 +83,7 @@ def test_gpu_cached_feature(dtype, cache_size_a, cache_size_b): total_miss = feat_store_b._feature.total_miss feat_store_b.read(torch.tensor([0, 1]).to("cuda")) assert total_miss == feat_store_b._feature.total_miss + assert feat_store_a._feature.miss_rate == feat_store_a.miss_rate # Test get the size of the entire feature with ids. assert feat_store_a.size() == torch.Size([3]) From e5a5d76012ae11f1bc366bc43dd233971f0e5058 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 20 Aug 2024 13:46:39 -0400 Subject: [PATCH 37/78] [GraphBolt] Always enable prefetch before `CopyTo`. (#7721) --- python/dgl/graphbolt/dataloader.py | 33 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/python/dgl/graphbolt/dataloader.py b/python/dgl/graphbolt/dataloader.py index 8b00faba2cca..b964f71efe40 100644 --- a/python/dgl/graphbolt/dataloader.py +++ b/python/dgl/graphbolt/dataloader.py @@ -170,23 +170,22 @@ def __init__( # before it. This enables enables non_blocking copies to the device. # Prefetching enables the data pipeline up to the CopyTo to run in a # separate thread. - if torch.cuda.is_available(): - copiers = find_dps(datapipe_graph, CopyTo) - if len(copiers) > 1: - gb_warning( - "Multiple CopyTo operations were found in the datapipe graph." - " This case is not officially supported." - ) - for copier in copiers: - if copier.device.type == "cuda": - datapipe_graph = replace_dp( - datapipe_graph, - copier, - # Add prefetch so that CPU and GPU can run concurrently. - copier.datapipe.prefetch(2).copy_to( - copier.device, non_blocking=True - ), - ) + copiers = find_dps(datapipe_graph, CopyTo) + if len(copiers) > 1: + gb_warning( + "Multiple CopyTo operations were found in the datapipe graph." + " This case is not officially supported." + ) + for copier in copiers: + # We enable the prefetch at all times for good CPU only performance. + datapipe_graph = replace_dp( + datapipe_graph, + copier, + # Add prefetch so that CPU and GPU can run concurrently. + copier.datapipe.prefetch(2).copy_to( + copier.device, non_blocking=True + ), + ) # The stages after feature fetching is still done in the main process. # So we set num_workers to 0 here. 
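To see what the dataloader change above affects, here is a minimal end-to-end sketch, assuming a FusedCSCSamplingGraph `graph` and a feature store `features` holding a homogeneous "feat" key are already constructed; with this patch, gb.DataLoader inserts the prefetch in front of the copy stage even on CPU-only runs:

    import torch
    import dgl.graphbolt as gb

    itemset = gb.ItemSet(torch.arange(100), names="seeds")
    datapipe = gb.ItemSampler(itemset, batch_size=16)
    datapipe = datapipe.sample_neighbor(graph, [5, 5])
    datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    datapipe = datapipe.copy_to(device)
    # gb.DataLoader rewrites the datapipe graph so the CopyTo above becomes
    # prefetch(2).copy_to(device, non_blocking=True), letting the stages before
    # the copy run in a background thread.
    dataloader = gb.DataLoader(datapipe, num_workers=0)
    for minibatch in dataloader:
        pass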
From aca40c869c0c4ddc4a710b8b016e9e6d158f6f66 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 20 Aug 2024 14:38:36 -0400 Subject: [PATCH 38/78] [GraphBolt] Make `CachePolicy` hetero capable [1] (#7723) --- graphbolt/src/partitioned_cache_policy.cc | 54 +++++++++++++--- graphbolt/src/partitioned_cache_policy.h | 17 ++++-- .../dgl/graphbolt/impl/cpu_cached_feature.py | 11 ++-- python/dgl/graphbolt/impl/feature_cache.py | 20 ++++-- .../dgl/graphbolt/impl/gpu_cached_feature.py | 17 ++++-- .../graphbolt/impl/test_feature_cache.py | 61 ++++++++++++------- 6 files changed, 127 insertions(+), 53 deletions(-) diff --git a/graphbolt/src/partitioned_cache_policy.cc b/graphbolt/src/partitioned_cache_policy.cc index 61dfc3b0cb96..4aee026ff0d4 100644 --- a/graphbolt/src/partitioned_cache_policy.cc +++ b/graphbolt/src/partitioned_cache_policy.cc @@ -20,6 +20,7 @@ #include "./partitioned_cache_policy.h" #include +#include #include #include "./utils.h" @@ -27,7 +28,29 @@ namespace graphbolt { namespace storage { -constexpr int kIntGrainSize = 64; +constexpr int kIntGrainSize = 256; + +torch::Tensor AddOffset(torch::Tensor keys, int64_t offset) { + if (offset == 0) return keys; + auto output = torch::empty_like( + keys, keys.options().pinned_memory(utils::is_pinned(keys))); + AT_DISPATCH_INDEX_TYPES( + keys.scalar_type(), "AddOffset", ([&] { + auto keys_ptr = keys.data_ptr(); + auto output_ptr = output.data_ptr(); + graphbolt::parallel_for_each( + 0, keys.numel(), kIntGrainSize, [&](int64_t i) { + const auto result = keys_ptr[i] + offset; + if constexpr (!std::is_same_v) { + TORCH_CHECK( + std::numeric_limits::min() <= result && + result <= std::numeric_limits::max()); + } + output_ptr[i] = static_cast(result); + }); + })); + return output; +} template PartitionedCachePolicy::PartitionedCachePolicy( @@ -117,7 +140,8 @@ PartitionedCachePolicy::Partition(torch::Tensor keys) { std::tuple< torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> -PartitionedCachePolicy::Query(torch::Tensor keys) { +PartitionedCachePolicy::Query(torch::Tensor keys, const int64_t offset) { + keys = AddOffset(keys, offset); if (policies_.size() == 1) { std::lock_guard lock(mtx_); auto [positions, output_indices, missing_keys, found_pointers] = @@ -133,6 +157,7 @@ PartitionedCachePolicy::Query(torch::Tensor keys) { found_and_missing_offsets_ptr[3] = missing_keys.size(0); auto found_offsets = found_and_missing_offsets.slice(0, 0, 2); auto missing_offsets = found_and_missing_offsets.slice(0, 2); + missing_keys = AddOffset(missing_keys, -offset); return {positions, output_indices, missing_keys, found_pointers, found_offsets, missing_offsets}; }; @@ -211,17 +236,18 @@ PartitionedCachePolicy::Query(torch::Tensor keys) { num_missing * missing_keys.element_size()); }); auto found_offsets = result_offsets_tensor.slice(0, 0, policies_.size() + 1); + missing_keys = AddOffset(missing_keys, -offset); return std::make_tuple( positions, output_indices, missing_keys, found_pointers, found_offsets, missing_offsets); } c10::intrusive_ptr>> -PartitionedCachePolicy::QueryAsync(torch::Tensor keys) { +PartitionedCachePolicy::QueryAsync(torch::Tensor keys, const int64_t offset) { return async([=] { auto [positions, output_indices, missing_keys, found_pointers, found_offsets, - missing_offsets] = Query(keys); + missing_offsets] = Query(keys, offset); return std::vector{positions, output_indices, missing_keys, found_pointers, found_offsets, missing_offsets}; }); @@ -230,7 +256,9 @@ 
PartitionedCachePolicy::QueryAsync(torch::Tensor keys) { std::tuple< torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> -PartitionedCachePolicy::QueryAndReplace(torch::Tensor keys) { +PartitionedCachePolicy::QueryAndReplace( + torch::Tensor keys, const int64_t offset) { + keys = AddOffset(keys, offset); if (policies_.size() == 1) { std::lock_guard lock(mtx_); auto [positions, output_indices, pointers, missing_keys] = @@ -246,6 +274,7 @@ PartitionedCachePolicy::QueryAndReplace(torch::Tensor keys) { found_and_missing_offsets_ptr[3] = missing_keys.size(0); auto found_offsets = found_and_missing_offsets.slice(0, 0, 2); auto missing_offsets = found_and_missing_offsets.slice(0, 2); + missing_keys = AddOffset(missing_keys, -offset); return {positions, output_indices, pointers, missing_keys, found_offsets, missing_offsets}; } @@ -336,17 +365,19 @@ PartitionedCachePolicy::QueryAndReplace(torch::Tensor keys) { num_missing * missing_keys.element_size()); }); auto found_offsets = result_offsets_tensor.slice(0, 0, policies_.size() + 1); + missing_keys = AddOffset(missing_keys, -offset); return std::make_tuple( positions, output_indices, pointers, missing_keys, found_offsets, missing_offsets); } c10::intrusive_ptr>> -PartitionedCachePolicy::QueryAndReplaceAsync(torch::Tensor keys) { +PartitionedCachePolicy::QueryAndReplaceAsync( + torch::Tensor keys, const int64_t offset) { return async([=] { auto [positions, output_indices, pointers, missing_keys, found_offsets, - missing_offsets] = QueryAndReplace(keys); + missing_offsets] = QueryAndReplace(keys, offset); return std::vector{positions, output_indices, pointers, missing_keys, found_offsets, missing_offsets}; }); @@ -354,7 +385,9 @@ PartitionedCachePolicy::QueryAndReplaceAsync(torch::Tensor keys) { std::tuple PartitionedCachePolicy::Replace( - torch::Tensor keys, torch::optional offsets) { + torch::Tensor keys, torch::optional offsets, + const int64_t offset) { + keys = AddOffset(keys, offset); if (policies_.size() == 1) { std::lock_guard lock(mtx_); auto [positions, pointers] = policies_[0]->Replace(keys); @@ -419,9 +452,10 @@ PartitionedCachePolicy::Replace( c10::intrusive_ptr>> PartitionedCachePolicy::ReplaceAsync( - torch::Tensor keys, torch::optional offsets) { + torch::Tensor keys, torch::optional offsets, + const int64_t offset) { return async([=] { - auto [positions, pointers, offsets_out] = Replace(keys, offsets); + auto [positions, pointers, offsets_out] = Replace(keys, offsets, offset); return std::vector{positions, pointers, offsets_out}; }); } diff --git a/graphbolt/src/partitioned_cache_policy.h b/graphbolt/src/partitioned_cache_policy.h index 030aab4f4054..4b9dc7710b0e 100644 --- a/graphbolt/src/partitioned_cache_policy.h +++ b/graphbolt/src/partitioned_cache_policy.h @@ -56,6 +56,7 @@ class PartitionedCachePolicy : public torch::CustomClassHolder { /** * @brief The policy query function. * @param keys The keys to query the cache. + * @param offset The offset to be added to the keys. 
* * @return (positions, indices, missing_keys, found_ptrs, found_offsets, * missing_offsets), where positions has the locations of the keys which were @@ -69,14 +70,15 @@ class PartitionedCachePolicy : public torch::CustomClassHolder { std::tuple< torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> - Query(torch::Tensor keys); + Query(torch::Tensor keys, int64_t offset); c10::intrusive_ptr>> QueryAsync( - torch::Tensor keys); + torch::Tensor keys, int64_t offset); /** * @brief The policy query and then replace function. * @param keys The keys to query the cache. + * @param offset The offset to be added to the keys. * * @return (positions, indices, pointers, missing_keys, found_offsets, * missing_offsets), where positions has the locations of the keys which were @@ -92,25 +94,28 @@ class PartitionedCachePolicy : public torch::CustomClassHolder { std::tuple< torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor> - QueryAndReplace(torch::Tensor keys); + QueryAndReplace(torch::Tensor keys, int64_t offset); c10::intrusive_ptr>> QueryAndReplaceAsync( - torch::Tensor keys); + torch::Tensor keys, int64_t offset); /** * @brief The policy replace function. * @param keys The keys to query the cache. * @param offsets The partition offsets for the keys. + * @param offset The offset to be added to the keys. * * @return (positions, pointers, offsets), where positions holds the locations * of the replaced entries in the cache, pointers holds the CacheKey pointers * for the inserted keys and offsets holds the partition offsets for pointers. */ std::tuple Replace( - torch::Tensor keys, torch::optional offsets); + torch::Tensor keys, torch::optional offsets, + int64_t offset); c10::intrusive_ptr>> ReplaceAsync( - torch::Tensor keys, torch::optional offsets); + torch::Tensor keys, torch::optional offsets, + int64_t offset); template void ReadingWritingCompletedImpl( diff --git a/python/dgl/graphbolt/impl/cpu_cached_feature.py b/python/dgl/graphbolt/impl/cpu_cached_feature.py index 2c845f7ac436..858e5912046e 100644 --- a/python/dgl/graphbolt/impl/cpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/cpu_cached_feature.py @@ -61,6 +61,7 @@ def __init__( pin_memory=pin_memory, ) self._is_pinned = pin_memory + self._offset = 0 def read(self, ids: torch.Tensor = None): """Read the feature by index. @@ -79,7 +80,7 @@ def read(self, ids: torch.Tensor = None): if ids is None: return self._fallback_feature.read() return self._feature.query_and_replace( - ids.cpu(), self._fallback_feature.read + ids.cpu(), self._fallback_feature.read, self._offset ).to(ids.device) def read_async(self, ids: torch.Tensor): @@ -124,7 +125,7 @@ def read_async(self, ids: torch.Tensor): yield # first stage is done. ids_copy_event.synchronize() - policy_future = policy.query_and_replace_async(ids) + policy_future = policy.query_and_replace_async(ids, self._offset) yield @@ -241,7 +242,7 @@ def wait(self): yield # first stage is done. 
ids_copy_event.synchronize() - policy_future = policy.query_and_replace_async(ids) + policy_future = policy.query_and_replace_async(ids, self._offset) yield @@ -319,7 +320,7 @@ def wait(self): yield _Waiter([values_copy_event, writing_completed], values) else: - policy_future = policy.query_and_replace_async(ids) + policy_future = policy.query_and_replace_async(ids, self._offset) yield @@ -448,7 +449,7 @@ def update(self, value: torch.Tensor, ids: torch.Tensor = None): ) else: self._fallback_feature.update(value, ids) - self._feature.replace(ids, value) + self._feature.replace(ids, value, None, self._offset) @property def miss_rate(self): diff --git a/python/dgl/graphbolt/impl/feature_cache.py b/python/dgl/graphbolt/impl/feature_cache.py index 136ad2a7314c..38093db5b06a 100644 --- a/python/dgl/graphbolt/impl/feature_cache.py +++ b/python/dgl/graphbolt/impl/feature_cache.py @@ -59,13 +59,15 @@ def __init__( self.total_miss = 0 self.total_queries = 0 - def query(self, keys): + def query(self, keys, offset=0): """Queries the cache. Parameters ---------- keys : Tensor The keys to query the cache with. + offset : int + The offset to be added to the keys. Default is 0. Returns ------- @@ -85,14 +87,14 @@ def query(self, keys): found_pointers, found_offsets, missing_offsets, - ) = self._policy.query(keys) + ) = self._policy.query(keys, offset) values = self._cache.query(positions, index, keys.shape[0]) self._policy.reading_completed(found_pointers, found_offsets) self.total_miss += missing_keys.shape[0] missing_index = index[positions.size(0) :] return values, missing_index, missing_keys, missing_offsets - def query_and_replace(self, keys, reader_fn): + def query_and_replace(self, keys, reader_fn, offset=0): """Queries the cache. Then inserts the keys that are not found by reading them by calling `reader_fn(missing_keys)`, which are then inserted into the cache using the selected caching policy algorithm @@ -105,6 +107,8 @@ def query_and_replace(self, keys, reader_fn): reader_fn : reader_fn(keys: torch.Tensor) -> torch.Tensor A function that will take a missing keys tensor and will return their values. + offset : int + The offset to be added to the keys. Default is 0. Returns ------- @@ -120,7 +124,7 @@ def query_and_replace(self, keys, reader_fn): missing_keys, found_offsets, missing_offsets, - ) = self._policy.query_and_replace(keys) + ) = self._policy.query_and_replace(keys, offset) found_cnt = keys.size(0) - missing_keys.size(0) found_positions = positions[:found_cnt] values = self._cache.query(found_positions, index, keys.shape[0]) @@ -136,7 +140,7 @@ def query_and_replace(self, keys, reader_fn): self._policy.writing_completed(missing_pointers, missing_offsets) return values - def replace(self, keys, values, offsets=None): + def replace(self, keys, values, offsets=None, offset=0): """Inserts key-value pairs into the cache using the selected caching policy algorithm to remove old key-value pairs if it is full. @@ -148,8 +152,12 @@ def replace(self, keys, values, offsets=None): The values to insert to the cache. offsets : Tensor, optional The partition offsets of the keys. + offset : int + The offset to be added to the keys. Default is 0. 
""" - positions, pointers, offsets = self._policy.replace(keys, offsets) + positions, pointers, offsets = self._policy.replace( + keys, offsets, offset + ) self._cache.replace(positions, values) self._policy.writing_completed(pointers, offsets) diff --git a/python/dgl/graphbolt/impl/gpu_cached_feature.py b/python/dgl/graphbolt/impl/gpu_cached_feature.py index 5a92a0175e12..55c89d07811f 100644 --- a/python/dgl/graphbolt/impl/gpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/gpu_cached_feature.py @@ -63,6 +63,7 @@ def __init__(self, fallback_feature: Feature, max_cache_size_in_bytes: int): feat0 = fallback_feature.read(torch.tensor([0])) cache_size = num_cache_items(max_cache_size_in_bytes, feat0) self._feature = GPUCache((cache_size,) + feat0.shape[1:], feat0.dtype) + self._offset = 0 def read(self, ids: torch.Tensor = None): """Read the feature by index. @@ -83,8 +84,12 @@ def read(self, ids: torch.Tensor = None): """ if ids is None: return self._fallback_feature.read() - values, missing_index, missing_keys = self._feature.query(ids) - missing_values = self._fallback_feature.read(missing_keys) + values, missing_index, missing_keys = self._feature.query( + ids if self._offset == 0 else ids + self._offset + ) + missing_values = self._fallback_feature.read( + missing_keys if self._offset == 0 else missing_keys - self._offset + ) values[missing_index] = missing_values self._feature.replace(missing_keys, missing_values) return values @@ -115,13 +120,17 @@ def read_async(self, ids: torch.Tensor): >>> assert stage + 1 == feature.read_async_num_stages(ids.device) >>> result = future.wait() # result contains the read values. """ - future = self._feature.query(ids, async_op=True) + future = self._feature.query( + ids if self._offset == 0 else ids + self._offset, async_op=True + ) yield values, missing_index, missing_keys = future.wait() - fallback_reader = self._fallback_feature.read_async(missing_keys) + fallback_reader = self._fallback_feature.read_async( + missing_keys if self._offset == 0 else missing_keys - self._offset + ) fallback_num_stages = self._fallback_feature.read_async_num_stages( missing_keys.device ) diff --git a/tests/python/pytorch/graphbolt/impl/test_feature_cache.py b/tests/python/pytorch/graphbolt/impl/test_feature_cache.py index 478a5eb52d56..1a50d5dc2e0b 100644 --- a/tests/python/pytorch/graphbolt/impl/test_feature_cache.py +++ b/tests/python/pytorch/graphbolt/impl/test_feature_cache.py @@ -6,7 +6,7 @@ from dgl import graphbolt as gb -def _test_query_and_replace(policy1, policy2, keys): +def _test_query_and_replace(policy1, policy2, keys, offset): # Testing query_and_replace equivalence to query and then replace. 
( _, @@ -15,7 +15,7 @@ def _test_query_and_replace(policy1, policy2, keys): missing_keys, found_offsets, missing_offsets, - ) = policy1.query_and_replace(keys) + ) = policy1.query_and_replace(keys, offset) found_cnt = keys.size(0) - missing_keys.size(0) found_pointers = pointers[:found_cnt] policy1.reading_completed(found_pointers, found_offsets) @@ -29,15 +29,15 @@ def _test_query_and_replace(policy1, policy2, keys): found_pointers2, found_offsets2, missing_offsets2, - ) = policy2.query(keys) + ) = policy2.query(keys + offset, 0) policy2.reading_completed(found_pointers2, found_offsets2) (_, missing_pointers2, missing_offsets2) = policy2.replace( - missing_keys2, missing_offsets2 + missing_keys2, missing_offsets2, 0 ) policy2.writing_completed(missing_pointers2, missing_offsets2) assert torch.equal(index, index2) - assert torch.equal(missing_keys, missing_keys2) + assert torch.equal(missing_keys, missing_keys2 - offset) @pytest.mark.parametrize("offsets", [False, True]) @@ -59,7 +59,8 @@ def _test_query_and_replace(policy1, policy2, keys): @pytest.mark.parametrize("feature_size", [2, 16]) @pytest.mark.parametrize("num_parts", [1, 2, None]) @pytest.mark.parametrize("policy", ["s3-fifo", "sieve", "lru", "clock"]) -def test_feature_cache(offsets, dtype, feature_size, num_parts, policy): +@pytest.mark.parametrize("offset", [0, 1111111]) +def test_feature_cache(offsets, dtype, feature_size, num_parts, policy, offset): cache_size = 32 * ( torch.get_num_threads() if num_parts is None else num_parts ) @@ -79,7 +80,9 @@ def test_feature_cache(offsets, dtype, feature_size, num_parts, policy): reader_fn = lambda keys: a[keys] keys = torch.tensor([0, 1]) - values, missing_index, missing_keys, missing_offsets = cache.query(keys) + values, missing_index, missing_keys, missing_offsets = cache.query( + keys, offset + ) if not offsets: missing_offsets = None assert torch.equal( @@ -88,17 +91,21 @@ def test_feature_cache(offsets, dtype, feature_size, num_parts, policy): ) missing_values = a[missing_keys] - cache.replace(missing_keys, missing_values, missing_offsets) + cache.replace(missing_keys, missing_values, missing_offsets, offset) values[missing_index] = missing_values assert torch.equal(values, a[keys]) - assert torch.equal(cache2.query_and_replace(keys, reader_fn), a[keys]) + assert torch.equal( + cache2.query_and_replace(keys, reader_fn, offset), a[keys] + ) - _test_query_and_replace(policy1, policy2, keys) + _test_query_and_replace(policy1, policy2, keys, offset) pin_memory = F._default_context_str == "gpu" keys = torch.arange(1, 33, pin_memory=pin_memory) - values, missing_index, missing_keys, missing_offsets = cache.query(keys) + values, missing_index, missing_keys, missing_offsets = cache.query( + keys, offset + ) if not offsets: missing_offsets = None assert torch.equal( @@ -108,38 +115,48 @@ def test_feature_cache(offsets, dtype, feature_size, num_parts, policy): assert not pin_memory or values.is_pinned() missing_values = a[missing_keys] - cache.replace(missing_keys, missing_values, missing_offsets) + cache.replace(missing_keys, missing_values, missing_offsets, offset) values[missing_index] = missing_values assert torch.equal(values, a[keys]) - assert torch.equal(cache2.query_and_replace(keys, reader_fn), a[keys]) + assert torch.equal( + cache2.query_and_replace(keys, reader_fn, offset), a[keys] + ) - _test_query_and_replace(policy1, policy2, keys) + _test_query_and_replace(policy1, policy2, keys, offset) - values, missing_index, missing_keys, missing_offsets = cache.query(keys) + values, 
missing_index, missing_keys, missing_offsets = cache.query( + keys, offset + ) if not offsets: missing_offsets = None assert torch.equal(missing_keys.flip([0]), torch.tensor([])) missing_values = a[missing_keys] - cache.replace(missing_keys, missing_values, missing_offsets) + cache.replace(missing_keys, missing_values, missing_offsets, offset) values[missing_index] = missing_values assert torch.equal(values, a[keys]) - assert torch.equal(cache2.query_and_replace(keys, reader_fn), a[keys]) + assert torch.equal( + cache2.query_and_replace(keys, reader_fn, offset), a[keys] + ) - _test_query_and_replace(policy1, policy2, keys) + _test_query_and_replace(policy1, policy2, keys, offset) - values, missing_index, missing_keys, missing_offsets = cache.query(keys) + values, missing_index, missing_keys, missing_offsets = cache.query( + keys, offset + ) if not offsets: missing_offsets = None assert torch.equal(missing_keys.flip([0]), torch.tensor([])) missing_values = a[missing_keys] - cache.replace(missing_keys, missing_values, missing_offsets) + cache.replace(missing_keys, missing_values, missing_offsets, offset) values[missing_index] = missing_values assert torch.equal(values, a[keys]) - assert torch.equal(cache2.query_and_replace(keys, reader_fn), a[keys]) + assert torch.equal( + cache2.query_and_replace(keys, reader_fn, offset), a[keys] + ) - _test_query_and_replace(policy1, policy2, keys) + _test_query_and_replace(policy1, policy2, keys, offset) assert cache.miss_rate == cache2.miss_rate From 470cdc578943ad219398aad1264fe2f19e2dedd8 Mon Sep 17 00:00:00 2001 From: Andrei Ivanov <32910461+drivanov@users.noreply.github.com> Date: Tue, 20 Aug 2024 22:23:54 -0700 Subject: [PATCH 39/78] [Warnings] Removing warnings appearing in `feature_fetcher` tests. (#7633) --- .../pytorch/graphbolt/test_feature_fetcher.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/python/pytorch/graphbolt/test_feature_fetcher.py b/tests/python/pytorch/graphbolt/test_feature_fetcher.py index e945d90d2389..6e8d1975e53f 100644 --- a/tests/python/pytorch/graphbolt/test_feature_fetcher.py +++ b/tests/python/pytorch/graphbolt/test_feature_fetcher.py @@ -1,8 +1,7 @@ import random -from enum import Enum +from functools import partial import dgl.graphbolt as gb -import pytest import torch from torch.utils.data.datapipes.iter import Mapper @@ -68,6 +67,10 @@ def test_FeatureFetcher_homo(): assert len(list(fetcher_dp)) == 5 +def _func(fn, minibatch): + return fn(minibatch) + + def test_FeatureFetcher_with_edges_homo(): graph = gb_test_utils.rand_csc_graph(20, 0.15, bidirection_edge=True) a = torch.tensor( @@ -106,7 +109,8 @@ def add_node_and_edge_ids(minibatch): itemset = gb.ItemSet(torch.arange(10), names="seeds") item_sampler_dp = gb.ItemSampler(itemset, batch_size=2) - converter_dp = Mapper(item_sampler_dp, add_node_and_edge_ids) + fn = partial(_func, add_node_and_edge_ids) + converter_dp = Mapper(item_sampler_dp, fn) fetcher_dp = gb.FeatureFetcher(converter_dp, feature_store, ["a"], ["b"]) assert len(list(fetcher_dp)) == 5 @@ -232,7 +236,8 @@ def add_node_and_edge_ids(minibatch): } ) item_sampler_dp = gb.ItemSampler(itemset, batch_size=2) - converter_dp = Mapper(item_sampler_dp, add_node_and_edge_ids) + fn = partial(_func, add_node_and_edge_ids) + converter_dp = Mapper(item_sampler_dp, fn) # "n3:e3:n3" is not in the sampled edges. # Do not fetch feature for "n2:e2:n1". 
node_feature_keys = {"n1": ["a"]} From 0e649fc68c14e3d9fbc96e59b893880a65b5f155 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 21 Aug 2024 11:15:09 -0400 Subject: [PATCH 40/78] [GraphBolt] Rename feature cache files and add `max_size`. (#7729) --- graphbolt/src/feature_cache.cc | 5 +++++ graphbolt/src/feature_cache.h | 1 - graphbolt/src/python_binding.cc | 1 + python/dgl/graphbolt/impl/__init__.py | 4 ++-- python/dgl/graphbolt/impl/cpu_cached_feature.py | 2 +- .../impl/{feature_cache.py => cpu_feature_cache.py} | 9 +++++++++ python/dgl/graphbolt/impl/gpu_cached_feature.py | 8 +++++--- .../impl/{gpu_cache.py => gpu_feature_cache.py} | 9 +++++++-- 8 files changed, 30 insertions(+), 9 deletions(-) rename python/dgl/graphbolt/impl/{feature_cache.py => cpu_feature_cache.py} (95%) rename python/dgl/graphbolt/impl/{gpu_cache.py => gpu_feature_cache.py} (88%) diff --git a/graphbolt/src/feature_cache.cc b/graphbolt/src/feature_cache.cc index 6c7738712aee..4f451bd3ef65 100644 --- a/graphbolt/src/feature_cache.cc +++ b/graphbolt/src/feature_cache.cc @@ -66,7 +66,12 @@ torch::Tensor FeatureCache::IndexSelect(torch::Tensor positions) { } void FeatureCache::Replace(torch::Tensor positions, torch::Tensor values) { + TORCH_CHECK(positions.size(0) == values.size(0)); + if (values.numel() == 0) return; const auto row_bytes = values.slice(0, 0, 1).numel() * values.element_size(); + TORCH_CHECK( + row_bytes == tensor_.slice(0, 0, 1).numel() * tensor_.element_size(), + "The # bytes of a single row should match the cache's."); auto values_ptr = reinterpret_cast(values.data_ptr()); const auto tensor_ptr = reinterpret_cast(tensor_.data_ptr()); const auto positions_ptr = positions.data_ptr(); diff --git a/graphbolt/src/feature_cache.h b/graphbolt/src/feature_cache.h index 3a5a05a86843..43e98c1c615f 100644 --- a/graphbolt/src/feature_cache.h +++ b/graphbolt/src/feature_cache.h @@ -87,7 +87,6 @@ struct FeatureCache : public torch::CustomClassHolder { const std::vector& shape, torch::ScalarType dtype, bool pin_memory); - private: torch::Tensor tensor_; }; diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index e8d54f9f9a47..85bf17ed7024 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -167,6 +167,7 @@ TORCH_LIBRARY(graphbolt, m) { "clock_cache_policy", &storage::PartitionedCachePolicy::Create); m.class_("FeatureCache") + .def_readonly("tensor", &storage::FeatureCache::tensor_) .def("index_select", &storage::FeatureCache::IndexSelect) .def("query", &storage::FeatureCache::Query) .def("query_async", &storage::FeatureCache::QueryAsync) diff --git a/python/dgl/graphbolt/impl/__init__.py b/python/dgl/graphbolt/impl/__init__.py index 5b92bb83f078..19fef44e462c 100644 --- a/python/dgl/graphbolt/impl/__init__.py +++ b/python/dgl/graphbolt/impl/__init__.py @@ -1,7 +1,7 @@ """Implementation of GraphBolt.""" from .basic_feature_store import * from .fused_csc_sampling_graph import * -from .gpu_cache import * +from .gpu_feature_cache import * from .gpu_cached_feature import * from .in_subgraph_sampler import * from .legacy_dataset import * @@ -13,5 +13,5 @@ from .torch_based_feature_store import * from .uniform_negative_sampler import * from .gpu_graph_cache import * -from .feature_cache import * +from .cpu_feature_cache import * from .cpu_cached_feature import * diff --git a/python/dgl/graphbolt/impl/cpu_cached_feature.py b/python/dgl/graphbolt/impl/cpu_cached_feature.py index 858e5912046e..8fa626c2cf73 100644 --- 
a/python/dgl/graphbolt/impl/cpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/cpu_cached_feature.py @@ -5,7 +5,7 @@ from ..base import get_device_to_host_uva_stream, get_host_to_device_uva_stream from ..feature_store import Feature -from .feature_cache import CPUFeatureCache +from .cpu_feature_cache import CPUFeatureCache __all__ = ["CPUCachedFeature"] diff --git a/python/dgl/graphbolt/impl/feature_cache.py b/python/dgl/graphbolt/impl/cpu_feature_cache.py similarity index 95% rename from python/dgl/graphbolt/impl/feature_cache.py rename to python/dgl/graphbolt/impl/cpu_feature_cache.py index 38093db5b06a..74e054033bb0 100644 --- a/python/dgl/graphbolt/impl/feature_cache.py +++ b/python/dgl/graphbolt/impl/cpu_feature_cache.py @@ -59,6 +59,15 @@ def __init__( self.total_miss = 0 self.total_queries = 0 + def is_pinned(self): + """Returns True if the cache storage is pinned.""" + return self._cache.tensor.is_pinned() + + @property + def max_size_in_bytes(self): + """Return the size taken by the cache in bytes.""" + return self._cache.tensor.nbytes + def query(self, keys, offset=0): """Queries the cache. diff --git a/python/dgl/graphbolt/impl/gpu_cached_feature.py b/python/dgl/graphbolt/impl/gpu_cached_feature.py index 55c89d07811f..621349f4d419 100644 --- a/python/dgl/graphbolt/impl/gpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/gpu_cached_feature.py @@ -4,7 +4,7 @@ from ..feature_store import Feature -from .gpu_cache import GPUCache +from .gpu_feature_cache import GPUFeatureCache __all__ = ["GPUCachedFeature"] @@ -62,7 +62,9 @@ def __init__(self, fallback_feature: Feature, max_cache_size_in_bytes: int): # Fetching the feature dimension from the underlying feature. feat0 = fallback_feature.read(torch.tensor([0])) cache_size = num_cache_items(max_cache_size_in_bytes, feat0) - self._feature = GPUCache((cache_size,) + feat0.shape[1:], feat0.dtype) + self._feature = GPUFeatureCache( + (cache_size,) + feat0.shape[1:], feat0.dtype + ) self._offset = 0 def read(self, ids: torch.Tensor = None): @@ -223,7 +225,7 @@ def update(self, value: torch.Tensor, ids: torch.Tensor = None): value.shape[0], ) self._feature = None # Destroy the existing cache first. - self._feature = GPUCache( + self._feature = GPUFeatureCache( (cache_size,) + feat0.shape[1:], feat0.dtype ) else: diff --git a/python/dgl/graphbolt/impl/gpu_cache.py b/python/dgl/graphbolt/impl/gpu_feature_cache.py similarity index 88% rename from python/dgl/graphbolt/impl/gpu_cache.py rename to python/dgl/graphbolt/impl/gpu_feature_cache.py index 413fa5527a7a..3ef2fd154cd2 100644 --- a/python/dgl/graphbolt/impl/gpu_cache.py +++ b/python/dgl/graphbolt/impl/gpu_feature_cache.py @@ -1,16 +1,21 @@ """HugeCTR gpu_cache wrapper for graphbolt.""" +from functools import reduce +from operator import mul + import torch -class GPUCache(object): +class GPUFeatureCache(object): """High-level wrapper for GPU embedding cache""" def __init__(self, cache_shape, dtype): major, _ = torch.cuda.get_device_capability() assert ( major >= 7 - ), "GPUCache is supported only on CUDA compute capability >= 70 (Volta)." + ), "GPUFeatureCache is supported only on CUDA compute capability >= 70 (Volta)." 
self._cache = torch.ops.graphbolt.gpu_cache(cache_shape, dtype) + element_size = torch.tensor([], dtype=dtype).element_size() + self.max_size_in_bytes = reduce(mul, cache_shape) * element_size self.total_miss = 0 self.total_queries = 0 From 8bdcd7eeea8e59804fba55f68685a57dde0220c8 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 21 Aug 2024 11:27:33 -0400 Subject: [PATCH 41/78] [GraphBolt] `Feature.count()`. (#7730) --- python/dgl/graphbolt/feature_store.py | 33 +++++++++++++++++++ .../dgl/graphbolt/impl/cpu_cached_feature.py | 10 ++++++ .../dgl/graphbolt/impl/gpu_cached_feature.py | 10 ++++++ .../impl/torch_based_feature_store.py | 20 +++++++++++ .../impl/test_basic_feature_store.py | 4 ++- .../graphbolt/impl/test_cpu_cached_feature.py | 4 ++- .../impl/test_disk_based_feature_store.py | 4 ++- .../graphbolt/impl/test_gpu_cached_feature.py | 4 ++- .../impl/test_torch_based_feature_store.py | 4 ++- 9 files changed, 88 insertions(+), 5 deletions(-) diff --git a/python/dgl/graphbolt/feature_store.py b/python/dgl/graphbolt/feature_store.py index 33efbd70891f..25c5f2fe9353 100644 --- a/python/dgl/graphbolt/feature_store.py +++ b/python/dgl/graphbolt/feature_store.py @@ -93,6 +93,16 @@ def size(self): """ raise NotImplementedError + def count(self): + """Get the count of the feature. + + Returns + ------- + int + The count of the feature. + """ + raise NotImplementedError + def update(self, value: torch.Tensor, ids: torch.Tensor = None): """Update the feature. @@ -194,6 +204,29 @@ def size( """ return self.__getitem__((domain, type_name, feature_name)).size() + def count( + self, + domain: str, + type_name: str, + feature_name: str, + ): + """Get the count the specified feature in the feature store. + + Parameters + ---------- + domain : str + The domain of the feature such as "node", "edge" or "graph". + type_name : str + The node or edge type name. + feature_name : str + The feature name. + Returns + ------- + int + The count of the specified feature in the feature store. + """ + return self.__getitem__((domain, type_name, feature_name)).count() + def metadata( self, domain: str, diff --git a/python/dgl/graphbolt/impl/cpu_cached_feature.py b/python/dgl/graphbolt/impl/cpu_cached_feature.py index 8fa626c2cf73..96bb31fc6b86 100644 --- a/python/dgl/graphbolt/impl/cpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/cpu_cached_feature.py @@ -422,6 +422,16 @@ def size(self): """ return self._fallback_feature.size() + def count(self): + """Get the count of the feature. + + Returns + ------- + int + The count of the feature. + """ + return self._fallback_feature.count() + def update(self, value: torch.Tensor, ids: torch.Tensor = None): """Update the feature. diff --git a/python/dgl/graphbolt/impl/gpu_cached_feature.py b/python/dgl/graphbolt/impl/gpu_cached_feature.py index 621349f4d419..c6903e208698 100644 --- a/python/dgl/graphbolt/impl/gpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/gpu_cached_feature.py @@ -203,6 +203,16 @@ def size(self): """ return self._fallback_feature.size() + def count(self): + """Get the count of the feature. + + Returns + ------- + int + The count of the feature. + """ + return self._fallback_feature.count() + def update(self, value: torch.Tensor, ids: torch.Tensor = None): """Update the feature. 
diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index 9337c8cb4f0e..42d5e7a859f1 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -239,6 +239,16 @@ def size(self): """ return self._tensor.size()[1:] + def count(self): + """Get the count of the feature. + + Returns + ------- + int + The count of the feature. + """ + return self._tensor.size()[0] + def update(self, value: torch.Tensor, ids: torch.Tensor = None): """Update the feature store. @@ -493,6 +503,16 @@ def size(self): """ return self._tensor.size()[1:] + def count(self): + """Get the count of the feature. + + Returns + ------- + int + The count of the feature. + """ + return self._tensor.size()[0] + def update(self, value: torch.Tensor, ids: torch.Tensor = None): """Disk based feature does not support update for now.""" raise NotImplementedError diff --git a/tests/python/pytorch/graphbolt/impl/test_basic_feature_store.py b/tests/python/pytorch/graphbolt/impl/test_basic_feature_store.py index d82e5a81135c..261ac9d36bca 100644 --- a/tests/python/pytorch/graphbolt/impl/test_basic_feature_store.py +++ b/tests/python/pytorch/graphbolt/impl/test_basic_feature_store.py @@ -43,9 +43,11 @@ def test_basic_feature_store_homo(): torch.tensor([[[1, 2], [3, 4]]]), ) - # Test get the size of the entire feature. + # Test get the size and count of the entire feature. assert feature_store.size("node", None, "a") == torch.Size([3]) assert feature_store.size("node", None, "b") == torch.Size([2, 2]) + assert feature_store.count("node", None, "a") == a.size(0) + assert feature_store.count("node", None, "b") == b.size(0) # Test get metadata of the feature. assert feature_store.metadata("node", None, "a") == metadata diff --git a/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py b/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py index 582d9fe93908..ea8c5e9122a9 100644 --- a/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py +++ b/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py @@ -81,9 +81,11 @@ def test_cpu_cached_feature(dtype, policy): assert total_miss == feat_store_b._feature.total_miss assert feat_store_a._feature.miss_rate == feat_store_a.miss_rate - # Test get the size of the entire feature with ids. + # Test get the size and count of the entire feature. assert feat_store_a.size() == torch.Size([3]) assert feat_store_b.size() == torch.Size([2, 2]) + assert feat_store_a.count() == a.size(0) + assert feat_store_b.count() == b.size(0) # Test update the entire feature. feat_store_a.update(torch.tensor([[0, 1, 2], [3, 5, 2]], dtype=dtype)) diff --git a/tests/python/pytorch/graphbolt/impl/test_disk_based_feature_store.py b/tests/python/pytorch/graphbolt/impl/test_disk_based_feature_store.py index 1fd5d3f9d3c7..300d98a4efc1 100644 --- a/tests/python/pytorch/graphbolt/impl/test_disk_based_feature_store.py +++ b/tests/python/pytorch/graphbolt/impl/test_disk_based_feature_store.py @@ -82,9 +82,11 @@ def test_disk_based_feature(): ind_c = torch.randint(low=0, high=c.size(0), size=(4111,)) assert_equal(feature_c.read(ind_c), c[ind_c]) - # Test get the size of the entire feature. + # Test get the size and count of the entire feature. assert feature_a.size() == torch.Size([3]) assert feature_b.size() == torch.Size([2, 2]) + assert feature_a.count() == a.size(0) + assert feature_b.count() == b.size(0) # Test get metadata of the feature. 
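The relationship exercised by these tests is that `size()` reports the per-item shape, everything after the leading dimension, while the new `count()` reports how many items the feature stores, the leading dimension itself. A minimal sketch with an in-memory `TorchBasedFeature`:

import torch
from dgl import graphbolt as gb

# 4 items, each a length-3 float vector.
feat = gb.TorchBasedFeature(torch.arange(12, dtype=torch.float32).reshape(4, 3))

assert feat.size() == torch.Size([3])    # shape of a single item
assert feat.count() == 4                 # number of items stored
assert feat.read().shape == (4, 3)       # count() rows, each of shape size()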
assert feature_a.metadata() == metadata diff --git a/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py b/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py index 9a9019ccab55..4e2e2fabcd91 100644 --- a/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py +++ b/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py @@ -85,9 +85,11 @@ def test_gpu_cached_feature(dtype, cache_size_a, cache_size_b): assert total_miss == feat_store_b._feature.total_miss assert feat_store_a._feature.miss_rate == feat_store_a.miss_rate - # Test get the size of the entire feature with ids. + # Test get the size and count of the entire feature. assert feat_store_a.size() == torch.Size([3]) assert feat_store_b.size() == torch.Size([2, 2]) + assert feat_store_a.count() == a.size(0) + assert feat_store_b.count() == b.size(0) # Test update the entire feature. feat_store_a.update( diff --git a/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py b/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py index 445e5dbe8d71..ff821b8092ff 100644 --- a/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py +++ b/tests/python/pytorch/graphbolt/impl/test_torch_based_feature_store.py @@ -79,9 +79,11 @@ def test_torch_based_feature(in_memory): ), ) - # Test get the size of the entire feature. + # Test get the size and count of the entire feature. assert feature_a.size() == torch.Size([3]) assert feature_b.size() == torch.Size([2, 2]) + assert feature_a.count() == 1 + assert feature_b.count() == 3 # Test get metadata of the feature. assert feature_a.metadata() == metadata From c45d299c1d0064d2c4b1d0eb1deea365d9d503a0 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 21 Aug 2024 13:27:26 -0400 Subject: [PATCH 42/78] [GraphBolt] `CachedFeature` can handle hetero features now. (#7731) --- docs/source/api/python/dgl.graphbolt.rst | 2 + .../disk_based_feature/node_classification.py | 4 +- .../pyg/labor/node_classification.py | 4 +- .../pyg/node_classification_advanced.py | 2 +- .../multigpu/graphbolt/node_classification.py | 2 +- python/dgl/graphbolt/feature_store.py | 64 +++++++++- .../dgl/graphbolt/impl/cpu_cached_feature.py | 110 +++++++++++------- .../dgl/graphbolt/impl/gpu_cached_feature.py | 83 +++++++++---- .../graphbolt/impl/test_cpu_cached_feature.py | 8 +- .../graphbolt/impl/test_gpu_cached_feature.py | 16 +-- .../impl/test_hetero_cached_feature.py | 40 +++++++ 11 files changed, 252 insertions(+), 83 deletions(-) create mode 100644 tests/python/pytorch/graphbolt/impl/test_hetero_cached_feature.py diff --git a/docs/source/api/python/dgl.graphbolt.rst b/docs/source/api/python/dgl.graphbolt.rst index 9f2220f10d85..98e9cc71552d 100644 --- a/docs/source/api/python/dgl.graphbolt.rst +++ b/docs/source/api/python/dgl.graphbolt.rst @@ -56,7 +56,9 @@ collection of features. 
TorchBasedFeature TorchBasedFeatureStore DiskBasedFeature + cpu_cached_feature CPUCachedFeature + gpu_cached_feature GPUCachedFeature diff --git a/examples/graphbolt/disk_based_feature/node_classification.py b/examples/graphbolt/disk_based_feature/node_classification.py index 626f4f37e99a..be477e23c3d7 100644 --- a/examples/graphbolt/disk_based_feature/node_classification.py +++ b/examples/graphbolt/disk_based_feature/node_classification.py @@ -457,7 +457,7 @@ def main(): if args.cpu_cache_size_in_gigabytes > 0 and isinstance( features[("node", None, "feat")], gb.DiskBasedFeature ): - features[("node", None, "feat")] = gb.CPUCachedFeature( + features[("node", None, "feat")] = gb.cpu_cached_feature( features[("node", None, "feat")], int(args.cpu_cache_size_in_gigabytes * 1024 * 1024 * 1024), args.cpu_feature_cache_policy, @@ -474,7 +474,7 @@ def main(): host-to-device copy operations for this feature. """ if args.gpu_cache_size_in_gigabytes > 0 and args.feature_device != "cuda": - features[("node", None, "feat")] = gb.GPUCachedFeature( + features[("node", None, "feat")] = gb.gpu_cached_feature( features[("node", None, "feat")], int(args.gpu_cache_size_in_gigabytes * 1024 * 1024 * 1024), ) diff --git a/examples/graphbolt/pyg/labor/node_classification.py b/examples/graphbolt/pyg/labor/node_classification.py index 7be4c195182d..8c1cef5f4915 100644 --- a/examples/graphbolt/pyg/labor/node_classification.py +++ b/examples/graphbolt/pyg/labor/node_classification.py @@ -494,7 +494,7 @@ def main(): if args.num_cpu_cached_features > 0 and isinstance( features[("node", None, "feat")], gb.DiskBasedFeature ): - features[("node", None, "feat")] = gb.CPUCachedFeature( + features[("node", None, "feat")] = gb.cpu_cached_feature( features[("node", None, "feat")], args.num_cpu_cached_features * feature_num_bytes, args.cpu_feature_cache_policy, @@ -505,7 +505,7 @@ def main(): else: cpu_cache_miss_rate_fn = lambda: 1 if args.num_gpu_cached_features > 0 and args.feature_device != "cuda": - features[("node", None, "feat")] = gb.GPUCachedFeature( + features[("node", None, "feat")] = gb.gpu_cached_feature( features[("node", None, "feat")], args.num_gpu_cached_features * feature_num_bytes, ) diff --git a/examples/graphbolt/pyg/node_classification_advanced.py b/examples/graphbolt/pyg/node_classification_advanced.py index 27bc82275d77..44eea9566573 100644 --- a/examples/graphbolt/pyg/node_classification_advanced.py +++ b/examples/graphbolt/pyg/node_classification_advanced.py @@ -441,7 +441,7 @@ def main(): num_classes = dataset.tasks[0].metadata["num_classes"] if args.gpu_cache_size > 0 and args.feature_device != "cuda": - features._features[("node", None, "feat")] = gb.GPUCachedFeature( + features._features[("node", None, "feat")] = gb.gpu_cached_feature( features._features[("node", None, "feat")], args.gpu_cache_size, ) diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py index 6c5ce52942dc..f4d1e40f0d6a 100644 --- a/examples/multigpu/graphbolt/node_classification.py +++ b/examples/multigpu/graphbolt/node_classification.py @@ -302,7 +302,7 @@ def run(rank, world_size, args, devices, dataset): out_size = num_classes if args.gpu_cache_size > 0 and args.storage_device != "cuda": - feature[("node", None, "feat")] = gb.GPUCachedFeature( + feature[("node", None, "feat")] = gb.gpu_cached_feature( feature[("node", None, "feat")], args.gpu_cache_size, ) diff --git a/python/dgl/graphbolt/feature_store.py b/python/dgl/graphbolt/feature_store.py index 
25c5f2fe9353..6eb61da2dd67 100644 --- a/python/dgl/graphbolt/feature_store.py +++ b/python/dgl/graphbolt/feature_store.py @@ -1,10 +1,16 @@ """Feature store for GraphBolt.""" -from typing import NamedTuple +from typing import Dict, NamedTuple, Union import torch -__all__ = ["Feature", "FeatureStore", "FeatureKey"] +__all__ = [ + "bytes_to_number_of_items", + "Feature", + "FeatureStore", + "FeatureKey", + "wrap_with_cached_feature", +] class FeatureKey(NamedTuple): @@ -289,3 +295,57 @@ def keys(self): feat_name)` format. """ raise NotImplementedError + + +def bytes_to_number_of_items(cache_capacity_in_bytes, single_item): + """Returns the number of rows to be cached.""" + item_bytes = single_item.nbytes + # Round up so that we never get a size of 0, unless bytes is 0. + return (cache_capacity_in_bytes + item_bytes - 1) // item_bytes + + +def wrap_with_cached_feature( + cached_feature_type, + fallback_features: Union[Feature, Dict[FeatureKey, Feature]], + max_cache_size_in_bytes: int, + *args, + **kwargs, +) -> Union[Feature, Dict[FeatureKey, Feature]]: + """Wraps the given features with the given cached feature type using + a single cache instance.""" + if not isinstance(fallback_features, dict): + assert isinstance(fallback_features, Feature) + return wrap_with_cached_feature( + cached_feature_type, + {"a": fallback_features}, + max_cache_size_in_bytes, + *args, + **kwargs, + )["a"] + row_bytes = None + cache = None + wrapped_features = {} + offset = 0 + for feature_key, fallback_feature in fallback_features.items(): + # Fetching the feature dimension from the underlying feature. + feat0 = fallback_feature.read(torch.tensor([0])) + if row_bytes is None: + row_bytes = feat0.nbytes + else: + assert ( + row_bytes == feat0.nbytes + ), "The # bytes of a single row of the features should match." + cache_size = bytes_to_number_of_items(max_cache_size_in_bytes, feat0) + if cache is None: + cache = cached_feature_type._cache_type( + cache_shape=(cache_size,) + feat0.shape[1:], + dtype=feat0.dtype, + *args, + **kwargs, + ) + wrapped_features[feature_key] = cached_feature_type( + fallback_feature, cache=cache, offset=offset + ) + offset += fallback_feature.count() + + return wrapped_features diff --git a/python/dgl/graphbolt/impl/cpu_cached_feature.py b/python/dgl/graphbolt/impl/cpu_cached_feature.py index 96bb31fc6b86..80fd2974671e 100644 --- a/python/dgl/graphbolt/impl/cpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/cpu_cached_feature.py @@ -1,48 +1,44 @@ """CPU cached feature for GraphBolt.""" +from typing import Dict, Optional, Union import torch from ..base import get_device_to_host_uva_stream, get_host_to_device_uva_stream -from ..feature_store import Feature +from ..feature_store import ( + bytes_to_number_of_items, + Feature, + FeatureKey, + wrap_with_cached_feature, +) from .cpu_feature_cache import CPUFeatureCache -__all__ = ["CPUCachedFeature"] - - -def bytes_to_number_of_items(cache_capacity_in_bytes, single_item): - """Returns the number of rows to be cached.""" - item_bytes = single_item.nbytes - # Round up so that we never get a size of 0, unless bytes is 0. - return (cache_capacity_in_bytes + item_bytes - 1) // item_bytes +__all__ = ["CPUCachedFeature", "cpu_cached_feature"] class CPUCachedFeature(Feature): - r"""CPU cached feature wrapping a fallback feature. + r"""CPU cached feature wrapping a fallback feature. Use `cpu_feature_cache` + to construct an instance of this class. Parameters ---------- fallback_feature : Feature The fallback feature. 
- max_cache_size_in_bytes : int - The capacity of the cache in bytes. The size should be a few factors - larger than the size of each read request. Otherwise, the caching policy - will hang due to all cache entries being read and/or write locked, - resulting in a deadlock. - policy : str - The cache eviction policy algorithm name. The available policies are - ["s3-fifo", "sieve", "lru", "clock"]. Default is "sieve". - pin_memory : bool - Whether the cache storage should be allocated on system pinned memory. - Default is False. + cache : CPUFeatureCache + A CPUFeatureCache instance to serve as the cache backend. + offset : int, optional + The offset value to add to the given ids before using the cache. This + parameter is useful if multiple `CPUCachedFeature`s are sharing a single + CPUFeatureCache object. """ + _cache_type = CPUFeatureCache + def __init__( self, fallback_feature: Feature, - max_cache_size_in_bytes: int, - policy: str = None, - pin_memory: bool = False, + cache: CPUFeatureCache, + offset: int = 0, ): super(CPUCachedFeature, self).__init__() assert isinstance(fallback_feature, Feature), ( @@ -50,18 +46,8 @@ def __init__( f"{type(fallback_feature)}." ) self._fallback_feature = fallback_feature - self.max_cache_size_in_bytes = max_cache_size_in_bytes - # Fetching the feature dimension from the underlying feature. - feat0 = fallback_feature.read(torch.tensor([0])) - cache_size = bytes_to_number_of_items(max_cache_size_in_bytes, feat0) - self._feature = CPUFeatureCache( - (cache_size,) + feat0.shape[1:], - feat0.dtype, - policy=policy, - pin_memory=pin_memory, - ) - self._is_pinned = pin_memory - self._offset = 0 + self._feature = cache + self._offset = offset def read(self, ids: torch.Tensor = None): """Read the feature by index. @@ -111,7 +97,7 @@ def read_async(self, ids: torch.Tensor): """ policy = self._feature._policy cache = self._feature._cache - if ids.is_cuda and self._is_pinned: + if ids.is_cuda and self.is_pinned(): ids_device = ids.device current_stream = torch.cuda.current_stream() device_to_host_stream = get_device_to_host_uva_stream() @@ -450,18 +436,64 @@ def update(self, value: torch.Tensor, ids: torch.Tensor = None): feat0 = value[:1] self._fallback_feature.update(value) cache_size = min( - bytes_to_number_of_items(self.max_cache_size_in_bytes, feat0), + bytes_to_number_of_items(self.cache_size_in_bytes, feat0), value.shape[0], ) self._feature = None # Destroy the existing cache first. - self._feature = CPUFeatureCache( + self._feature = self._cache_type( (cache_size,) + feat0.shape[1:], feat0.dtype ) else: self._fallback_feature.update(value, ids) self._feature.replace(ids, value, None, self._offset) + def is_pinned(self): + """Returns True if the cache storage is pinned.""" + return self._feature.is_pinned() + + @property + def cache_size_in_bytes(self): + """Return the size taken by the cache in bytes.""" + return self._feature.max_size_in_bytes + @property def miss_rate(self): """Returns the cache miss rate since creation.""" return self._feature.miss_rate + + +def cpu_cached_feature( + fallback_features: Union[Feature, Dict[FeatureKey, Feature]], + max_cache_size_in_bytes: int, + policy: Optional[str] = None, + pin_memory: bool = False, +) -> Union[CPUCachedFeature, Dict[FeatureKey, CPUCachedFeature]]: + r"""CPU cached feature wrapping a fallback feature. + + Parameters + ---------- + fallback_features : Union[Feature, Dict[FeatureKey, Feature]] + The fallback feature(s). + max_cache_size_in_bytes : int + The capacity of the cache in bytes. 
The size should be a few factors + larger than the size of each read request. Otherwise, the caching policy + will hang due to all cache entries being read and/or write locked, + resulting in a deadlock. + policy : str, optional + The cache eviction policy algorithm name. The available policies are + ["s3-fifo", "sieve", "lru", "clock"]. Default is "sieve". + pin_memory : bool, optional + Whether the cache storage should be allocated on system pinned memory. + Default is False. + Returns + ------- + Union[CPUCachedFeature, Dict[FeatureKey, CPUCachedFeature]] + New feature(s) wrapped with CPUCachedFeature. + """ + return wrap_with_cached_feature( + CPUCachedFeature, + fallback_features, + max_cache_size_in_bytes, + policy=policy, + pin_memory=pin_memory, + ) diff --git a/python/dgl/graphbolt/impl/gpu_cached_feature.py b/python/dgl/graphbolt/impl/gpu_cached_feature.py index c6903e208698..c676d890f118 100644 --- a/python/dgl/graphbolt/impl/gpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/gpu_cached_feature.py @@ -1,24 +1,24 @@ """GPU cached feature for GraphBolt.""" +from typing import Dict, Union import torch -from ..feature_store import Feature +from ..feature_store import ( + bytes_to_number_of_items, + Feature, + FeatureKey, + wrap_with_cached_feature, +) from .gpu_feature_cache import GPUFeatureCache -__all__ = ["GPUCachedFeature"] - - -def num_cache_items(cache_capacity_in_bytes, single_item): - """Returns the number of rows to be cached.""" - item_bytes = single_item.nbytes - # Round up so that we never get a size of 0, unless bytes is 0. - return (cache_capacity_in_bytes + item_bytes - 1) // item_bytes +__all__ = ["GPUCachedFeature", "gpu_cached_feature"] class GPUCachedFeature(Feature): r"""GPU cached feature wrapping a fallback feature. It uses the least - recently used (LRU) algorithm as the cache eviction policy. + recently used (LRU) algorithm as the cache eviction policy. Use + `gpu_feature_cache` to construct an instance of this class. Places the GPU cache to torch.cuda.current_device(). @@ -26,8 +26,12 @@ class GPUCachedFeature(Feature): ---------- fallback_feature : Feature The fallback feature. - max_cache_size_in_bytes : int - The capacity of the GPU cache in bytes. + cache : GPUFeatureCache + A GPUFeatureCache instance to serve as the cache backend. + offset : int, optional + The offset value to add to the given ids before using the cache. This + parameter is useful if multiple `GPUCachedFeature`s are sharing a single + GPUFeatureCache object. Examples -------- @@ -36,7 +40,7 @@ class GPUCachedFeature(Feature): >>> torch_feat = torch.arange(10).reshape(2, -1).to("cuda") >>> cache_size = 5 >>> fallback_feature = gb.TorchBasedFeature(torch_feat) - >>> feature = gb.GPUCachedFeature(fallback_feature, cache_size) + >>> feature = gb.gpu_cached_feature(fallback_feature, cache_size) >>> feature.read() tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]], device='cuda:0') @@ -51,21 +55,22 @@ class GPUCachedFeature(Feature): torch.Size([5]) """ - def __init__(self, fallback_feature: Feature, max_cache_size_in_bytes: int): + _cache_type = GPUFeatureCache + + def __init__( + self, + fallback_feature: Feature, + cache: GPUFeatureCache, + offset: int = 0, + ): super(GPUCachedFeature, self).__init__() assert isinstance(fallback_feature, Feature), ( f"The fallback_feature must be an instance of Feature, but got " f"{type(fallback_feature)}." 
) self._fallback_feature = fallback_feature - self.max_cache_size_in_bytes = max_cache_size_in_bytes - # Fetching the feature dimension from the underlying feature. - feat0 = fallback_feature.read(torch.tensor([0])) - cache_size = num_cache_items(max_cache_size_in_bytes, feat0) - self._feature = GPUFeatureCache( - (cache_size,) + feat0.shape[1:], feat0.dtype - ) - self._offset = 0 + self._feature = cache + self._offset = offset def read(self, ids: torch.Tensor = None): """Read the feature by index. @@ -231,18 +236,48 @@ def update(self, value: torch.Tensor, ids: torch.Tensor = None): feat0 = value[:1] self._fallback_feature.update(value) cache_size = min( - num_cache_items(self.max_cache_size_in_bytes, feat0), + bytes_to_number_of_items(self.cache_size_in_bytes, feat0), value.shape[0], ) self._feature = None # Destroy the existing cache first. - self._feature = GPUFeatureCache( + self._feature = self._cache_type( (cache_size,) + feat0.shape[1:], feat0.dtype ) else: self._fallback_feature.update(value, ids) self._feature.replace(ids, value) + @property + def cache_size_in_bytes(self): + """Return the size taken by the cache in bytes.""" + return self._feature.max_size_in_bytes + @property def miss_rate(self): """Returns the cache miss rate since creation.""" return self._feature.miss_rate + + +def gpu_cached_feature( + fallback_features: Union[Feature, Dict[FeatureKey, Feature]], + max_cache_size_in_bytes: int, +) -> Union[GPUCachedFeature, Dict[FeatureKey, GPUCachedFeature]]: + r"""GPU cached feature wrapping a fallback feature. It uses the least + recently used (LRU) algorithm as the cache eviction policy. + + Places the GPU cache to torch.cuda.current_device(). + + Parameters + ---------- + fallback_features : Union[Feature, Dict[FeatureKey, Feature]] + The fallback feature(s). + max_cache_size_in_bytes : int + The capacity of the GPU cache in bytes. + Returns + ------- + Union[GPUCachedFeature, Dict[FeatureKey, GPUCachedFeature]] + The feature(s) wrapped with GPUCachedFeature. + """ + return wrap_with_cached_feature( + GPUCachedFeature, fallback_features, max_cache_size_in_bytes + ) diff --git a/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py b/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py index ea8c5e9122a9..f93913c72f71 100644 --- a/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py +++ b/tests/python/pytorch/graphbolt/impl/test_cpu_cached_feature.py @@ -43,10 +43,10 @@ def test_cpu_cached_feature(dtype, policy): cache_size_a *= a[:1].nbytes cache_size_b *= b[:1].nbytes - feat_store_a = gb.CPUCachedFeature( + feat_store_a = gb.cpu_cached_feature( gb.TorchBasedFeature(a), cache_size_a, policy, pin_memory ) - feat_store_b = gb.CPUCachedFeature( + feat_store_b = gb.cpu_cached_feature( gb.TorchBasedFeature(b), cache_size_b, policy, pin_memory ) @@ -129,7 +129,7 @@ def test_cpu_cached_feature_read_async(dtype): cache_size = 256 * a[:1].nbytes - feat_store = gb.CPUCachedFeature(gb.TorchBasedFeature(a), cache_size) + feat_store = gb.cpu_cached_feature(gb.TorchBasedFeature(a), cache_size) # Test read with ids. 
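Since `cpu_cached_feature` and `gpu_cached_feature` also accept a dictionary of fallback features, several features can now share a single cache, each addressed through its own offset, which is why all of them must have the same per-row byte size. A minimal CPU-side sketch with made-up feature keys, mirroring the heterogeneous test added below:

import torch
from dgl import graphbolt as gb

# Two hypothetical feature stores with identical per-row size (5 float32 values).
fallbacks = {
    ("node", "paper", "feat"): gb.TorchBasedFeature(torch.randn(100, 5)),
    ("node", "author", "feat"): gb.TorchBasedFeature(torch.randn(40, 5)),
}

# One 64 KiB cache shared by both features through per-feature offsets.
cached = gb.cpu_cached_feature(fallbacks, 64 * 1024)

ids = torch.tensor([0, 7, 21])
key = ("node", "paper", "feat")
torch.testing.assert_close(cached[key].read(ids), fallbacks[key].read(ids), rtol=0, atol=0)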
ids1 = torch.tensor([0, 15, 71, 101]) @@ -170,7 +170,7 @@ def test_cpu_cached_disk_feature_read_async(dtype): with tempfile.TemporaryDirectory() as test_dir: path = to_on_disk_numpy(test_dir, "tensor", a) - feat_store = gb.CPUCachedFeature( + feat_store = gb.cpu_cached_feature( gb.DiskBasedFeature(path=path), cache_size ) diff --git a/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py b/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py index 4e2e2fabcd91..5d503aabda59 100644 --- a/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py +++ b/tests/python/pytorch/graphbolt/impl/test_gpu_cached_feature.py @@ -48,8 +48,8 @@ def test_gpu_cached_feature(dtype, cache_size_a, cache_size_b): cache_size_a *= a[:1].element_size() * a[:1].numel() cache_size_b *= b[:1].element_size() * b[:1].numel() - feat_store_a = gb.GPUCachedFeature(gb.TorchBasedFeature(a), cache_size_a) - feat_store_b = gb.GPUCachedFeature(gb.TorchBasedFeature(b), cache_size_b) + feat_store_a = gb.gpu_cached_feature(gb.TorchBasedFeature(a), cache_size_a) + feat_store_b = gb.gpu_cached_feature(gb.TorchBasedFeature(b), cache_size_b) # Test read the entire feature. assert torch.equal(feat_store_a.read(), a.to("cuda")) @@ -142,7 +142,7 @@ def test_gpu_cached_feature_read_async(dtype, pin_memory): cache_size = 256 * a[:1].nbytes - feat_store = gb.GPUCachedFeature(gb.TorchBasedFeature(a), cache_size) + feat_store = gb.gpu_cached_feature(gb.TorchBasedFeature(a), cache_size) # Test read with ids. ids1 = torch.tensor([0, 15, 71, 101], device=F.ctx()) @@ -189,12 +189,12 @@ def test_gpu_cached_nested_feature_async(dtype): path = to_on_disk_numpy(test_dir, "tensor", a) disk_store = gb.DiskBasedFeature(path=path) - feat_store1 = gb.GPUCachedFeature(disk_store, cache_size) - feat_store2 = gb.GPUCachedFeature( - gb.CPUCachedFeature(disk_store, cache_size * 2), cache_size + feat_store1 = gb.gpu_cached_feature(disk_store, cache_size) + feat_store2 = gb.gpu_cached_feature( + gb.cpu_cached_feature(disk_store, cache_size * 2), cache_size ) - feat_store3 = gb.GPUCachedFeature( - gb.CPUCachedFeature(disk_store, cache_size * 2, pin_memory=True), + feat_store3 = gb.gpu_cached_feature( + gb.cpu_cached_feature(disk_store, cache_size * 2, pin_memory=True), cache_size, ) diff --git a/tests/python/pytorch/graphbolt/impl/test_hetero_cached_feature.py b/tests/python/pytorch/graphbolt/impl/test_hetero_cached_feature.py new file mode 100644 index 000000000000..620999e2a6a4 --- /dev/null +++ b/tests/python/pytorch/graphbolt/impl/test_hetero_cached_feature.py @@ -0,0 +1,40 @@ +import backend as F + +import pytest +import torch + +from dgl import graphbolt as gb + + +@pytest.mark.parametrize( + "cached_feature_type", [gb.cpu_cached_feature, gb.gpu_cached_feature] +) +def test_hetero_cached_feature(cached_feature_type): + if cached_feature_type == gb.gpu_cached_feature and ( + F._default_context_str != "gpu" + or torch.cuda.get_device_capability()[0] < 7 + ): + pytest.skip( + "GPUCachedFeature requires a Volta or later generation NVIDIA GPU." 
+ ) + device = F.ctx() if cached_feature_type == gb.gpu_cached_feature else None + pin_memory = cached_feature_type == gb.gpu_cached_feature + + a = { + ("node", str(i), "feat"): gb.TorchBasedFeature( + torch.randn([(i + 1) * 10, 5], pin_memory=pin_memory) + ) + for i in range(75) + } + cached_a = cached_feature_type(a, 2**18) + + for i in range(1024): + etype = i % len(a) + ids = torch.randint( + 0, (etype + 1) * 10 - 1, ((etype + 1) * 4,), device=device + ) + feature_key = ("node", str(etype), "feat") + ref = a[feature_key].read(ids) + val = cached_a[feature_key].read(ids) + torch.testing.assert_close(ref, val, rtol=0, atol=0) + assert cached_a[feature_key].miss_rate < 0.69 From 1d378f8f83770ed9d5d7591f5f7fd2a3390ee6f4 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 22 Aug 2024 18:21:42 -0400 Subject: [PATCH 43/78] [GraphBolt][PyG] Heterogenous example. (#7722) --- .../pyg/hetero/node_classification.py | 548 ++++++++++++++++++ 1 file changed, 548 insertions(+) create mode 100644 examples/graphbolt/pyg/hetero/node_classification.py diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py new file mode 100644 index 000000000000..836b907941ef --- /dev/null +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -0,0 +1,548 @@ +""" +This script is a PyG counterpart of ``/examples/graphbolt/rgcn/hetero_rgcn.py``. +""" + +import argparse +import time + +import dgl.graphbolt as gb + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch_geometric.nn import SimpleConv +from tqdm import tqdm + + +def accuracy(out, labels): + assert out.ndim == 2 + assert out.size(0) == labels.size(0) + assert labels.ndim == 1 or (labels.ndim == 2 and labels.size(1) == 1) + labels = labels.flatten() + predictions = torch.argmax(out, 1) + return (labels == predictions).sum(dtype=torch.float64) / labels.size(0) + + +def create_dataloader( + graph, + features, + itemset, + batch_size, + fanout, + device, + job, +): + """Create a GraphBolt dataloader for training, validation or testing.""" + datapipe = gb.ItemSampler( + itemset, + batch_size=batch_size, + shuffle=(job == "train"), + drop_last=(job == "train"), + ) + need_copy = True + # Copy the data to the specified device. + if args.graph_device != "cpu" and need_copy: + datapipe = datapipe.copy_to(device=device) + need_copy = False + # Sample neighbors for each node in the mini-batch. + datapipe = getattr(datapipe, args.sample_mode)( + graph, + fanout if job != "infer" else [-1], + overlap_fetch=args.overlap_graph_fetch, + num_gpu_cached_edges=args.num_gpu_cached_edges, + gpu_cache_threshold=args.gpu_graph_caching_threshold, + asynchronous=args.graph_device != "cpu", + ) + # Copy the data to the specified device. + if args.feature_device != "cpu" and need_copy: + datapipe = datapipe.copy_to(device=device) + need_copy = False + + if args.dataset == "ogb-lsc-mag240m": + node_feature_keys = { + "paper": ["feat"], + "author": ["feat"], + "institution": ["feat"], + } + # Fetch node features for the sampled subgraph. + datapipe = datapipe.fetch_feature(features, node_feature_keys) + + # Copy the data to the specified device. + if need_copy: + datapipe = datapipe.copy_to(device=device) + # Create and return a DataLoader to handle data loading. 
+ return gb.DataLoader(datapipe, num_workers=args.num_workers) + + +def convert_to_pyg(h, subgraph): + ##################################################################### + # (HIGHLIGHT) Convert given features to be consumed by a PyG layer. + # + # We convert the provided sampled edges in CSC format from GraphBolt and + # convert to COO via using gb.expand_indptr. + ##################################################################### + h_dst_dict = {} + edge_index_dict = {} + sizes_dict = {} + for etype, sampled_csc in subgraph.sampled_csc.items(): + src = sampled_csc.indices + dst = gb.expand_indptr( + sampled_csc.indptr, + dtype=src.dtype, + output_size=src.size(0), + ) + edge_index = torch.stack([src, dst], dim=0).long() + dst_size = sampled_csc.indptr.size(0) - 1 + # h and h[:dst_size] correspond to source and destination features resp. + src_ntype, _, dst_ntype = gb.etype_str_to_tuple(etype) + h_dst_dict[dst_ntype] = h[dst_ntype][:dst_size] + edge_index_dict[etype] = edge_index + sizes_dict[etype] = (h[src_ntype].size(0), dst_size) + + return (h, h_dst_dict), edge_index_dict, sizes_dict + + +class RelGraphConvLayer(nn.Module): + def __init__( + self, + in_size, + out_size, + ntypes, + etypes, + activation, + dropout=0.0, + ): + super().__init__() + self.in_size = in_size + self.out_size = out_size + self.activation = activation + + # Create a separate convolution layer for each relationship. PyG's + # SimpleConv does not have any weights and only performs message passing + # and aggregation. + self.convs = nn.ModuleDict( + {etype: SimpleConv(aggr="mean") for etype in etypes} + ) + + # Create a separate Linear layer for each relationship. Each + # relationship has its own weights which will be applied to the node + # features before performing convolution. + self.weight = nn.ModuleDict( + { + etype: nn.Linear(in_size, out_size, bias=False) + for etype in etypes + } + ) + + # Create a separate Linear layer for each node type. + # loop_weights are used to update the output embedding of each target node + # based on its own features, thereby allowing the model to refine the node + # representations. Note that this does not imply the existence of self-loop + # edges in the graph. It is similar to residual connection. + self.loop_weights = nn.ModuleDict( + {ntype: nn.Linear(in_size, out_size, bias=True) for ntype in ntypes} + ) + + self.dropout = nn.Dropout(dropout) + + def forward(self, subgraph, x): + # Create a dictionary of node features for the destination nodes in + # the graph. We slice the node features according to the number of + # destination nodes of each type. This is necessary because when + # incorporating the effect of self-loop edges, we perform computations + # only on the destination nodes' features. By doing so, we ensure the + # feature dimensions match and prevent any misuse of incorrect node + # features. + (h, h_dst), edge_index, size = convert_to_pyg(x, subgraph) + + h_out = {} + for etype in edge_index: + src_ntype, _, dst_ntype = gb.etype_str_to_tuple(etype) + # h_dst is unused in SimpleConv. + t = self.convs[etype]( + (h[src_ntype], h_dst[dst_ntype]), + edge_index[etype], + size=size[etype], + ) + t = self.weight[etype](t) + if dst_ntype in h_out: + h_out[dst_ntype] += t + else: + h_out[dst_ntype] = t + + def _apply(ntype, x): + # Apply the `loop_weight` to the input node features, effectively + # acting as a residual connection. This allows the model to refine + # node embeddings based on its current features. 
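The CSC-to-COO step in `convert_to_pyg` above relies on `gb.expand_indptr`, which emits one destination id per edge from the compressed column pointer. A minimal sketch of the same expansion written with plain `torch.repeat_interleave`, assuming the usual convention that `indptr[i]:indptr[i + 1]` spans the in-edges of destination node `i`:

import torch

# A tiny sampled CSC block: 3 destination nodes with 2, 0 and 1 in-edges.
indptr = torch.tensor([0, 2, 2, 3])
indices = torch.tensor([4, 1, 0])  # one source node id per edge

# Equivalent of gb.expand_indptr(indptr, dtype=indices.dtype, output_size=3):
dst = torch.repeat_interleave(
    torch.arange(indptr.numel() - 1, dtype=indices.dtype), indptr.diff()
)
edge_index = torch.stack([indices, dst], dim=0)  # COO layout expected by PyG
# edge_index == tensor([[4, 1, 0],
#                       [0, 0, 2]])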
+ x = x + self.loop_weights[ntype](h_dst[ntype]) + return self.dropout(self.activation(x)) + + # Apply the function defined above for each node type. This will update + # the node features using the `loop_weights`, apply the activation + # function and dropout. + return {ntype: _apply(ntype, h) for ntype, h in h_out.items()} + + +class EntityClassify(nn.Module): + def __init__(self, graph, in_size, hidden_size, out_size, n_layers): + super(EntityClassify, self).__init__() + self.layers = nn.ModuleList() + sizes = [in_size] + [hidden_size] * (n_layers - 1) + [out_size] + for i in range(n_layers): + self.layers.append( + RelGraphConvLayer( + sizes[i], + sizes[i + 1], + graph.node_type_to_id.keys(), + graph.edge_type_to_id.keys(), + activation=F.relu if i != n_layers - 1 else lambda x: x, + dropout=0.5, + ) + ) + + def forward(self, subgraphs, h): + for layer, subgraph in zip(self.layers, subgraphs): + h = layer(subgraph, h) + return h + + +@torch.compile +def evaluate_step(minibatch, model): + category = "paper" + node_features = { + ntype: feat.float() + for (ntype, name), feat in minibatch.node_features.items() + if name == "feat" + } + labels = minibatch.labels[category].long() + out = model(minibatch.sampled_subgraphs, node_features)[category] + num_correct = accuracy(out, labels) * labels.size(0) + return num_correct, labels.size(0) + + +@torch.no_grad() +def evaluate( + model, + dataloader, + gpu_cache_miss_rate_fn, + cpu_cache_miss_rate_fn, + device, +): + model.eval() + total_correct = torch.zeros(1, dtype=torch.float64, device=device) + total_samples = 0 + dataloader = tqdm(dataloader, desc="Evaluating") + for step, minibatch in enumerate(dataloader): + num_correct, num_samples = evaluate_step(minibatch, model) + total_correct += num_correct + total_samples += num_samples + if step % 15 == 0: + num_nodes = sum(id.size(0) for id in minibatch.node_ids().values()) + dataloader.set_postfix( + { + "num_nodes": num_nodes, + "gpu_cache_miss": gpu_cache_miss_rate_fn(), + "cpu_cache_miss": cpu_cache_miss_rate_fn(), + } + ) + + return total_correct / total_samples + + +@torch.compile +def train_step(minibatch, optimizer, model, loss_fn): + category = "paper" + node_features = { + ntype: feat.float() + for (ntype, name), feat in minibatch.node_features.items() + if name == "feat" + } + labels = minibatch.labels[category].long() + optimizer.zero_grad() + out = model(minibatch.sampled_subgraphs, node_features)[category] + loss = loss_fn(out, labels) + # https://github.com/pytorch/pytorch/issues/133942 + # num_correct = accuracy(out, labels) * labels.size(0) + num_correct = torch.zeros(1, dtype=torch.float64, device=out.device) + loss.backward() + optimizer.step() + return loss.detach(), num_correct, labels.size(0) + + +def train_helper( + dataloader, + model, + optimizer, + loss_fn, + gpu_cache_miss_rate_fn, + cpu_cache_miss_rate_fn, + device, +): + model.train() + total_loss = torch.zeros(1, device=device) + total_correct = torch.zeros(1, dtype=torch.float64, device=device) + total_samples = 0 + start = time.time() + dataloader = tqdm(dataloader, "Training") + for step, minibatch in enumerate(dataloader): + loss, num_correct, num_samples = train_step( + minibatch, optimizer, model, loss_fn + ) + total_loss += loss * num_samples + total_correct += num_correct + total_samples += num_samples + if step % 15 == 0: + # log every 15 steps for performance. 
+ num_nodes = sum(id.size(0) for id in minibatch.node_ids().values()) + dataloader.set_postfix( + { + "num_nodes": num_nodes, + "gpu_cache_miss": gpu_cache_miss_rate_fn(), + "cpu_cache_miss": cpu_cache_miss_rate_fn(), + } + ) + loss = total_loss / total_samples + acc = total_correct / total_samples + end = time.time() + return loss, acc, end - start + + +def train( + train_dataloader, + valid_dataloader, + model, + gpu_cache_miss_rate_fn, + cpu_cache_miss_rate_fn, + device, +): + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + loss_fn = nn.CrossEntropyLoss() + + for epoch in range(args.epochs): + train_loss, train_acc, duration = train_helper( + train_dataloader, + model, + optimizer, + loss_fn, + gpu_cache_miss_rate_fn, + cpu_cache_miss_rate_fn, + device, + ) + val_acc = evaluate( + model, + valid_dataloader, + gpu_cache_miss_rate_fn, + cpu_cache_miss_rate_fn, + device, + ) + print( + f"Epoch: {epoch:02d}, Loss: {train_loss.item():.4f}, " + f"Approx. Train: {train_acc.item():.4f}, " + f"Approx. Val: {val_acc.item():.4f}, " + f"Time: {duration}s" + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description="GraphBolt PyG R-SAGE") + parser.add_argument( + "--epochs", type=int, default=10, help="Number of training epochs." + ) + parser.add_argument( + "--lr", + type=float, + default=0.001, + help="Learning rate for optimization.", + ) + parser.add_argument("--num-hidden", type=int, default=1024) + parser.add_argument( + "--batch-size", type=int, default=1024, help="Batch size for training." + ) + parser.add_argument("--num_workers", type=int, default=0) + parser.add_argument( + "--dataset", + type=str, + default="ogb-lsc-mag240m", + choices=["ogb-lsc-mag240m"], + help="Dataset name. Possible values: ogb-lsc-mag240m", + ) + parser.add_argument( + "--fanout", + type=str, + default="25,10", + help="Fan-out of neighbor sampling. It is IMPORTANT to keep len(fanout)" + " identical with the number of layers in your model. 
Default: 25,10", + ) + parser.add_argument( + "--mode", + default="pinned-pinned-cuda", + choices=[ + "cpu-cpu-cpu", + "cpu-cpu-cuda", + "cpu-pinned-cuda", + "pinned-pinned-cuda", + "cuda-pinned-cuda", + "cuda-cuda-cuda", + ], + help="Graph storage - feature storage - Train device: 'cpu' for CPU and RAM," + " 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", + ) + parser.add_argument( + "--sample-mode", + default="sample_neighbor", + choices=["sample_neighbor", "sample_layer_neighbor"], + help="The sampling function when doing layerwise sampling.", + ) + parser.add_argument( + "--cpu-feature-cache-policy", + type=str, + default=None, + choices=["s3-fifo", "sieve", "lru", "clock"], + help="The cache policy for the CPU feature cache.", + ) + parser.add_argument( + "--cpu-cache-size", + type=float, + default=0, + help="The capacity of the CPU feature cache in GiB.", + ) + parser.add_argument( + "--gpu-cache-size", + type=float, + default=0, + help="The capacity of the GPU feature cache in GiB.", + ) + parser.add_argument( + "--num-gpu-cached-edges", + type=int, + default=0, + help="The number of edges to be cached from the graph on the GPU.", + ) + parser.add_argument( + "--gpu-graph-caching-threshold", + type=int, + default=1, + help="The number of accesses after which a vertex neighborhood will be cached.", + ) + parser.add_argument("--precision", type=str, default="high") + return parser.parse_args() + + +def main(): + torch.set_float32_matmul_precision(args.precision) + if not torch.cuda.is_available(): + args.mode = "cpu-cpu-cpu" + print(f"Training in {args.mode} mode.") + args.graph_device, args.feature_device, args.device = args.mode.split("-") + args.overlap_feature_fetch = args.feature_device == "pinned" + args.overlap_graph_fetch = args.graph_device == "pinned" + + # Load dataset. + dataset = gb.BuiltinDataset(args.dataset).load() + print("Dataset loaded") + + # Move the dataset to the selected storage. 
+ graph = ( + dataset.graph.pin_memory_() + if args.graph_device == "pinned" + else dataset.graph.to(args.graph_device) + ) + features = ( + dataset.feature.pin_memory_() + if args.feature_device == "pinned" + else dataset.feature.to(args.feature_device) + ) + + train_set = dataset.tasks[0].train_set + valid_set = dataset.tasks[0].validation_set + test_set = dataset.tasks[0].test_set + args.fanout = list(map(int, args.fanout.split(","))) + + num_classes = dataset.tasks[0].metadata["num_classes"] + num_etypes = len(graph.num_edges) + + feats_on_disk = { + k: features[k] + for k in features.keys() + if k[2] == "feat" and isinstance(features[k], gb.DiskBasedFeature) + } + + if args.cpu_cache_size > 0 and len(feats_on_disk) > 0: + cached_features = gb.cpu_cached_feature( + feats_on_disk, + int(args.cpu_cache_size * (2**30)), + args.cpu_feature_cache_policy, + args.feature_device == "pinned", + ) + for k, cpu_cached_feature in cached_features.items(): + features[k] = cpu_cached_feature + cpu_cache_miss_rate_fn = lambda: cpu_cached_feature.miss_rate + else: + cpu_cache_miss_rate_fn = lambda: 1 + + if args.gpu_cache_size > 0 and args.feature_device != "cuda": + feats = {k: features[k] for k in features.keys() if k[2] == "feat"} + cached_features = gb.gpu_cached_feature( + feats, + int(args.gpu_cache_size * (2**30)), + ) + for k, gpu_cached_feature in cached_features.items(): + features[k] = gpu_cached_feature + gpu_cache_miss_rate_fn = lambda: gpu_cached_feature.miss_rate + else: + gpu_cache_miss_rate_fn = lambda: 1 + + train_dataloader, valid_dataloader, test_dataloader = ( + create_dataloader( + graph=graph, + features=features, + itemset=itemset, + batch_size=args.batch_size, + fanout=[ + torch.full((num_etypes,), fanout) for fanout in args.fanout + ], + device=args.device, + job=job, + ) + for itemset, job in zip( + [train_set, valid_set, test_set], ["train", "evaluate", "evaluate"] + ) + ) + + feat_size = features.size("node", "paper", "feat")[0] + hidden_channels = args.num_hidden + + # Initialize the entity classification model. + model = EntityClassify( + graph, feat_size, hidden_channels, num_classes, 3 + ).to(args.device) + + print( + "Number of model parameters: " + f"{sum(p.numel() for p in model.parameters())}" + ) + + train( + train_dataloader, + valid_dataloader, + model, + gpu_cache_miss_rate_fn, + cpu_cache_miss_rate_fn, + args.device, + ) + + # Labels are currently unavailable for mag240M so the test acc will be 0. + print("Testing...") + test_acc = evaluate( + model, + test_dataloader, + gpu_cache_miss_rate_fn, + cpu_cache_miss_rate_fn, + args.device, + ) + print(f"Test accuracy {test_acc.item():.4f}") + + +if __name__ == "__main__": + args = parse_args() + main() From b3eacd22d7c8e772d29d42011d9a569b64a89579 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 22 Aug 2024 18:22:10 -0400 Subject: [PATCH 44/78] [GraphBolt] Refine examples. 
(#7733) --- .../disk_based_feature/node_classification.py | 4 ++-- examples/graphbolt/pyg/labor/node_classification.py | 6 +++--- examples/graphbolt/pyg/node_classification_advanced.py | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/graphbolt/disk_based_feature/node_classification.py b/examples/graphbolt/disk_based_feature/node_classification.py index be477e23c3d7..83b2f4af45d1 100644 --- a/examples/graphbolt/disk_based_feature/node_classification.py +++ b/examples/graphbolt/disk_based_feature/node_classification.py @@ -376,13 +376,13 @@ def parse_args(): "--cpu-cache-size-in-gigabytes", type=float, default=0, - help="The capacity of the CPU cache, the number of features to store.", + help="The capacity of the CPU cache in GiB.", ) parser.add_argument( "--gpu-cache-size-in-gigabytes", type=float, default=0, - help="The capacity of the GPU cache, the number of features to store.", + help="The capacity of the GPU cache in GiB.", ) parser.add_argument("--early-stopping-patience", type=int, default=25) parser.add_argument( diff --git a/examples/graphbolt/pyg/labor/node_classification.py b/examples/graphbolt/pyg/labor/node_classification.py index 8c1cef5f4915..7cf0834b7cc8 100644 --- a/examples/graphbolt/pyg/labor/node_classification.py +++ b/examples/graphbolt/pyg/labor/node_classification.py @@ -333,13 +333,13 @@ def evaluate( model.eval() total_correct = torch.zeros(1, dtype=torch.float64, device=device) total_samples = 0 - val_dataloader_tqdm = tqdm(dataloader, "Evaluating") - for step, minibatch in enumerate(val_dataloader_tqdm): + dataloader = tqdm(dataloader, "Evaluating") + for step, minibatch in enumerate(dataloader): num_correct, num_samples = evaluate_step(minibatch, model, eval_fn) total_correct += num_correct total_samples += num_samples if step % 25 == 0: - val_dataloader_tqdm.set_postfix( + dataloader.set_postfix( { "num_nodes": minibatch.node_ids().size(0), "gpu_cache_miss": gpu_cache_miss_rate_fn(), diff --git a/examples/graphbolt/pyg/node_classification_advanced.py b/examples/graphbolt/pyg/node_classification_advanced.py index 44eea9566573..e55da747db62 100644 --- a/examples/graphbolt/pyg/node_classification_advanced.py +++ b/examples/graphbolt/pyg/node_classification_advanced.py @@ -232,7 +232,7 @@ def train_step(minibatch, optimizer, model, loss_fn): return loss.detach(), num_correct, labels.size(0) -def train_helper(dataloader, model, optimizer, loss_fn, num_classes, device): +def train_helper(dataloader, model, optimizer, loss_fn, device): model.train() # Set the model to training mode total_loss = torch.zeros(1, device=device) # Accumulator for the total loss # Accumulator for the total number of correct predictions @@ -254,7 +254,7 @@ def train_helper(dataloader, model, optimizer, loss_fn, num_classes, device): return train_loss, train_acc, end - start -def train(train_dataloader, valid_dataloader, num_classes, model, device): +def train(train_dataloader, valid_dataloader, model, device): ##################################################################### # (HIGHLIGHT) Train the model for one epoch. 
# @@ -276,7 +276,7 @@ def train(train_dataloader, valid_dataloader, num_classes, model, device): for epoch in range(args.epochs): train_loss, train_acc, duration = train_helper( - train_dataloader, model, optimizer, loss_fn, num_classes, device + train_dataloader, model, optimizer, loss_fn, device ) val_acc = evaluate(model, valid_dataloader, device) print( @@ -363,7 +363,7 @@ def parse_args(): type=str, default="10,10,10", help="Fan-out of neighbor sampling. It is IMPORTANT to keep len(fanout)" - " identical with the number of layers in your model. Default: 5,10,15", + " identical with the number of layers in your model. Default: 10,10,10", ) parser.add_argument( "--mode", @@ -466,7 +466,7 @@ def main(): ).to(args.device) assert len(args.fanout) == len(model.layers) - train(train_dataloader, valid_dataloader, num_classes, model, args.device) + train(train_dataloader, valid_dataloader, model, args.device) # Test the model. print("Testing...") From 37d1064c22c46a9373c265e3cf264bd1b3804049 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 23 Aug 2024 01:02:17 -0400 Subject: [PATCH 45/78] [GraphBolt] `gb.DataLoader` can simply be a datapipe. (#7732) --- python/dgl/graphbolt/dataloader.py | 55 +++-- .../pytorch/graphbolt/test_dataloader.py | 3 +- .../pytorch/graphbolt/test_integration.py | 211 +++++++++--------- 3 files changed, 129 insertions(+), 140 deletions(-) diff --git a/python/dgl/graphbolt/dataloader.py b/python/dgl/graphbolt/dataloader.py index b964f71efe40..10f9eee4c1b9 100644 --- a/python/dgl/graphbolt/dataloader.py +++ b/python/dgl/graphbolt/dataloader.py @@ -4,7 +4,6 @@ import torch.utils.data as torch_data from .base import CopyTo - from .datapipes import ( datapipe_graph_to_adjlist, find_dps, @@ -15,6 +14,7 @@ from .impl.neighbor_sampler import SamplePerLayer from .internal_utils import gb_warning from .item_sampler import ItemSampler +from .minibatch_transformer import MiniBatchTransformer __all__ = [ @@ -75,7 +75,7 @@ def __iter__(self): yield from self.dataloader -class DataLoader(torch_data.DataLoader): +class DataLoader(MiniBatchTransformer): """Multiprocessing DataLoader. Iterates over the data pipeline with everything before feature fetching @@ -122,32 +122,33 @@ def __init__( datapipe = datapipe.mark_end() datapipe_graph = traverse_dps(datapipe) - # (1) Insert minibatch distribution. - # TODO(BarclayII): Currently I'm using sharding_filter() as a - # concept demonstration. Later on minibatch distribution should be - # merged into ItemSampler to maximize efficiency. - item_samplers = find_dps( - datapipe_graph, - ItemSampler, - ) - for item_sampler in item_samplers: - datapipe_graph = replace_dp( + if num_workers > 0: + # (1) Insert minibatch distribution. + # TODO(BarclayII): Currently I'm using sharding_filter() as a + # concept demonstration. Later on minibatch distribution should be + # merged into ItemSampler to maximize efficiency. + item_samplers = find_dps( datapipe_graph, - item_sampler, - item_sampler.sharding_filter(), + ItemSampler, + ) + for item_sampler in item_samplers: + datapipe_graph = replace_dp( + datapipe_graph, + item_sampler, + item_sampler.sharding_filter(), + ) + + # (2) Cut datapipe at FeatureFetcher and wrap. + datapipe_graph = _find_and_wrap_parent( + datapipe_graph, + FeatureFetcherStartMarker, + MultiprocessingWrapper, + num_workers=num_workers, + persistent_workers=persistent_workers, ) - # (2) Cut datapipe at FeatureFetcher and wrap. 
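Because `DataLoader` now derives from `MiniBatchTransformer`, the loader is itself the final datapipe of the pipeline rather than a `torch.utils.data.DataLoader` wrapped around it, so datapipe utilities can take the loader directly, as the updated test below does with `traverse_dps(dataloader)`. A toy sketch, under the assumption that an `ItemSampler`-only pipeline is acceptable input to the loader:

import torch
from dgl import graphbolt as gb
from dgl.graphbolt.datapipes import find_dps, traverse_dps

itemset = gb.ItemSet(torch.arange(8), names="seeds")
dataloader = gb.DataLoader(gb.ItemSampler(itemset, batch_size=4), num_workers=0)

# The loader is a datapipe, so its stage graph can be traversed directly
# instead of going through dataloader.dataset as before.
graph = traverse_dps(dataloader)
samplers = find_dps(graph, gb.ItemSampler)  # the ItemSampler stage is reachable

for minibatch in dataloader:  # iteration itself is unchanged
    print(minibatch.seeds)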
- datapipe_graph = _find_and_wrap_parent( - datapipe_graph, - FeatureFetcherStartMarker, - MultiprocessingWrapper, - num_workers=num_workers, - persistent_workers=persistent_workers, - ) - - # (3) Limit the number of UVA threads used if the feature_fetcher has - # overlapping optimization enabled. + # (3) Limit the number of UVA threads used if the feature_fetcher + # or any of the samplers have overlapping optimization enabled. if num_workers == 0 and torch.cuda.is_available(): feature_fetchers = find_dps( datapipe_graph, @@ -187,6 +188,4 @@ def __init__( ), ) - # The stages after feature fetching is still done in the main process. - # So we set num_workers to 0 here. - super().__init__(datapipe, batch_size=None, num_workers=0) + super().__init__(datapipe) diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index 92c670055361..47e2e7062038 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -138,8 +138,7 @@ def test_gpu_sampling_DataLoader( bufferer_cnt += 2 * num_layers if asynchronous: bufferer_cnt += 2 * num_layers - datapipe = dataloader.dataset - datapipe_graph = traverse_dps(datapipe) + datapipe_graph = traverse_dps(dataloader) bufferers = find_dps( datapipe_graph, dgl.graphbolt.Bufferer, diff --git a/tests/python/pytorch/graphbolt/test_integration.py b/tests/python/pytorch/graphbolt/test_integration.py index dca774378d47..228c1298896d 100644 --- a/tests/python/pytorch/graphbolt/test_integration.py +++ b/tests/python/pytorch/graphbolt/test_integration.py @@ -64,55 +64,54 @@ def test_integration_link_prediction(): [3, 2], [3, 2], [3, 3], - [5, 0], - [5, 0], + [5, 2], + [5, 1], + [3, 4], [3, 3], - [3, 0], [3, 5], - [3, 3], - [3, 3], + [3, 2], + [3, 0], [3, 4]]), - sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1, 2, 2, 2, 3], dtype=torch.int32), - indices=tensor([0, 5, 4], dtype=torch.int32), + sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1, 1, 1, 2, 2], dtype=torch.int32), + indices=tensor([4, 5], dtype=torch.int32), ), - original_row_node_ids=tensor([5, 1, 3, 2, 0, 4]), - original_edge_ids=tensor([8, 5, 7]), - original_column_node_ids=tensor([5, 1, 3, 2, 0, 4]), + original_row_node_ids=tensor([5, 1, 3, 2, 4, 0]), + original_edge_ids=tensor([9, 7]), + original_column_node_ids=tensor([5, 1, 3, 2, 4, 0]), ), - SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1, 1, 1, 1, 2], dtype=torch.int32), - indices=tensor([5, 4], dtype=torch.int32), + SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1, 1, 1, 2, 2], dtype=torch.int32), + indices=tensor([0, 5], dtype=torch.int32), ), - original_row_node_ids=tensor([5, 1, 3, 2, 0, 4]), - original_edge_ids=tensor([9, 7]), - original_column_node_ids=tensor([5, 1, 3, 2, 0, 4]), + original_row_node_ids=tensor([5, 1, 3, 2, 4, 0]), + original_edge_ids=tensor([8, 7]), + original_column_node_ids=tensor([5, 1, 3, 2, 4, 0]), )], node_features={'feat': tensor([[0.5160, 0.2486], [0.6172, 0.7865], [0.8672, 0.2276], [0.2109, 0.1089], - [0.9634, 0.2294], - [0.5503, 0.8223]])}, + [0.5503, 0.8223], + [0.9634, 0.2294]])}, labels=tensor([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.]), - input_nodes=tensor([5, 1, 3, 2, 0, 4]), + input_nodes=tensor([5, 1, 3, 2, 4, 0]), indexes=tensor([0, 1, 2, 3, 0, 0, 1, 1, 2, 2, 3, 3]), - edge_features=[{'feat': tensor([[0.8972, 0.7511, 0.3617], - [0.7885, 0.3414, 0.5485], + 
edge_features=[{'feat': tensor([[0.5773, 0.2199, 0.3366], [0.0056, 0.9469, 0.4432]])}, - {'feat': tensor([[0.5773, 0.2199, 0.3366], + {'feat': tensor([[0.8972, 0.7511, 0.3617], [0.0056, 0.9469, 0.4432]])}], compacted_seeds=tensor([[0, 1], [2, 3], [2, 3], [2, 2], - [0, 4], - [0, 4], - [2, 2], + [0, 3], + [0, 1], [2, 4], - [2, 0], - [2, 2], [2, 2], - [2, 5]]), - blocks=[Block(num_src_nodes=6, num_dst_nodes=6, num_edges=3), + [2, 0], + [2, 3], + [2, 5], + [2, 4]]), + blocks=[Block(num_src_nodes=6, num_dst_nodes=6, num_edges=2), Block(num_src_nodes=6, num_dst_nodes=6, num_edges=2)], )""" ), @@ -121,103 +120,97 @@ def test_integration_link_prediction(): [4, 3], [4, 4], [0, 4], - [3, 1], + [3, 4], [3, 5], - [4, 2], - [4, 5], + [4, 1], [4, 4], - [4, 3], + [4, 4], + [4, 5], [0, 1], - [0, 5]]), - sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 1, 2], dtype=torch.int32), - indices=tensor([4, 0], dtype=torch.int32), + [0, 3]]), + sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 0, 1], dtype=torch.int32), + indices=tensor([3], dtype=torch.int32), ), - original_row_node_ids=tensor([3, 4, 0, 1, 5, 2]), - original_edge_ids=tensor([0, 1]), - original_column_node_ids=tensor([3, 4, 0, 1, 5, 2]), + original_row_node_ids=tensor([3, 4, 0, 5, 1]), + original_edge_ids=tensor([0]), + original_column_node_ids=tensor([3, 4, 0, 5, 1]), ), - SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 2, 3], dtype=torch.int32), - indices=tensor([4, 4, 0], dtype=torch.int32), + SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 0, 0, 1, 2], dtype=torch.int32), + indices=tensor([3, 3], dtype=torch.int32), ), - original_row_node_ids=tensor([3, 4, 0, 1, 5, 2]), - original_edge_ids=tensor([0, 8, 1]), - original_column_node_ids=tensor([3, 4, 0, 1, 5, 2]), + original_row_node_ids=tensor([3, 4, 0, 5, 1]), + original_edge_ids=tensor([8, 0]), + original_column_node_ids=tensor([3, 4, 0, 5, 1]), )], node_features={'feat': tensor([[0.8672, 0.2276], [0.5503, 0.8223], [0.9634, 0.2294], - [0.6172, 0.7865], [0.5160, 0.2486], - [0.2109, 0.1089]])}, + [0.6172, 0.7865]])}, labels=tensor([1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.]), - input_nodes=tensor([3, 4, 0, 1, 5, 2]), + input_nodes=tensor([3, 4, 0, 5, 1]), indexes=tensor([0, 1, 2, 3, 0, 0, 1, 1, 2, 2, 3, 3]), - edge_features=[{'feat': tensor([[0.5123, 0.1709, 0.6150], - [0.1476, 0.1902, 0.1314]])}, - {'feat': tensor([[0.5123, 0.1709, 0.6150], - [0.8972, 0.7511, 0.3617], - [0.1476, 0.1902, 0.1314]])}], + edge_features=[{'feat': tensor([[0.5123, 0.1709, 0.6150]])}, + {'feat': tensor([[0.8972, 0.7511, 0.3617], + [0.5123, 0.1709, 0.6150]])}], compacted_seeds=tensor([[0, 0], [1, 0], [1, 1], [2, 1], + [0, 1], [0, 3], - [0, 4], - [1, 5], [1, 4], [1, 1], - [1, 0], - [2, 3], - [2, 4]]), - blocks=[Block(num_src_nodes=6, num_dst_nodes=6, num_edges=2), - Block(num_src_nodes=6, num_dst_nodes=6, num_edges=3)], + [1, 1], + [1, 3], + [2, 4], + [2, 0]]), + blocks=[Block(num_src_nodes=5, num_dst_nodes=5, num_edges=1), + Block(num_src_nodes=5, num_dst_nodes=5, num_edges=2)], )""" ), str( """MiniBatch(seeds=tensor([[5, 5], [4, 5], - [5, 0], - [5, 4], + [5, 5], + [5, 5], [4, 0], - [4, 1]]), - sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1, 1, 2], dtype=torch.int32), - indices=tensor([1, 0], dtype=torch.int32), + [4, 0]]), + sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1, 1], dtype=torch.int32), + 
indices=tensor([1], dtype=torch.int32), ), - original_row_node_ids=tensor([5, 4, 0, 1]), - original_edge_ids=tensor([6, 0]), - original_column_node_ids=tensor([5, 4, 0, 1]), + original_row_node_ids=tensor([5, 4, 0]), + original_edge_ids=tensor([6]), + original_column_node_ids=tensor([5, 4, 0]), ), - SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1, 1, 2], dtype=torch.int32), - indices=tensor([1, 0], dtype=torch.int32), + SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 0, 1, 1], dtype=torch.int32), + indices=tensor([2], dtype=torch.int32), ), - original_row_node_ids=tensor([5, 4, 0, 1]), - original_edge_ids=tensor([6, 0]), - original_column_node_ids=tensor([5, 4, 0, 1]), + original_row_node_ids=tensor([5, 4, 0]), + original_edge_ids=tensor([7]), + original_column_node_ids=tensor([5, 4, 0]), )], node_features={'feat': tensor([[0.5160, 0.2486], [0.5503, 0.8223], - [0.9634, 0.2294], - [0.6172, 0.7865]])}, + [0.9634, 0.2294]])}, labels=tensor([1., 1., 0., 0., 0., 0.]), - input_nodes=tensor([5, 4, 0, 1]), + input_nodes=tensor([5, 4, 0]), indexes=tensor([0, 1, 0, 0, 1, 1]), - edge_features=[{'feat': tensor([[0.4088, 0.8200, 0.1851], - [0.5123, 0.1709, 0.6150]])}, - {'feat': tensor([[0.4088, 0.8200, 0.1851], - [0.5123, 0.1709, 0.6150]])}], + edge_features=[{'feat': tensor([[0.4088, 0.8200, 0.1851]])}, + {'feat': tensor([[0.0056, 0.9469, 0.4432]])}], compacted_seeds=tensor([[0, 0], [1, 0], - [0, 2], - [0, 1], + [0, 0], + [0, 0], [1, 2], - [1, 3]]), - blocks=[Block(num_src_nodes=4, num_dst_nodes=4, num_edges=2), - Block(num_src_nodes=4, num_dst_nodes=4, num_edges=2)], + [1, 2]]), + blocks=[Block(num_src_nodes=3, num_dst_nodes=3, num_edges=1), + Block(num_src_nodes=3, num_dst_nodes=3, num_edges=1)], )""" ), ] for step, data in enumerate(dataloader): - assert expected[step] == str(data), print(data) + assert expected[step] == str(data), print(step, data) def test_integration_node_classification(): @@ -275,10 +268,10 @@ def test_integration_node_classification(): str( """MiniBatch(seeds=tensor([5, 1]), sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 2], dtype=torch.int32), - indices=tensor([2, 0], dtype=torch.int32), + indices=tensor([0, 0], dtype=torch.int32), ), - original_row_node_ids=tensor([5, 1, 4]), - original_edge_ids=tensor([9, 0]), + original_row_node_ids=tensor([5, 1]), + original_edge_ids=tensor([8, 0]), original_column_node_ids=tensor([5, 1]), ), SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 2], dtype=torch.int32), @@ -289,51 +282,49 @@ def test_integration_node_classification(): original_column_node_ids=tensor([5, 1]), )], node_features={'feat': tensor([[0.5160, 0.2486], - [0.6172, 0.7865], - [0.5503, 0.8223]])}, + [0.6172, 0.7865]])}, labels=None, - input_nodes=tensor([5, 1, 4]), + input_nodes=tensor([5, 1]), indexes=None, - edge_features=[{'feat': tensor([[0.5773, 0.2199, 0.3366], + edge_features=[{'feat': tensor([[0.8972, 0.7511, 0.3617], [0.5123, 0.1709, 0.6150]])}, {'feat': tensor([[0.8972, 0.7511, 0.3617], [0.5123, 0.1709, 0.6150]])}], compacted_seeds=None, - blocks=[Block(num_src_nodes=3, num_dst_nodes=2, num_edges=2), + blocks=[Block(num_src_nodes=2, num_dst_nodes=2, num_edges=2), Block(num_src_nodes=2, num_dst_nodes=2, num_edges=2)], )""" ), str( """MiniBatch(seeds=tensor([2, 4]), - sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 2, 3, 3], dtype=torch.int32), + sampled_subgraphs=[SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 2, 
3], dtype=torch.int32), indices=tensor([2, 1, 2], dtype=torch.int32), ), - original_row_node_ids=tensor([2, 4, 3, 0]), - original_edge_ids=tensor([2, 6, 3]), - original_column_node_ids=tensor([2, 4, 3, 0]), + original_row_node_ids=tensor([2, 4, 3]), + original_edge_ids=tensor([1, 6, 3]), + original_column_node_ids=tensor([2, 4, 3]), ), SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 2], dtype=torch.int32), - indices=tensor([2, 3], dtype=torch.int32), + indices=tensor([2, 1], dtype=torch.int32), ), - original_row_node_ids=tensor([2, 4, 3, 0]), - original_edge_ids=tensor([2, 7]), + original_row_node_ids=tensor([2, 4, 3]), + original_edge_ids=tensor([2, 6]), original_column_node_ids=tensor([2, 4]), )], node_features={'feat': tensor([[0.2109, 0.1089], [0.5503, 0.8223], - [0.8672, 0.2276], - [0.9634, 0.2294]])}, + [0.8672, 0.2276]])}, labels=None, - input_nodes=tensor([2, 4, 3, 0]), + input_nodes=tensor([2, 4, 3]), indexes=None, - edge_features=[{'feat': tensor([[0.2582, 0.5203, 0.6228], + edge_features=[{'feat': tensor([[0.1476, 0.1902, 0.1314], [0.4088, 0.8200, 0.1851], [0.3708, 0.7631, 0.2683]])}, {'feat': tensor([[0.2582, 0.5203, 0.6228], - [0.0056, 0.9469, 0.4432]])}], + [0.4088, 0.8200, 0.1851]])}], compacted_seeds=None, - blocks=[Block(num_src_nodes=4, num_dst_nodes=4, num_edges=3), - Block(num_src_nodes=4, num_dst_nodes=2, num_edges=2)], + blocks=[Block(num_src_nodes=3, num_dst_nodes=3, num_edges=3), + Block(num_src_nodes=3, num_dst_nodes=2, num_edges=2)], )""" ), str( @@ -342,14 +333,14 @@ def test_integration_node_classification(): indices=tensor([0], dtype=torch.int32), ), original_row_node_ids=tensor([3, 0]), - original_edge_ids=tensor([4]), + original_edge_ids=tensor([3]), original_column_node_ids=tensor([3, 0]), ), SampledSubgraphImpl(sampled_csc=CSCFormatBase(indptr=tensor([0, 1, 1], dtype=torch.int32), indices=tensor([0], dtype=torch.int32), ), original_row_node_ids=tensor([3, 0]), - original_edge_ids=tensor([4]), + original_edge_ids=tensor([3]), original_column_node_ids=tensor([3, 0]), )], node_features={'feat': tensor([[0.8672, 0.2276], @@ -357,8 +348,8 @@ def test_integration_node_classification(): labels=None, input_nodes=tensor([3, 0]), indexes=None, - edge_features=[{'feat': tensor([[0.2126, 0.7878, 0.7225]])}, - {'feat': tensor([[0.2126, 0.7878, 0.7225]])}], + edge_features=[{'feat': tensor([[0.3708, 0.7631, 0.2683]])}, + {'feat': tensor([[0.3708, 0.7631, 0.2683]])}], compacted_seeds=None, blocks=[Block(num_src_nodes=2, num_dst_nodes=2, num_edges=1), Block(num_src_nodes=2, num_dst_nodes=2, num_edges=1)], From f37f24c77c0e8c7e1792af40d7e0245f36395d31 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 23 Aug 2024 14:43:01 -0400 Subject: [PATCH 46/78] [GraphBolt] Do not expose `CPUFeatureCache` storage tensor. (#7735) --- graphbolt/src/feature_cache.h | 5 +++++ graphbolt/src/python_binding.cc | 3 ++- python/dgl/graphbolt/impl/cpu_feature_cache.py | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/graphbolt/src/feature_cache.h b/graphbolt/src/feature_cache.h index 43e98c1c615f..882e6f9b7360 100644 --- a/graphbolt/src/feature_cache.h +++ b/graphbolt/src/feature_cache.h @@ -41,6 +41,10 @@ struct FeatureCache : public torch::CustomClassHolder { const std::vector& shape, torch::ScalarType dtype, bool pin_memory); + bool IsPinned() const { return tensor_.is_pinned(); } + + int64_t NumBytes() const { return tensor_.numel() * tensor_.element_size(); } + /** * @brief The cache query function. 
Allocates an empty tensor `values` with * size as the first dimension and runs @@ -87,6 +91,7 @@ struct FeatureCache : public torch::CustomClassHolder { const std::vector& shape, torch::ScalarType dtype, bool pin_memory); + private: torch::Tensor tensor_; }; diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index 85bf17ed7024..4df395b0f904 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -167,7 +167,8 @@ TORCH_LIBRARY(graphbolt, m) { "clock_cache_policy", &storage::PartitionedCachePolicy::Create); m.class_("FeatureCache") - .def_readonly("tensor", &storage::FeatureCache::tensor_) + .def("is_pinned", &storage::FeatureCache::IsPinned) + .def_property("nbytes", &storage::FeatureCache::NumBytes) .def("index_select", &storage::FeatureCache::IndexSelect) .def("query", &storage::FeatureCache::Query) .def("query_async", &storage::FeatureCache::QueryAsync) diff --git a/python/dgl/graphbolt/impl/cpu_feature_cache.py b/python/dgl/graphbolt/impl/cpu_feature_cache.py index 74e054033bb0..1bcc234526e8 100644 --- a/python/dgl/graphbolt/impl/cpu_feature_cache.py +++ b/python/dgl/graphbolt/impl/cpu_feature_cache.py @@ -61,12 +61,12 @@ def __init__( def is_pinned(self): """Returns True if the cache storage is pinned.""" - return self._cache.tensor.is_pinned() + return self._cache.is_pinned() @property def max_size_in_bytes(self): """Return the size taken by the cache in bytes.""" - return self._cache.tensor.nbytes + return self._cache.nbytes def query(self, keys, offset=0): """Queries the cache. From 9514e7b9cdc9177d411d972a5a9ae5263b8f3de0 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 23 Aug 2024 14:55:14 -0400 Subject: [PATCH 47/78] [GraphBolt] Minor improvement to item sampler and cache policy. (#7734) --- graphbolt/src/cache_policy.h | 2 +- python/dgl/graphbolt/item_sampler.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/graphbolt/src/cache_policy.h b/graphbolt/src/cache_policy.h index 6baa2ddf068b..458b56b1b5f7 100644 --- a/graphbolt/src/cache_policy.h +++ b/graphbolt/src/cache_policy.h @@ -238,7 +238,7 @@ class BaseCachePolicy { // Move the element to the beginning of the queue. to.splice(to.begin(), temp); // The iterators and references are not invalidated. - // TORCH_CHECK(it == to.begin()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(it == to.begin()); } int64_t capacity_; diff --git a/python/dgl/graphbolt/item_sampler.py b/python/dgl/graphbolt/item_sampler.py index 3f2efa920c01..1ce1372b2dee 100644 --- a/python/dgl/graphbolt/item_sampler.py +++ b/python/dgl/graphbolt/item_sampler.py @@ -330,12 +330,11 @@ def __iter__(self) -> Iterator: self._drop_uneven_inputs, ) if self._shuffle: - g = torch.Generator() - g.manual_seed(self._seed + self._epoch) + g = torch.Generator().manual_seed(self._seed + self._epoch) permutation = torch.randperm(total, generator=g) + indices = permutation[start_offset : start_offset + assigned_count] else: - permutation = torch.arange(total) - indices = permutation[start_offset : start_offset + assigned_count] + indices = torch.arange(start_offset, start_offset + assigned_count) for i in range(0, assigned_count, self._batch_size): if output_count <= 0: break From ea940f72b5f1005d768707ecbc79354513deffae Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 23 Aug 2024 15:43:12 -0400 Subject: [PATCH 48/78] [GraphBolt][CUDA] Destroy dist group at the end of script. 
(#7736) --- examples/multigpu/graphbolt/node_classification.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/multigpu/graphbolt/node_classification.py b/examples/multigpu/graphbolt/node_classification.py index f4d1e40f0d6a..d074e1feddcd 100644 --- a/examples/multigpu/graphbolt/node_classification.py +++ b/examples/multigpu/graphbolt/node_classification.py @@ -364,6 +364,7 @@ def run(rank, world_size, args, devices, dataset): if rank == 0: print(f"Test Accuracy {test_acc.item():.4f}") + dist.destroy_process_group() def parse_args(): From d68029a60544e4e1cd72adb8b4436a909e2043c6 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sun, 25 Aug 2024 15:54:52 -0400 Subject: [PATCH 49/78] [GraphBolt][Doc] Add `numpy_save_aligned` to documentation. (#7739) --- docs/source/api/python/dgl.graphbolt.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api/python/dgl.graphbolt.rst b/docs/source/api/python/dgl.graphbolt.rst index 98e9cc71552d..72315cdc0aa6 100644 --- a/docs/source/api/python/dgl.graphbolt.rst +++ b/docs/source/api/python/dgl.graphbolt.rst @@ -200,4 +200,4 @@ Utilities compact_csc_format unique_and_compact unique_and_compact_csc_formats - + numpy_save_aligned From ffe3bb22e85b06dded4e44a155b63b19d4f73026 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sun, 25 Aug 2024 17:05:29 -0400 Subject: [PATCH 50/78] [GraphBolt][Doc] Fix `read_async` doc display issue. (#7741) --- python/dgl/graphbolt/impl/cpu_cached_feature.py | 2 +- python/dgl/graphbolt/impl/gpu_cached_feature.py | 2 +- python/dgl/graphbolt/impl/torch_based_feature_store.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/dgl/graphbolt/impl/cpu_cached_feature.py b/python/dgl/graphbolt/impl/cpu_cached_feature.py index 80fd2974671e..184c382fc3c7 100644 --- a/python/dgl/graphbolt/impl/cpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/cpu_cached_feature.py @@ -81,7 +81,7 @@ def read_async(self, ids: torch.Tensor): ------- A generator object. The returned generator object returns a future on - ``read_async_num_stages(ids.device)``th invocation. The return result + ``read_async_num_stages(ids.device)``\ th invocation. The return result can be accessed by calling ``.wait()``. on the returned future object. It is undefined behavior to call ``.wait()`` more than once. diff --git a/python/dgl/graphbolt/impl/gpu_cached_feature.py b/python/dgl/graphbolt/impl/gpu_cached_feature.py index c676d890f118..7c7b01cb38ef 100644 --- a/python/dgl/graphbolt/impl/gpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/gpu_cached_feature.py @@ -113,7 +113,7 @@ def read_async(self, ids: torch.Tensor): ------- A generator object. The returned generator object returns a future on - ``read_async_num_stages(ids.device)``th invocation. The return result + ``read_async_num_stages(ids.device)``\ th invocation. The return result can be accessed by calling ``.wait()``. on the returned future object. It is undefined behavior to call ``.wait()`` more than once. diff --git a/python/dgl/graphbolt/impl/torch_based_feature_store.py b/python/dgl/graphbolt/impl/torch_based_feature_store.py index 42d5e7a859f1..c1739415f3e2 100644 --- a/python/dgl/graphbolt/impl/torch_based_feature_store.py +++ b/python/dgl/graphbolt/impl/torch_based_feature_store.py @@ -151,7 +151,7 @@ def read_async(self, ids: torch.Tensor): ------- A generator object. The returned generator object returns a future on - ``read_async_num_stages(ids.device)``th invocation. 
The return result + ``read_async_num_stages(ids.device)``\ th invocation. The return result can be accessed by calling ``.wait()``. on the returned future object. It is undefined behavior to call ``.wait()`` more than once. @@ -434,7 +434,7 @@ def read_async(self, ids: torch.Tensor): ------- A generator object. The returned generator object returns a future on - ``read_async_num_stages(ids.device)``th invocation. The return result + ``read_async_num_stages(ids.device)``\ th invocation. The return result can be accessed by calling ``.wait()``. on the returned future object. It is undefined behavior to call ``.wait()`` more than once. From 033100932d9f92e4d3edaf581afb369a47085538 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 26 Aug 2024 01:26:26 -0400 Subject: [PATCH 51/78] [GraphBolt][Doc] Fix `[gpu|cpu]_cached_feature` rendering. (#7743) --- docs/source/api/python/dgl.graphbolt.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/api/python/dgl.graphbolt.rst b/docs/source/api/python/dgl.graphbolt.rst index 72315cdc0aa6..c13f78606c80 100644 --- a/docs/source/api/python/dgl.graphbolt.rst +++ b/docs/source/api/python/dgl.graphbolt.rst @@ -56,9 +56,7 @@ collection of features. TorchBasedFeature TorchBasedFeatureStore DiskBasedFeature - cpu_cached_feature CPUCachedFeature - gpu_cached_feature GPUCachedFeature @@ -186,6 +184,8 @@ Utilities :toctree: ../../generated/ :nosignatures: + cpu_cached_feature + gpu_cached_feature fused_csc_sampling_graph load_from_shared_memory from_dglgraph @@ -195,6 +195,7 @@ Utilities seed index_select expand_indptr + indptr_edge_ids add_reverse_edges exclude_seed_edges compact_csc_format From 8eccbfa283cf45b5f026ba8213848575d34ab54d Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 26 Aug 2024 18:53:51 -0400 Subject: [PATCH 52/78] [GraphBolt][PyG] Add `to_pyg` for layer input conversion. (#7745) --- python/dgl/graphbolt/sampled_subgraph.py | 80 +++++++++++++- .../impl/test_sampled_subgraph_impl.py | 100 ++++++++++++++++++ 2 files changed, 179 insertions(+), 1 deletion(-) diff --git a/python/dgl/graphbolt/sampled_subgraph.py b/python/dgl/graphbolt/sampled_subgraph.py index d46535115170..8bff77de90b3 100644 --- a/python/dgl/graphbolt/sampled_subgraph.py +++ b/python/dgl/graphbolt/sampled_subgraph.py @@ -1,7 +1,7 @@ """Graphbolt sampled subgraph.""" # pylint: disable= invalid-name -from typing import Dict, Tuple, Union +from typing import Dict, NamedTuple, Tuple, Union import torch @@ -20,6 +20,28 @@ __all__ = ["SampledSubgraph"] +class PyGLayerData(NamedTuple): + """A named tuple class to represent homogenous inputs to a PyG model layer. + The fields are x (input features), edge_index and size + (source and destination sizes). + """ + + x: torch.Tensor + edge_index: torch.Tensor + size: Tuple[int, int] + + +class PyGLayerHeteroData(NamedTuple): + """A named tuple class to represent heterogenous inputs to a PyG model + layer. The fields are x (input features), edge_index and size + (source and destination sizes), and all fields are dictionaries. + """ + + x: Dict[str, torch.Tensor] + edge_index: Dict[str, torch.Tensor] + size: Dict[str, Tuple[int, int]] + + class SampledSubgraph: r"""An abstract class for sampled subgraph. In the context of a heterogeneous graph, each field should be of `Dict` type. 
Otherwise, @@ -233,6 +255,62 @@ def exclude_edges( ) return calling_class(*_slice_subgraph(self, index)) + def to_pyg( + self, x: Union[torch.Tensor, Dict[str, torch.Tensor]] + ) -> Union[PyGLayerData, PyGLayerHeteroData]: + """ + Process layer inputs so that they can be consumed by a PyG model layer. + + Parameters + ---------- + x : Union[torch.Tensor, Dict[str, torch.Tensor]] + The input node features to the GNN layer. + + Returns + ------- + Union[PyGLayerData, PyGLayerHeteroData] + A named tuple class with `x`, `edge_index` and `size` fields. + Typically, a PyG GNN layer's forward method will accept these as + arguments. + """ + if isinstance(x, torch.Tensor): + # Homogenous + src = self.sampled_csc.indices + dst = expand_indptr( + self.sampled_csc.indptr, + dtype=src.dtype, + output_size=src.size(0), + ) + edge_index = torch.stack([src, dst], dim=0).long() + dst_size = self.sampled_csc.indptr.size(0) - 1 + # h and h[:dst_size] correspond to source and destination features resp. + return PyGLayerData( + (x, x[:dst_size]), edge_index, (x.size(0), dst_size) + ) + else: + # Heterogenous + x_dst_dict = {} + edge_index_dict = {} + sizes_dict = {} + for etype, sampled_csc in self.sampled_csc.items(): + src = sampled_csc.indices + dst = expand_indptr( + sampled_csc.indptr, + dtype=src.dtype, + output_size=src.size(0), + ) + edge_index = torch.stack([src, dst], dim=0).long() + dst_size = sampled_csc.indptr.size(0) - 1 + # h and h[:dst_size] correspond to source and destination features resp. + src_ntype, _, dst_ntype = etype_str_to_tuple(etype) + x_dst_dict[dst_ntype] = x[dst_ntype][:dst_size] + edge_index_dict[etype] = edge_index + sizes_dict[etype] = (x[src_ntype].size(0), dst_size) + + return PyGLayerHeteroData( + (x, x_dst_dict), edge_index_dict, sizes_dict + ) + def to( self, device: torch.device, non_blocking=False ) -> None: # pylint: disable=invalid-name diff --git a/tests/python/pytorch/graphbolt/impl/test_sampled_subgraph_impl.py b/tests/python/pytorch/graphbolt/impl/test_sampled_subgraph_impl.py index f6709405591c..e7676f27756d 100644 --- a/tests/python/pytorch/graphbolt/impl/test_sampled_subgraph_impl.py +++ b/tests/python/pytorch/graphbolt/impl/test_sampled_subgraph_impl.py @@ -2,6 +2,7 @@ import backend as F +import dgl import dgl.graphbolt as gb import pytest import torch @@ -505,6 +506,105 @@ def test_exclude_edges_hetero_duplicated_tensor(reverse_row, reverse_column): _assert_container_equal(result.original_edge_ids, expected_edge_ids) +def test_to_pyg_homo(): + graph = dgl.graph(([5, 0, 7, 7, 2, 4], [0, 1, 2, 2, 3, 4])) + graph = gb.from_dglgraph(graph, is_homogeneous=True).to(F.ctx()) + items = torch.LongTensor([[0, 3], [4, 4]]) + names = "seeds" + itemset = gb.ItemSet(items, names=names) + datapipe = gb.ItemSampler(itemset, batch_size=4).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([-1]) for _ in range(num_layer)] + sampler = gb.NeighborSampler + datapipe = sampler( + datapipe, + graph, + fanouts, + deduplicate=True, + ) + for minibatch in datapipe: + x = torch.randn((minibatch.node_ids().size(0), 2), dtype=torch.float32) + for subgraph in minibatch.sampled_subgraphs: + (x_src, x_dst), edge_index, sizes = subgraph.to_pyg(x) + assert torch.equal(x_src, x) + dst_size = subgraph.original_column_node_ids.size(0) + assert torch.equal(x_dst, x[:dst_size]) + src_size = subgraph.original_row_node_ids.size(0) + assert dst_size == sizes[1] + assert src_size == sizes[0] + assert torch.equal(edge_index[0], subgraph.sampled_csc.indices) + assert torch.equal( + 
edge_index[1], + gb.expand_indptr( + subgraph.sampled_csc.indptr, + subgraph.sampled_csc.indices.dtype, + ), + ) + x = x_dst + + +def test_to_pyg_hetero(): + # COO graph: + # [0, 0, 1, 1, 2, 2, 3, 3, 4, 4] + # [2, 4, 2, 3, 0, 1, 1, 0, 0, 1] + # [1, 1, 1, 1, 0, 0, 0, 0, 0] - > edge type. + # num_nodes = 5, num_n1 = 2, num_n2 = 3 + ntypes = {"n1": 0, "n2": 1} + etypes = {"n1:e1:n2": 0, "n2:e2:n1": 1} + indptr = torch.LongTensor([0, 2, 4, 6, 8, 10]) + indices = torch.LongTensor([2, 4, 2, 3, 0, 1, 1, 0, 0, 1]) + type_per_edge = torch.LongTensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) + node_type_offset = torch.LongTensor([0, 2, 5]) + graph = gb.fused_csc_sampling_graph( + indptr, + indices, + node_type_offset=node_type_offset, + type_per_edge=type_per_edge, + node_type_to_id=ntypes, + edge_type_to_id=etypes, + ).to(F.ctx()) + itemset = gb.HeteroItemSet( + {"n1:e1:n2": gb.ItemSet(torch.tensor([[0, 1]]), names="seeds")} + ) + item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) + num_layer = 2 + fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] + Sampler = gb.NeighborSampler + datapipe = Sampler( + item_sampler, + graph, + fanouts, + deduplicate=True, + ) + for minibatch in datapipe: + x = {} + for key, ids in minibatch.node_ids().items(): + x[key] = torch.randn((ids.size(0), 2), dtype=torch.float32) + for subgraph in minibatch.sampled_subgraphs: + (x_src, x_dst), edge_index, sizes = subgraph.to_pyg(x) + assert x_src == x + for ntype in x: + dst_size = subgraph.original_column_node_ids[ntype].size(0) + assert torch.equal(x_dst[ntype], x[ntype][:dst_size]) + for etype in subgraph.sampled_csc: + src_ntype, _, dst_ntype = gb.etype_str_to_tuple(etype) + src_size = subgraph.original_row_node_ids[src_ntype].size(0) + dst_size = subgraph.original_column_node_ids[dst_ntype].size(0) + assert dst_size == sizes[etype][1] + assert src_size == sizes[etype][0] + assert torch.equal( + edge_index[etype][0], subgraph.sampled_csc[etype].indices + ) + assert torch.equal( + edge_index[etype][1], + gb.expand_indptr( + subgraph.sampled_csc[etype].indptr, + subgraph.sampled_csc[etype].indices.dtype, + ), + ) + x = x_dst + + @unittest.skipIf( F._default_context_str == "cpu", reason="`to` function needs GPU to test.", From 6bce0cdfc50190911f34fe8d7407d6ee528c6263 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 26 Aug 2024 19:36:53 -0400 Subject: [PATCH 53/78] [GraphBolt][PyG] Use `SampledSubgraph.to_pyg` in examples. (#7747) --- .../pyg/hetero/node_classification.py | 32 ++----------------- .../pyg/labor/node_classification.py | 25 ++------------- .../pyg/node_classification_advanced.py | 25 ++------------- 3 files changed, 8 insertions(+), 74 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 836b907941ef..032b84d82c4b 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -74,34 +74,6 @@ def create_dataloader( return gb.DataLoader(datapipe, num_workers=args.num_workers) -def convert_to_pyg(h, subgraph): - ##################################################################### - # (HIGHLIGHT) Convert given features to be consumed by a PyG layer. - # - # We convert the provided sampled edges in CSC format from GraphBolt and - # convert to COO via using gb.expand_indptr. 
- ##################################################################### - h_dst_dict = {} - edge_index_dict = {} - sizes_dict = {} - for etype, sampled_csc in subgraph.sampled_csc.items(): - src = sampled_csc.indices - dst = gb.expand_indptr( - sampled_csc.indptr, - dtype=src.dtype, - output_size=src.size(0), - ) - edge_index = torch.stack([src, dst], dim=0).long() - dst_size = sampled_csc.indptr.size(0) - 1 - # h and h[:dst_size] correspond to source and destination features resp. - src_ntype, _, dst_ntype = gb.etype_str_to_tuple(etype) - h_dst_dict[dst_ntype] = h[dst_ntype][:dst_size] - edge_index_dict[etype] = edge_index - sizes_dict[etype] = (h[src_ntype].size(0), dst_size) - - return (h, h_dst_dict), edge_index_dict, sizes_dict - - class RelGraphConvLayer(nn.Module): def __init__( self, @@ -153,7 +125,7 @@ def forward(self, subgraph, x): # only on the destination nodes' features. By doing so, we ensure the # feature dimensions match and prevent any misuse of incorrect node # features. - (h, h_dst), edge_index, size = convert_to_pyg(x, subgraph) + (h, h_dst), edge_index, size = subgraph.to_pyg(x) h_out = {} for etype in edge_index: @@ -514,7 +486,7 @@ def main(): # Initialize the entity classification model. model = EntityClassify( - graph, feat_size, hidden_channels, num_classes, 3 + graph, feat_size, hidden_channels, num_classes, len(args.fanout) ).to(args.device) print( diff --git a/examples/graphbolt/pyg/labor/node_classification.py b/examples/graphbolt/pyg/labor/node_classification.py index 7cf0834b7cc8..f6636d2721b7 100644 --- a/examples/graphbolt/pyg/labor/node_classification.py +++ b/examples/graphbolt/pyg/labor/node_classification.py @@ -30,25 +30,6 @@ def accuracy(out, labels): return (labels == predictions).sum(dtype=torch.float64) / labels.size(0) -def convert_to_pyg(h, subgraph): - ##################################################################### - # (HIGHLIGHT) Convert given features to be consumed by a PyG layer. - # - # We convert the provided sampled edges in CSC format from GraphBolt and - # convert to COO via using gb.expand_indptr. - ##################################################################### - src = subgraph.sampled_csc.indices - dst = gb.expand_indptr( - subgraph.sampled_csc.indptr, - dtype=src.dtype, - output_size=src.size(0), - ) - edge_index = torch.stack([src, dst], dim=0).long() - dst_size = subgraph.sampled_csc.indptr.size(0) - 1 - # h and h[:dst_size] correspond to source and destination features resp. 
- return (h, h[:dst_size]), edge_index, (h.size(0), dst_size) - - class GraphSAGE(torch.nn.Module): def __init__( self, in_size, hidden_size, out_size, n_layers, dropout, variant @@ -75,7 +56,7 @@ def __init__( def forward(self, subgraphs, x): h = x for i, (layer, subgraph) in enumerate(zip(self.layers, subgraphs)): - h, edge_index, size = convert_to_pyg(h, subgraph) + h, edge_index, size = subgraph.to_pyg(h) h = layer(h, edge_index, size=size) if self.variant == "custom": h = self.activation(h) @@ -101,8 +82,8 @@ def inference(self, graph, features, dataloader, storage_device): ) for data in tqdm(dataloader, "Inferencing"): # len(data.sampled_subgraphs) = 1 - h, edge_index, size = convert_to_pyg( - data.node_features["feat"], data.sampled_subgraphs[0] + h, edge_index, size = data.sampled_subgraphs[0].to_pyg( + data.node_features["feat"] ) hidden_x = layer(h, edge_index, size=size) if self.variant == "custom": diff --git a/examples/graphbolt/pyg/node_classification_advanced.py b/examples/graphbolt/pyg/node_classification_advanced.py index e55da747db62..02df19f6fd38 100644 --- a/examples/graphbolt/pyg/node_classification_advanced.py +++ b/examples/graphbolt/pyg/node_classification_advanced.py @@ -75,25 +75,6 @@ def accuracy(out, labels): return (labels == predictions).sum(dtype=torch.float64) / labels.size(0) -def convert_to_pyg(h, subgraph): - ##################################################################### - # (HIGHLIGHT) Convert given features to be consumed by a PyG layer. - # - # We convert the provided sampled edges in CSC format from GraphBolt and - # convert to COO via using gb.expand_indptr. - ##################################################################### - src = subgraph.sampled_csc.indices - dst = gb.expand_indptr( - subgraph.sampled_csc.indptr, - dtype=src.dtype, - output_size=src.size(0), - ) - edge_index = torch.stack([src, dst], dim=0).long() - dst_size = subgraph.sampled_csc.indptr.size(0) - 1 - # h and h[:dst_size] correspond to source and destination features resp. - return (h, h[:dst_size]), edge_index, (h.size(0), dst_size) - - class GraphSAGE(torch.nn.Module): ##################################################################### # (HIGHLIGHT) Define the GraphSAGE model architecture. @@ -123,7 +104,7 @@ def forward(self, subgraphs, x): # given features to get src and dst features to use the PyG layers # in the more efficient bipartite mode. ##################################################################### - h, edge_index, size = convert_to_pyg(h, subgraph) + h, edge_index, size = subgraph.to_pyg(h) h = layer(h, edge_index, size=size) if i != len(subgraphs) - 1: h = F.relu(h) @@ -146,8 +127,8 @@ def inference(self, graph, features, dataloader, storage_device): ) for data in tqdm(dataloader, "Inferencing"): # len(data.sampled_subgraphs) = 1 - h, edge_index, size = convert_to_pyg( - data.node_features["feat"], data.sampled_subgraphs[0] + h, edge_index, size = data.sampled_subgraphs[0].to_pyg( + data.node_features["feat"] ) hidden_x = layer(h, edge_index, size=size) if not is_last_layer: From f38790bfba761ffabc506780bc2d025da48d9388 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 26 Aug 2024 20:28:48 -0400 Subject: [PATCH 54/78] [GraphBolt][CUDA] Fix error when empty env variable is used. 
(#7748) --- python/dgl/graphbolt/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dgl/graphbolt/__init__.py b/python/dgl/graphbolt/__init__.py index 398e1367acb4..006b91d3851b 100644 --- a/python/dgl/graphbolt/__init__.py +++ b/python/dgl/graphbolt/__init__.py @@ -15,7 +15,7 @@ WARNING_STR_TO_BE_SHOWN = None configs = ( {} - if cuda_allocator_env is None + if cuda_allocator_env is None or len(cuda_allocator_env) == 0 else { kv_pair.split(":")[0]: kv_pair.split(":")[1] for kv_pair in cuda_allocator_env.split(",") From 25e7b1b7b10c28b386c6d656c0a739db1ae02cb4 Mon Sep 17 00:00:00 2001 From: Andrei Ivanov <32910461+drivanov@users.noreply.github.com> Date: Tue, 27 Aug 2024 03:57:38 -0700 Subject: [PATCH 55/78] [GRAPHBOLT][WARNINGS]Fixing warnings appearing in some `graphbolt` tests. (#7582) --- python/dgl/graphbolt/itemset.py | 6 +++--- tests/python/pytorch/graphbolt/test_itemset.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/dgl/graphbolt/itemset.py b/python/dgl/graphbolt/itemset.py index eb378e722a47..9df604dcf679 100644 --- a/python/dgl/graphbolt/itemset.py +++ b/python/dgl/graphbolt/itemset.py @@ -177,12 +177,12 @@ def __getitem__(self, index: Union[int, slice, Iterable[int]]): f"{type(self).__name__} index out of range." ) return torch.tensor(index, dtype=dtype) - elif isinstance(index, Iterable): - return torch.tensor(index, dtype=dtype) + elif isinstance(index, torch.Tensor): + return index.to(dtype) else: raise TypeError( f"{type(self).__name__} indices must be int, slice, or " - f"iterable of int, not {type(index)}." + f"torch.Tensor, not {type(index)}." ) elif self._num_items == 1: return self._items[0][index] diff --git a/tests/python/pytorch/graphbolt/test_itemset.py b/tests/python/pytorch/graphbolt/test_itemset.py index 34c18b8a2b35..912c9a035f49 100644 --- a/tests/python/pytorch/graphbolt/test_itemset.py +++ b/tests/python/pytorch/graphbolt/test_itemset.py @@ -123,7 +123,7 @@ def test_ItemSet_seed_nodes(): # Indexing with invalid input type. with pytest.raises( TypeError, - match="ItemSet indices must be int, slice, or iterable of int, not .", + match="ItemSet indices must be int, slice, or torch.Tensor, not .", ): _ = item_set[1.5] From 240a49e68a6da5c79f558245e4a9f45fad66fc01 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 27 Aug 2024 14:15:32 -0400 Subject: [PATCH 56/78] [GraphBolt][Doc] Fix typo in documentation. (#7751) --- python/dgl/graphbolt/impl/cpu_cached_feature.py | 2 +- python/dgl/graphbolt/impl/gpu_cached_feature.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dgl/graphbolt/impl/cpu_cached_feature.py b/python/dgl/graphbolt/impl/cpu_cached_feature.py index 184c382fc3c7..0cb437494b13 100644 --- a/python/dgl/graphbolt/impl/cpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/cpu_cached_feature.py @@ -17,7 +17,7 @@ class CPUCachedFeature(Feature): - r"""CPU cached feature wrapping a fallback feature. Use `cpu_feature_cache` + r"""CPU cached feature wrapping a fallback feature. Use `cpu_cached_feature` to construct an instance of this class. Parameters diff --git a/python/dgl/graphbolt/impl/gpu_cached_feature.py b/python/dgl/graphbolt/impl/gpu_cached_feature.py index 7c7b01cb38ef..59a66413843c 100644 --- a/python/dgl/graphbolt/impl/gpu_cached_feature.py +++ b/python/dgl/graphbolt/impl/gpu_cached_feature.py @@ -18,7 +18,7 @@ class GPUCachedFeature(Feature): r"""GPU cached feature wrapping a fallback feature. 
It uses the least recently used (LRU) algorithm as the cache eviction policy. Use - `gpu_feature_cache` to construct an instance of this class. + `gpu_cached_feature` to construct an instance of this class. Places the GPU cache to torch.cuda.current_device(). From b61d14c934c73cd806a3859c9db9cf3fdc540a74 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 27 Aug 2024 19:29:55 -0400 Subject: [PATCH 57/78] [GraphBolt][PyG] Link prediction example. (#7752) --- examples/graphbolt/pyg/link_prediction.py | 462 ++++++++++++++++++++++ 1 file changed, 462 insertions(+) create mode 100644 examples/graphbolt/pyg/link_prediction.py diff --git a/examples/graphbolt/pyg/link_prediction.py b/examples/graphbolt/pyg/link_prediction.py new file mode 100644 index 000000000000..4c2b05fd410e --- /dev/null +++ b/examples/graphbolt/pyg/link_prediction.py @@ -0,0 +1,462 @@ +""" +This script trains and tests a GraphSAGE model for link prediction on +large graphs using graphbolt dataloader. It is the PyG counterpart of the +example in `examples/graphbolt/link_prediction.py`. + +Paper: [Inductive Representation Learning on Large Graphs] +(https://arxiv.org/abs/1706.02216) + +While node classification predicts labels for nodes based on their +local neighborhoods, link prediction assesses the likelihood of an edge +existing between two nodes, necessitating different sampling strategies +that account for pairs of nodes and their joint neighborhoods. + +This flowchart describes the main functional sequence of the provided example. +main +│ +├───> OnDiskDataset pre-processing +│ +├───> Instantiate SAGE model +│ +├───> train +│ │ +│ ├───> Get graphbolt dataloader (HIGHLIGHT) +| | +| |───> Define a PyG GNN model for link prediction (HIGHLIGHT) +│ │ +│ └───> Training loop +│ │ +│ ├───> SAGE.forward +│ +└───> Validation and test set evaluation +""" +import argparse +import time +from functools import partial + +import dgl.graphbolt as gb +import torch + +# For torch.compile until https://github.com/pytorch/pytorch/issues/121197 is +# resolved. +import torch._inductor.codecache + +torch._dynamo.config.cache_size_limit = 32 + +import torch.nn.functional as F +from torch_geometric.nn import SAGEConv +from torchmetrics.retrieval import RetrievalMRR +from tqdm import tqdm, trange + + +class GraphSAGE(torch.nn.Module): + ##################################################################### + # (HIGHLIGHT) Define the GraphSAGE model architecture. + # + # - This class inherits from `torch.nn.Module`. + # - Two convolutional layers are created using the SAGEConv class from PyG. + # - The forward method defines the computation performed at every call. + ##################################################################### + def __init__(self, in_size, hidden_size, n_layers): + super(GraphSAGE, self).__init__() + self.layers = torch.nn.ModuleList() + sizes = [in_size] + [hidden_size] * n_layers + for i in range(n_layers): + self.layers.append(SAGEConv(sizes[i], sizes[i + 1])) + self.hidden_size = hidden_size + self.predictor = torch.nn.Sequential( + torch.nn.Linear(hidden_size, hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(hidden_size, hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(hidden_size, 1), + ) + + def forward(self, subgraphs, x): + h = x + for i, (layer, subgraph) in enumerate(zip(self.layers, subgraphs)): + ##################################################################### + # (HIGHLIGHT) Convert given features to be consumed by a PyG layer. + # + # PyG layers have two modes, bipartite and normal. 
We slice the + # given features to get src and dst features to use the PyG layers + # in the more efficient bipartite mode. + ##################################################################### + h, edge_index, size = subgraph.to_pyg(h) + h = layer(h, edge_index, size=size) + if i != len(subgraphs) - 1: + h = F.relu(h) + return h + + def inference(self, graph, features, dataloader, storage_device): + """Conduct layer-wise inference to get all the node embeddings.""" + pin_memory = storage_device == "pinned" + buffer_device = torch.device("cpu" if pin_memory else storage_device) + + for layer_idx, layer in enumerate(self.layers): + is_last_layer = layer_idx == len(self.layers) - 1 + + y = torch.empty( + graph.total_num_nodes, + self.hidden_size, + dtype=torch.float32, + device=buffer_device, + pin_memory=pin_memory, + ) + for data in tqdm(dataloader, "Inferencing"): + # len(data.sampled_subgraphs) = 1 + h, edge_index, size = data.sampled_subgraphs[0].to_pyg( + data.node_features["feat"] + ) + hidden_x = layer(h, edge_index, size=size) + if not is_last_layer: + hidden_x = F.relu(hidden_x) + # By design, our output nodes are contiguous. + y[data.seeds[0] : data.seeds[-1] + 1] = hidden_x.to( + buffer_device + ) + if not is_last_layer: + features.update("node", None, "feat", y) + + return y + + +def create_dataloader( + graph, features, itemset, batch_size, fanout, device, job +): + ##################################################################### + # (HIGHLIGHT) Create a data loader for efficiently loading graph data. + # + # - 'ItemSampler' samples mini-batches of node IDs from the dataset. + # - 'CopyTo' copies the fetched data to the specified device. + # - 'sample_neighbor' performs neighbor sampling on the graph. + # - 'FeatureFetcher' fetches node features based on the sampled subgraph. + + ##################################################################### + # Create a datapipe for mini-batch sampling with a specific neighbor fanout. + # Here, [10, 10, 10] specifies the number of neighbors sampled for each node at each layer. + # We're using `sample_neighbor` for consistency with DGL's sampling API. + # Note: GraphBolt offers additional sampling methods, such as `sample_layer_neighbor`, + # which could provide further optimization and efficiency for GNN training. + # Users are encouraged to explore these advanced features for potentially improved performance. + + # Initialize an ItemSampler to sample mini-batches from the dataset. + datapipe = gb.ItemSampler( + itemset, + batch_size=batch_size, + shuffle=(job == "train"), + drop_last=(job == "train"), + ) + need_copy = True + # Copy the data to the specified device. + if args.graph_device != "cpu" and need_copy: + datapipe = datapipe.copy_to(device=device) + need_copy = False + # Sample negative edges. + if job == "train": + datapipe = datapipe.sample_uniform_negative(graph, args.neg_ratio) + # Sample neighbors for each node in the mini-batch. + datapipe = getattr(datapipe, args.sample_mode)( + graph, + fanout if job != "infer" else [-1], + overlap_fetch=args.overlap_graph_fetch, + asynchronous=args.graph_device != "cpu", + ) + if job == "train" and args.exclude_edges: + datapipe = datapipe.transform( + partial(gb.exclude_seed_edges, include_reverse_edges=True) + ) + # Copy the data to the specified device. + if args.feature_device != "cpu" and need_copy: + datapipe = datapipe.copy_to(device=device) + need_copy = False + # Fetch node features for the sampled subgraph. 
+ datapipe = datapipe.fetch_feature( + features, + node_feature_keys=["feat"], + overlap_fetch=args.overlap_feature_fetch, + ) + # Copy the data to the specified device. + if need_copy: + datapipe = datapipe.copy_to(device=device) + # Create and return a DataLoader to handle data loading. + return gb.DataLoader(datapipe, num_workers=args.num_workers) + + +@torch.compile +def predictions_step(model, h_src, h_dst): + return model.predictor(h_src * h_dst).squeeze() + + +def compute_predictions(model, node_emb, seeds, device): + """Compute the predictions for given source and destination nodes. + + This function computes the predictions for a set of node pairs, dividing the + task into batches to handle potentially large graphs. + """ + + preds = torch.empty(seeds.shape[0], device=device) + seeds_src, seeds_dst = seeds.T + # The constant number is 1001, due to negtive ratio in the `ogbl-citation2` + # dataset is 1000. + eval_size = args.eval_batch_size * 1001 + # Loop over node pairs in batches. + for start in trange(0, seeds_src.shape[0], eval_size, desc="Evaluate"): + end = min(start + eval_size, seeds_src.shape[0]) + + # Fetch embeddings for current batch of source and destination nodes. + h_src = node_emb[seeds_src[start:end]].to(device, non_blocking=True) + h_dst = node_emb[seeds_dst[start:end]].to(device, non_blocking=True) + + # Compute prediction scores using the model. + preds[start:end] = predictions_step(model, h_src, h_dst) + return preds + + +@torch.no_grad() +def evaluate(model, graph, features, all_nodes_set, valid_set, test_set): + """Evaluate the model on validation and test sets.""" + model.eval() + + dataloader = create_dataloader( + graph, + features, + all_nodes_set, + args.eval_batch_size, + [-1], + args.device, + job="infer", + ) + + # Compute node embeddings for the entire graph. + node_emb = model.inference(graph, features, dataloader, args.feature_device) + results = [] + + # Loop over both validation and test sets. + for split in [valid_set, test_set]: + # Unpack the item set. + seeds = split._items[0].to(node_emb.device) + labels = split._items[1].to(node_emb.device) + indexes = split._items[2].to(node_emb.device) + + preds = compute_predictions(model, node_emb, seeds, indexes.device) + # Compute MRR values for the current split. 
+ results.append(RetrievalMRR()(preds, labels, indexes)) + return results + + +@torch.compile +def train_step(minibatch, optimizer, model): + node_features = minibatch.node_features["feat"] + compacted_seeds = minibatch.compacted_seeds.T + labels = minibatch.labels + optimizer.zero_grad() + y = model(minibatch.sampled_subgraphs, node_features) + logits = model.predictor( + y[compacted_seeds[0]] * y[compacted_seeds[1]] + ).squeeze() + loss = F.binary_cross_entropy_with_logits(logits, labels) + loss.backward() + optimizer.step() + return loss.detach(), labels.size(0) + + +def train_helper(dataloader, model, optimizer, device): + model.train() # Set the model to training mode + total_loss = torch.zeros(1, device=device) # Accumulator for the total loss + total_samples = 0 # Accumulator for the total number of samples processed + start = time.time() + for step, minibatch in tqdm(enumerate(dataloader), "Training"): + loss, num_samples = train_step(minibatch, optimizer, model) + total_loss += loss * num_samples + total_samples += num_samples + if step + 1 == args.early_stop: + break + train_loss = total_loss / total_samples + end = time.time() + return train_loss, end - start + + +def train(dataloader, model, device): + ##################################################################### + # (HIGHLIGHT) Train the model for one epoch. + # + # - Iterates over the data loader, fetching mini-batches of graph data. + # - For each mini-batch, it performs a forward pass, computes loss, and + # updates the model parameters. + # - The function returns the average loss and accuracy for the epoch. + # + # Parameters: + # dataloader: DataLoader that provides mini-batches of graph data. + # model: The GraphSAGE model. + # device: The device (CPU/GPU) to run the training on. + ##################################################################### + + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + + for epoch in range(args.epochs): + train_loss, duration = train_helper( + dataloader, model, optimizer, device + ) + print( + f"Epoch {epoch:02d}, Loss: {train_loss.item():.4f}, " + f"Time: {duration}s" + ) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Which dataset are you going to use?" + ) + parser.add_argument( + "--epochs", type=int, default=10, help="Number of training epochs." + ) + parser.add_argument( + "--lr", + type=float, + default=0.003, + help="Learning rate for optimization.", + ) + parser.add_argument("--neg-ratio", type=int, default=1) + parser.add_argument("--train-batch-size", type=int, default=512) + parser.add_argument("--eval-batch-size", type=int, default=1024) + parser.add_argument( + "--batch-size", type=int, default=1024, help="Batch size for training." + ) + parser.add_argument( + "--num-workers", + type=int, + default=0, + help="Number of workers for data loading.", + ) + parser.add_argument( + "--early-stop", + type=int, + default=0, + help="0 means no early stop, otherwise stop at the input-th step", + ) + parser.add_argument( + "--dataset", + type=str, + default="ogbl-citation2", + choices=["ogbl-citation2"], + help="The dataset we can use for link prediction. Currently" + " only ogbl-citation2 dataset is supported.", + ) + parser.add_argument( + "--fanout", + type=str, + default="10,10,10", + help="Fan-out of neighbor sampling. It is IMPORTANT to keep len(fanout)" + " identical with the number of layers in your model. 
Default: 10,10,10", + ) + parser.add_argument( + "--exclude-edges", + type=bool, + default=True, + help="Whether to exclude reverse edges during sampling. Default: True", + ) + parser.add_argument( + "--mode", + default="pinned-pinned-cuda", + choices=[ + "cpu-cpu-cpu", + "cpu-cpu-cuda", + "cpu-pinned-cuda", + "pinned-pinned-cuda", + "cuda-pinned-cuda", + "cuda-cuda-cuda", + ], + help="Graph storage - feature storage - Train device: 'cpu' for CPU and RAM," + " 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", + ) + parser.add_argument( + "--gpu-cache-size", + type=int, + default=0, + help="The capacity of the GPU cache in bytes.", + ) + parser.add_argument( + "--sample-mode", + default="sample_neighbor", + choices=["sample_neighbor", "sample_layer_neighbor"], + help="The sampling function when doing layerwise sampling.", + ) + parser.add_argument("--precision", type=str, default="high") + return parser.parse_args() + + +def main(): + torch.set_float32_matmul_precision(args.precision) + if not torch.cuda.is_available(): + args.mode = "cpu-cpu-cpu" + print(f"Training in {args.mode} mode.") + args.graph_device, args.feature_device, args.device = args.mode.split("-") + args.overlap_feature_fetch = args.feature_device == "pinned" + args.overlap_graph_fetch = args.graph_device == "pinned" + + # Load and preprocess dataset. + print("Loading data...") + dataset = gb.BuiltinDataset(args.dataset).load() + + # Move the dataset to the selected storage. + graph = ( + dataset.graph.pin_memory_() + if args.graph_device == "pinned" + else dataset.graph.to(args.graph_device) + ) + features = ( + dataset.feature.pin_memory_() + if args.feature_device == "pinned" + else dataset.feature.to(args.feature_device) + ) + + train_set = dataset.tasks[0].train_set + valid_set = dataset.tasks[0].validation_set + test_set = dataset.tasks[0].test_set + all_nodes_set = dataset.all_nodes_set + args.fanout = list(map(int, args.fanout.split(","))) + + if args.gpu_cache_size > 0 and args.feature_device != "cuda": + features._features[("node", None, "feat")] = gb.gpu_cached_feature( + features._features[("node", None, "feat")], + args.gpu_cache_size, + ) + + train_dataloader = create_dataloader( + graph=graph, + features=features, + itemset=train_set, + batch_size=args.train_batch_size, + fanout=args.fanout, + device=args.device, + job="train", + ) + + in_channels = features.size("node", None, "feat")[0] + hidden_channels = 256 + model = GraphSAGE(in_channels, hidden_channels, len(args.fanout)).to( + args.device + ) + assert len(args.fanout) == len(model.layers) + + train(train_dataloader, model, args.device) + + # Test the model. 
+ print("Testing...") + valid_mrr, test_mrr = evaluate( + model, + graph, + features, + all_nodes_set, + valid_set, + test_set, + ) + print( + f"Validation MRR {valid_mrr.item():.4f}, Test MRR {test_mrr.item():.4f}" + ) + + +if __name__ == "__main__": + args = parse_args() + main() From eddbfca53ef681fcbf2d8b0779dc65f257ce77cc Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 29 Aug 2024 09:30:32 +0800 Subject: [PATCH 58/78] [DistGB] fix return_eids argument (#7754) --- python/dgl/distributed/graph_services.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index af3f90bacc76..cd61a06701be 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -154,14 +154,12 @@ def _sample_neighbors_graphbolt( fanout = torch.LongTensor([fanout]) assert isinstance(fanout, torch.Tensor), "Expect a tensor of fanout." - return_eids = g.edge_attributes is not None and EID in g.edge_attributes subgraph = g._sample_neighbors( nodes, None, fanout, replace=replace, probs_or_mask=probs_or_mask, - return_eids=return_eids, ) # 3. Map local node IDs to global node IDs. @@ -177,7 +175,7 @@ def _sample_neighbors_graphbolt( global_dst = global_nid_mapping[local_dst] global_eids = None - if return_eids: + if g.edge_attributes is not None and EID in g.edge_attributes: global_eids = g.edge_attributes[EID][subgraph.original_edge_ids] return LocalSampledGraph( global_src, global_dst, global_eids, subgraph.type_per_edge From 9782c021de667d52f09467ef92b19a43cf8c0e85 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:10:05 +0800 Subject: [PATCH 59/78] [Dist] move DistNode/EdgeDataLoader to distributed (#7755) --- docs/source/api/python/dgl.dataloading.rst | 2 - docs/source/api/python/dgl.distributed.rst | 4 + docs/source/guide/distributed-apis.rst | 6 +- .../graphsage/node_classification.py | 4 +- .../node_classification_unsupervised.py | 4 +- .../distributed/rgcn/node_classification.py | 7 +- examples/pytorch/graphsage/dist/train_dist.py | 4 +- .../graphsage/dist/train_dist_transductive.py | 36 +- .../graphsage/dist/train_dist_unsupervised.py | 28 +- .../train_dist_unsupervised_transductive.py | 27 +- .../rgcn/experimental/entity_classify_dist.py | 7 +- python/dgl/data/citation_graph.py | 7 +- python/dgl/dataloading/__init__.py | 2 +- python/dgl/dataloading/dataloader.py | 41 +- python/dgl/dataloading/dist_dataloader.py | 702 ----------------- python/dgl/distributed/__init__.py | 7 +- python/dgl/distributed/dist_dataloader.py | 703 +++++++++++++++++- tests/distributed/test_mp_dataloader.py | 38 +- 18 files changed, 843 insertions(+), 786 deletions(-) delete mode 100644 python/dgl/dataloading/dist_dataloader.py diff --git a/docs/source/api/python/dgl.dataloading.rst b/docs/source/api/python/dgl.dataloading.rst index a292d7d3e8ec..a52e4aa0072f 100644 --- a/docs/source/api/python/dgl.dataloading.rst +++ b/docs/source/api/python/dgl.dataloading.rst @@ -26,8 +26,6 @@ DataLoaders DataLoader GraphDataLoader - DistNodeDataLoader - DistEdgeDataLoader .. _api-dataloading-neighbor-sampling: diff --git a/docs/source/api/python/dgl.distributed.rst b/docs/source/api/python/dgl.distributed.rst index 315c77f5a658..7fbd3866315d 100644 --- a/docs/source/api/python/dgl.distributed.rst +++ b/docs/source/api/python/dgl.distributed.rst @@ -70,6 +70,10 @@ Distributed DataLoader .. 
autoclass:: DistDataLoader +.. autoclass:: DistNodeDataLoader + +.. autoclass:: DistEdgeDataLoader + .. _api-distributed-sampling-ops: Distributed Graph Sampling Operators ``````````````````````````````````````` diff --git a/docs/source/guide/distributed-apis.rst b/docs/source/guide/distributed-apis.rst index 64c0b3b15354..70a2d53ff247 100644 --- a/docs/source/guide/distributed-apis.rst +++ b/docs/source/guide/distributed-apis.rst @@ -275,14 +275,14 @@ difference is that users need to use :func:`dgl.distributed.sample_neighbors` an The high-level sampling APIs (:class:`~dgl.dataloading.NodeDataLoader` and :class:`~dgl.dataloading.EdgeDataLoader` ) has distributed counterparts -(:class:`~dgl.dataloading.DistNodeDataLoader` and -:class:`~dgl.dataloading.DistEdgeDataLoader`). The code is exactly the same as +(:class:`~dgl.distributed.DistNodeDataLoader` and +:class:`~dgl.distributed.DistEdgeDataLoader`). The code is exactly the same as single-process sampling otherwise. .. code:: python sampler = dgl.sampling.MultiLayerNeighborSampler([10, 25]) - dataloader = dgl.sampling.DistNodeDataLoader(g, train_nid, sampler, + dataloader = dgl.distributed.DistNodeDataLoader(g, train_nid, sampler, batch_size=batch_size, shuffle=True) for batch in dataloader: ... diff --git a/examples/distributed/graphsage/node_classification.py b/examples/distributed/graphsage/node_classification.py index 3e4ae6217c50..4ee21985d936 100644 --- a/examples/distributed/graphsage/node_classification.py +++ b/examples/distributed/graphsage/node_classification.py @@ -109,7 +109,7 @@ def inference(self, g, x, batch_size, device): # `-1` indicates all inbound edges will be inlcuded, namely, full # neighbor sampling. sampler = dgl.dataloading.NeighborSampler([-1]) - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader = dgl.distributed.DistNodeDataLoader( g, nodes, sampler, @@ -212,7 +212,7 @@ def run(args, device, data): sampler = dgl.dataloading.NeighborSampler( [int(fanout) for fanout in args.fan_out.split(",")] ) - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader = dgl.distributed.DistNodeDataLoader( g, train_nid, sampler, diff --git a/examples/distributed/graphsage/node_classification_unsupervised.py b/examples/distributed/graphsage/node_classification_unsupervised.py index 4011bd48c9e4..c940d1a9570a 100644 --- a/examples/distributed/graphsage/node_classification_unsupervised.py +++ b/examples/distributed/graphsage/node_classification_unsupervised.py @@ -79,7 +79,7 @@ def inference(self, g, x, batch_size, device): # Create sampler sampler = dgl.dataloading.NeighborSampler([-1]) # Create dataloader - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader = dgl.distributed.DistNodeDataLoader( g, nodes, sampler, @@ -203,7 +203,7 @@ def run(args, device, data): # Create dataloader exclude = "reverse_id" if args.remove_edge else None reverse_eids = th.arange(g.num_edges()) if args.remove_edge else None - dataloader = dgl.dataloading.DistEdgeDataLoader( + dataloader = dgl.distributed.DistEdgeDataLoader( g, train_eids, sampler, diff --git a/examples/distributed/rgcn/node_classification.py b/examples/distributed/rgcn/node_classification.py index 8200d117f769..7bf74f83ecd5 100644 --- a/examples/distributed/rgcn/node_classification.py +++ b/examples/distributed/rgcn/node_classification.py @@ -6,6 +6,7 @@ * l2norm applied to all weights * remove nodes that won't be touched """ + import argparse import gc, os import itertools @@ -459,7 +460,7 @@ def run(args, device, data): val_fanouts = [int(fanout) for 
fanout in args.validation_fanout.split(",")] sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts) - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader = dgl.distributed.DistNodeDataLoader( g, {"paper": train_nid}, sampler, @@ -469,7 +470,7 @@ def run(args, device, data): ) valid_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts) - valid_dataloader = dgl.dataloading.DistNodeDataLoader( + valid_dataloader = dgl.distributed.DistNodeDataLoader( g, {"paper": val_nid}, valid_sampler, @@ -479,7 +480,7 @@ def run(args, device, data): ) test_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts) - test_dataloader = dgl.dataloading.DistNodeDataLoader( + test_dataloader = dgl.distributed.DistNodeDataLoader( g, {"paper": test_nid}, test_sampler, diff --git a/examples/pytorch/graphsage/dist/train_dist.py b/examples/pytorch/graphsage/dist/train_dist.py index 6ed6570855ef..d2f192ac4f99 100644 --- a/examples/pytorch/graphsage/dist/train_dist.py +++ b/examples/pytorch/graphsage/dist/train_dist.py @@ -88,7 +88,7 @@ def inference(self, g, x, batch_size, device): print(f"|V|={g.num_nodes()}, eval batch size: {batch_size}") sampler = dgl.dataloading.NeighborSampler([-1]) - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader = dgl.distributed.DistNodeDataLoader( g, nodes, sampler, @@ -153,7 +153,7 @@ def run(args, device, data): sampler = dgl.dataloading.NeighborSampler( [int(fanout) for fanout in args.fan_out.split(",")] ) - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader = dgl.distributed.DistNodeDataLoader( g, train_nid, sampler, diff --git a/examples/pytorch/graphsage/dist/train_dist_transductive.py b/examples/pytorch/graphsage/dist/train_dist_transductive.py index ff1b1795a8eb..903b833b8de9 100644 --- a/examples/pytorch/graphsage/dist/train_dist_transductive.py +++ b/examples/pytorch/graphsage/dist/train_dist_transductive.py @@ -1,14 +1,15 @@ import argparse import time +import dgl + import numpy as np import torch as th import torch.nn as nn import torch.nn.functional as F import torch.optim as optim -import dgl from dgl.distributed import DistEmbedding -from train_dist import DistSAGE, compute_acc +from train_dist import compute_acc, DistSAGE def initializer(shape, dtype): @@ -18,9 +19,7 @@ def initializer(shape, dtype): class DistEmb(nn.Module): - def __init__( - self, num_nodes, emb_size, dgl_sparse_emb=False, dev_id="cpu" - ): + def __init__(self, num_nodes, emb_size, dgl_sparse_emb=False, dev_id="cpu"): super().__init__() self.dev_id = dev_id self.emb_size = emb_size @@ -49,9 +48,11 @@ def load_embs(standalone, emb_layer, g): x = dgl.distributed.DistTensor( ( g.num_nodes(), - emb_layer.module.emb_size - if isinstance(emb_layer, th.nn.parallel.DistributedDataParallel) - else emb_layer.emb_size, + ( + emb_layer.module.emb_size + if isinstance(emb_layer, th.nn.parallel.DistributedDataParallel) + else emb_layer.emb_size + ), ), th.float32, "eval_embs", @@ -60,7 +61,7 @@ def load_embs(standalone, emb_layer, g): num_nodes = nodes.shape[0] for i in range((num_nodes + 1023) // 1024): idx = nodes[ - i * 1024: (i + 1) * 1024 + i * 1024 : (i + 1) * 1024 if (i + 1) * 1024 < num_nodes else num_nodes ] @@ -113,7 +114,7 @@ def run(args, device, data): sampler = dgl.dataloading.NeighborSampler( [int(fanout) for fanout in args.fan_out.split(",")] ) - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader = dgl.distributed.DistNodeDataLoader( g, train_nid, sampler, @@ -164,10 +165,7 @@ def run(args, device, data): emb_optimizer = 
th.optim.SparseAdam( list(emb_layer.module.sparse_emb.parameters()), lr=args.sparse_lr ) - print( - "optimize Pytorch sparse embedding:", - emb_layer.module.sparse_emb - ) + print("optimize Pytorch sparse embedding:", emb_layer.module.sparse_emb) # Training loop iter_tput = [] @@ -231,7 +229,7 @@ def run(args, device, data): acc.item(), np.mean(iter_tput[3:]), gpu_mem_alloc, - np.sum(step_time[-args.log_every:]), + np.sum(step_time[-args.log_every :]), ) ) start = time.time() @@ -267,8 +265,7 @@ def run(args, device, data): device, ) print( - "Part {}, Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}".format - ( + "Part {}, Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}".format( g.rank(), val_acc, test_acc, time.time() - start ) ) @@ -278,10 +275,7 @@ def main(args): dgl.distributed.initialize(args.ip_config) if not args.standalone: th.distributed.init_process_group(backend="gloo") - g = dgl.distributed.DistGraph( - args.graph_name, - part_config=args.part_config - ) + g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config) print("rank:", g.rank()) pb = g.get_partition_book() diff --git a/examples/pytorch/graphsage/dist/train_dist_unsupervised.py b/examples/pytorch/graphsage/dist/train_dist_unsupervised.py index c337f288c3fd..a28900076a6e 100644 --- a/examples/pytorch/graphsage/dist/train_dist_unsupervised.py +++ b/examples/pytorch/graphsage/dist/train_dist_unsupervised.py @@ -2,6 +2,10 @@ import time from contextlib import contextmanager +import dgl +import dgl.function as fn +import dgl.nn.pytorch as dglnn + import numpy as np import sklearn.linear_model as lm import sklearn.metrics as skm @@ -10,9 +14,7 @@ import torch.nn.functional as F import torch.optim as optim import tqdm -import dgl -import dgl.function as fn -import dgl.nn.pytorch as dglnn + class DistSAGE(nn.Module): def __init__( @@ -77,7 +79,7 @@ def inference(self, g, x, batch_size, device): # Create sampler sampler = dgl.dataloading.NeighborSampler([-1]) # Create dataloader - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader = dgl.distributed.DistNodeDataLoader( g, nodes, sampler, @@ -201,7 +203,7 @@ def run(args, device, data): # Create dataloader exclude = "reverse_id" if args.remove_edge else None reverse_eids = th.arange(g.num_edges()) if args.remove_edge else None - dataloader = dgl.dataloading.DistEdgeDataLoader( + dataloader = dgl.distributed.DistEdgeDataLoader( g, train_eids, sampler, @@ -297,12 +299,12 @@ def run(args, device, data): step, loss.item(), np.mean(iter_tput[3:]), - np.sum(step_time[-args.log_every:]), - np.sum(sample_t[-args.log_every:]), - np.sum(feat_copy_t[-args.log_every:]), - np.sum(forward_t[-args.log_every:]), - np.sum(backward_t[-args.log_every:]), - np.sum(update_t[-args.log_every:]), + np.sum(step_time[-args.log_every :]), + np.sum(sample_t[-args.log_every :]), + np.sum(feat_copy_t[-args.log_every :]), + np.sum(forward_t[-args.log_every :]), + np.sum(backward_t[-args.log_every :]), + np.sum(update_t[-args.log_every :]), ) ) start = time.time() @@ -356,9 +358,7 @@ def main(args): dgl.distributed.initialize(args.ip_config) if not args.standalone: th.distributed.init_process_group(backend="gloo") - g = dgl.distributed.DistGraph( - args.graph_name, part_config=args.part_config - ) + g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config) print("rank:", g.rank()) print("number of edges", g.num_edges()) diff --git a/examples/pytorch/graphsage/dist/train_dist_unsupervised_transductive.py 
b/examples/pytorch/graphsage/dist/train_dist_unsupervised_transductive.py index 3f881ae76015..0eb4345501ee 100644 --- a/examples/pytorch/graphsage/dist/train_dist_unsupervised_transductive.py +++ b/examples/pytorch/graphsage/dist/train_dist_unsupervised_transductive.py @@ -1,13 +1,14 @@ import argparse import time +import dgl + import numpy as np import torch as th import torch.nn.functional as F import torch.optim as optim -import dgl from train_dist_transductive import DistEmb, load_embs -from train_dist_unsupervised import CrossEntropyLoss, DistSAGE, compute_acc +from train_dist_unsupervised import compute_acc, CrossEntropyLoss, DistSAGE def generate_emb(standalone, model, emb_layer, g, batch_size, device): @@ -49,7 +50,7 @@ def run(args, device, data): # Create dataloader exclude = "reverse_id" if args.remove_edge else None reverse_eids = th.arange(g.num_edges()) if args.remove_edge else None - dataloader = dgl.dataloading.DistEdgeDataLoader( + dataloader = dgl.distributed.DistEdgeDataLoader( g, train_eids, sampler, @@ -104,9 +105,7 @@ def run(args, device, data): emb_optimizer = th.optim.SparseAdam( list(emb_layer.module.sparse_emb.parameters()), lr=args.sparse_lr ) - print( - "optimize Pytorch sparse embedding:", emb_layer.module.sparse_emb - ) + print("optimize Pytorch sparse embedding:", emb_layer.module.sparse_emb) # Training loop epoch = 0 @@ -172,12 +171,12 @@ def run(args, device, data): step, loss.item(), np.mean(iter_tput[3:]), - np.sum(step_time[-args.log_every:]), - np.sum(sample_t[-args.log_every:]), - np.sum(feat_copy_t[-args.log_every:]), - np.sum(forward_t[-args.log_every:]), - np.sum(backward_t[-args.log_every:]), - np.sum(update_t[-args.log_every:]), + np.sum(step_time[-args.log_every :]), + np.sum(sample_t[-args.log_every :]), + np.sum(feat_copy_t[-args.log_every :]), + np.sum(forward_t[-args.log_every :]), + np.sum(backward_t[-args.log_every :]), + np.sum(update_t[-args.log_every :]), ) ) @@ -228,9 +227,7 @@ def main(args): dgl.distributed.initialize(args.ip_config) if not args.standalone: th.distributed.init_process_group(backend="gloo") - g = dgl.distributed.DistGraph( - args.graph_name, part_config=args.part_config - ) + g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config) print("rank:", g.rank()) print("number of edges", g.num_edges()) diff --git a/examples/pytorch/rgcn/experimental/entity_classify_dist.py b/examples/pytorch/rgcn/experimental/entity_classify_dist.py index 89093ede8a8b..9e1b74c02cf6 100644 --- a/examples/pytorch/rgcn/experimental/entity_classify_dist.py +++ b/examples/pytorch/rgcn/experimental/entity_classify_dist.py @@ -6,6 +6,7 @@ * l2norm applied to all weights * remove nodes that won't be touched """ + import argparse import gc, os import itertools @@ -459,7 +460,7 @@ def run(args, device, data): val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(",")] sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts) - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader = dgl.distributed.DistNodeDataLoader( g, {"paper": train_nid}, sampler, @@ -469,7 +470,7 @@ def run(args, device, data): ) valid_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts) - valid_dataloader = dgl.dataloading.DistNodeDataLoader( + valid_dataloader = dgl.distributed.DistNodeDataLoader( g, {"paper": val_nid}, valid_sampler, @@ -479,7 +480,7 @@ def run(args, device, data): ) test_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts) - test_dataloader = dgl.dataloading.DistNodeDataLoader( + 
test_dataloader = dgl.distributed.DistNodeDataLoader( g, {"paper": test_nid}, test_sampler, diff --git a/python/dgl/data/citation_graph.py b/python/dgl/data/citation_graph.py index 34af53ac2845..d82c55e51a2a 100644 --- a/python/dgl/data/citation_graph.py +++ b/python/dgl/data/citation_graph.py @@ -3,6 +3,7 @@ (lingfan): following dataset loading and preprocessing code from tkipf/gcn https://github.com/tkipf/gcn/blob/master/gcn/utils.py """ + from __future__ import absolute_import import os, sys @@ -14,7 +15,8 @@ import numpy as np import scipy.sparse as sp -from .. import backend as F, batch, convert +from .. import backend as F, convert +from ..batch import batch as batch_graphs from ..convert import from_networkx, graph as dgl_graph, to_networkx from ..transforms import reorder_graph from .dgl_dataset import DGLBuiltinDataset @@ -67,6 +69,7 @@ class CitationGraphDataset(DGLBuiltinDataset): reorder : bool Whether to reorder the graph using :func:`~dgl.reorder_graph`. Default: False. """ + _urls = { "cora_v2": "dataset/cora_v2.zip", "citeseer": "dataset/citeseer.zip", @@ -922,7 +925,7 @@ def save_name(self): @staticmethod def collate_fn(cur): graphs, pmpds, labels = zip(*cur) - batched_graphs = batch.batch(graphs) + batched_graphs = batch_graphs(graphs) batched_pmpds = sp.block_diag(pmpds) batched_labels = np.concatenate(labels, axis=0) return batched_graphs, batched_pmpds, batched_labels diff --git a/python/dgl/dataloading/__init__.py b/python/dgl/dataloading/__init__.py index 1345c6303858..cfeaa7d08600 100644 --- a/python/dgl/dataloading/__init__.py +++ b/python/dgl/dataloading/__init__.py @@ -1,4 +1,5 @@ """Package for dataloaders and samplers.""" + from .. import backend as F from . import negative_sampler from .base import * @@ -11,4 +12,3 @@ if F.get_preferred_backend() == "pytorch": from .spot_target import * from .dataloader import * - from .dist_dataloader import * diff --git a/python/dgl/dataloading/dataloader.py b/python/dgl/dataloading/dataloader.py index 1c5481bc6b78..7231e7299e2f 100644 --- a/python/dgl/dataloading/dataloader.py +++ b/python/dgl/dataloading/dataloader.py @@ -1,4 +1,5 @@ """DGL PyTorch DataLoaders""" + import atexit import inspect import itertools @@ -24,7 +25,6 @@ from ..base import dgl_warning, DGLError, EID, NID from ..batch import batch as batch_graphs from ..cuda import GPUCache -from ..distributed import DistGraph from ..frame import LazyFeature from ..heterograph import DGLGraph from ..storages import wrap_storage @@ -970,11 +970,6 @@ def __init__( super().__init__(**kwargs) return - if isinstance(graph, DistGraph): - raise TypeError( - "Please use dgl.dataloading.DistNodeDataLoader or " - "dgl.datalaoding.DistEdgeDataLoader for DistGraphs." - ) # (BarclayII) I hoped that pin_prefetcher can be merged into PyTorch's native # pin_memory argument. But our neighbor samplers and subgraph samplers # return indices, which could be CUDA tensors (e.g. during UVA sampling) @@ -1477,3 +1472,37 @@ def set_epoch(self, epoch): self.dist_sampler.set_epoch(epoch) else: raise DGLError("set_epoch is only available when use_ddp is True.") + + +class DistNodeDataLoader: + """Deprecated. Please use :class:`~dgl.distributed.DistNodeDataLoader` + instead. + """ + + def __new__(cls, *args, **kwargs): + dgl_warning( + "dgl.dataloading.DistNodeDataLoader has been moved to " + "dgl.distributed.DistNodeDataLoader. This old class is deprecated " + "and will be removed soon. Please update your code to use the new " + "class." 
+ ) + from ..distributed import DistNodeDataLoader as NewDistNodeDataLoader + + return NewDistNodeDataLoader(*args, **kwargs) + + +class DistEdgeDataLoader: + """Deprecated. Please use :class:`~dgl.distributed.DistEdgeDataLoader` + instead. + """ + + def __new__(cls, *args, **kwargs): + dgl_warning( + "dgl.dataloading.DistEdgeDataLoader has been moved to " + "dgl.distributed.DistEdgeDataLoader. This old class is deprecated " + "and will be removed soon. Please update your code to use the new " + "class." + ) + from ..distributed import DistEdgeDataLoader as NewDistEdgeDataLoader + + return NewDistEdgeDataLoader(*args, **kwargs) diff --git a/python/dgl/dataloading/dist_dataloader.py b/python/dgl/dataloading/dist_dataloader.py deleted file mode 100644 index faf6a8a7a082..000000000000 --- a/python/dgl/dataloading/dist_dataloader.py +++ /dev/null @@ -1,702 +0,0 @@ -"""Distributed dataloaders. -""" - -import inspect -from abc import ABC, abstractmethod, abstractproperty -from collections.abc import Mapping - -from .. import backend as F, transforms, utils -from ..base import EID, NID -from ..convert import heterograph -from ..distributed import DistDataLoader - -# [Note] As implementation of ``dgl.distributed.DistDataLoader`` is independent -# of ``dgl.dataloading.DataLoader`` currently, dedicated collators are defined -# here instead of using ``dgl.dataloading.CollateWrapper``. - - -def _find_exclude_eids_with_reverse_id(g, eids, reverse_eid_map): - if isinstance(eids, Mapping): - eids = {g.to_canonical_etype(k): v for k, v in eids.items()} - exclude_eids = { - k: F.cat([v, F.gather_row(reverse_eid_map[k], v)], 0) - for k, v in eids.items() - } - else: - exclude_eids = F.cat([eids, F.gather_row(reverse_eid_map, eids)], 0) - return exclude_eids - - -def _find_exclude_eids_with_reverse_types(g, eids, reverse_etype_map): - exclude_eids = {g.to_canonical_etype(k): v for k, v in eids.items()} - reverse_etype_map = { - g.to_canonical_etype(k): g.to_canonical_etype(v) - for k, v in reverse_etype_map.items() - } - exclude_eids.update( - {reverse_etype_map[k]: v for k, v in exclude_eids.items()} - ) - return exclude_eids - - -def _find_exclude_eids(g, exclude_mode, eids, **kwargs): - """Find all edge IDs to exclude according to :attr:`exclude_mode`. - - Parameters - ---------- - g : DGLGraph - The graph. - exclude_mode : str, optional - Can be either of the following, - - None (default) - Does not exclude any edge. - - 'self' - Exclude the given edges themselves but nothing else. - - 'reverse_id' - Exclude all edges specified in ``eids``, as well as their reverse edges - of the same edge type. - - The mapping from each edge ID to its reverse edge ID is specified in - the keyword argument ``reverse_eid_map``. - - This mode assumes that the reverse of an edge with ID ``e`` and type - ``etype`` will have ID ``reverse_eid_map[e]`` and type ``etype``. - - 'reverse_types' - Exclude all edges specified in ``eids``, as well as their reverse - edges of the corresponding edge types. - - The mapping from each edge type to its reverse edge type is specified - in the keyword argument ``reverse_etype_map``. - - This mode assumes that the reverse of an edge with ID ``e`` and type ``etype`` - will have ID ``e`` and type ``reverse_etype_map[etype]``. - eids : Tensor or dict[etype, Tensor] - The edge IDs. - reverse_eid_map : Tensor or dict[etype, Tensor] - The mapping from edge ID to its reverse edge ID. - reverse_etype_map : dict[etype, etype] - The mapping from edge etype to its reverse edge type. 
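For reference, a minimal sketch of what the deprecation shims added above mean for user code (doctest-style, assuming an initialized DistGraph ``g`` and a node-ID tensor ``train_nid``; the fanouts and batch size are illustrative). Both spellings construct the same loader; the old one simply emits a warning first.

>>> sampler = dgl.dataloading.NeighborSampler([10, 25])
>>> # deprecated spelling: warns, then forwards to the relocated class
>>> dl_old = dgl.dataloading.DistNodeDataLoader(g, train_nid, sampler, batch_size=1024)
>>> # preferred spelling after this change
>>> dl_new = dgl.distributed.DistNodeDataLoader(g, train_nid, sampler, batch_size=1024)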
- """ - if exclude_mode is None: - return None - elif exclude_mode == "self": - if isinstance(eids, Mapping): - eids = {g.to_canonical_etype(k): v for k, v in eids.items()} - return eids - elif exclude_mode == "reverse_id": - return _find_exclude_eids_with_reverse_id( - g, eids, kwargs["reverse_eid_map"] - ) - elif exclude_mode == "reverse_types": - return _find_exclude_eids_with_reverse_types( - g, eids, kwargs["reverse_etype_map"] - ) - else: - raise ValueError("unsupported mode {}".format(exclude_mode)) - - -class Collator(ABC): - """Abstract DGL collator for training GNNs on downstream tasks stochastically. - - Provides a :attr:`dataset` object containing the collection of all nodes or edges, - as well as a :attr:`collate` method that combines a set of items from - :attr:`dataset` and obtains the message flow graphs (MFGs). - - Notes - ----- - For the concept of MFGs, please refer to - :ref:`User Guide Section 6 ` and - :doc:`Minibatch Training Tutorials `. - """ - - @abstractproperty - def dataset(self): - """Returns the dataset object of the collator.""" - raise NotImplementedError - - @abstractmethod - def collate(self, items): - """Combines the items from the dataset object and obtains the list of MFGs. - - Parameters - ---------- - items : list[str, int] - The list of node or edge IDs or type-ID pairs. - - Notes - ----- - For the concept of MFGs, please refer to - :ref:`User Guide Section 6 ` and - :doc:`Minibatch Training Tutorials `. - """ - raise NotImplementedError - - @staticmethod - def add_edge_attribute_to_graph(g, data_name): - """Add data into the graph as an edge attribute. - - For some cases such as prob/mask-based sampling on GraphBolt partitions, - we need to prepare such data beforehand. This is because data are - usually saved in DistGraph.ndata/edata, but such data is not in the - format that GraphBolt partitions require. And in GraphBolt, such data - are saved as edge attributes. So we need to add such data into the graph - before any sampling is kicked off. - - Parameters - ---------- - g : DistGraph - The graph. - data_name : str - The name of data that's stored in DistGraph.ndata/edata. - """ - if g._use_graphbolt and data_name: - g.add_edge_attribute(data_name) - - -class NodeCollator(Collator): - """DGL collator to combine nodes and their computation dependencies within a minibatch for - training node classification or regression on a single graph with neighborhood sampling. - - Parameters - ---------- - g : DGLGraph - The graph. - nids : Tensor or dict[ntype, Tensor] - The node set to compute outputs. - graph_sampler : dgl.dataloading.BlockSampler - The neighborhood sampler. - - Examples - -------- - To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on - a homogeneous graph where each node takes messages from all neighbors (assume - the backend is PyTorch): - - >>> sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5]) - >>> collator = dgl.dataloading.NodeCollator(g, train_nid, sampler) - >>> dataloader = torch.utils.data.DataLoader( - ... collator.dataset, collate_fn=collator.collate, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=4) - >>> for input_nodes, output_nodes, blocks in dataloader: - ... train_on(input_nodes, output_nodes, blocks) - - Notes - ----- - For the concept of MFGs, please refer to - :ref:`User Guide Section 6 ` and - :doc:`Minibatch Training Tutorials `. 
- """ - - def __init__(self, g, nids, graph_sampler): - self.g = g - if not isinstance(nids, Mapping): - assert ( - len(g.ntypes) == 1 - ), "nids should be a dict of node type and ids for graph with multiple node types" - self.graph_sampler = graph_sampler - - self.nids = utils.prepare_tensor_or_dict(g, nids, "nids") - self._dataset = utils.maybe_flatten_dict(self.nids) - - # Add prob/mask into graphbolt partition's edge attributes if needed. - Collator.add_edge_attribute_to_graph(self.g, self.graph_sampler.prob) - - @property - def dataset(self): - return self._dataset - - def collate(self, items): - """Find the list of MFGs necessary for computing the representation of given - nodes for a node classification/regression task. - - Parameters - ---------- - items : list[int] or list[tuple[str, int]] - Either a list of node IDs (for homogeneous graphs), or a list of node type-ID - pairs (for heterogeneous graphs). - - Returns - ------- - input_nodes : Tensor or dict[ntype, Tensor] - The input nodes necessary for computation in this minibatch. - - If the original graph has multiple node types, return a dictionary of - node type names and node ID tensors. Otherwise, return a single tensor. - output_nodes : Tensor or dict[ntype, Tensor] - The nodes whose representations are to be computed in this minibatch. - - If the original graph has multiple node types, return a dictionary of - node type names and node ID tensors. Otherwise, return a single tensor. - MFGs : list[DGLGraph] - The list of MFGs necessary for computing the representation. - """ - if isinstance(items[0], tuple): - # returns a list of pairs: group them by node types into a dict - items = utils.group_as_dict(items) - items = utils.prepare_tensor_or_dict(self.g, items, "items") - - input_nodes, output_nodes, blocks = self.graph_sampler.sample_blocks( - self.g, items - ) - - return input_nodes, output_nodes, blocks - - -class EdgeCollator(Collator): - """DGL collator to combine edges and their computation dependencies within a minibatch for - training edge classification, edge regression, or link prediction on a single graph - with neighborhood sampling. - - Given a set of edges, the collate function will yield - - * A tensor of input nodes necessary for computing the representation on edges, or - a dictionary of node type names and such tensors. - - * A subgraph that contains only the edges in the minibatch and their incident nodes. - Note that the graph has an identical metagraph with the original graph. - - * If a negative sampler is given, another graph that contains the "negative edges", - connecting the source and destination nodes yielded from the given negative sampler. - - * A list of MFGs necessary for computing the representation of the incident nodes - of the edges in the minibatch. - - Parameters - ---------- - g : DGLGraph - The graph from which the edges are iterated in minibatches and the subgraphs - are generated. - eids : Tensor or dict[etype, Tensor] - The edge set in graph :attr:`g` to compute outputs. - graph_sampler : dgl.dataloading.BlockSampler - The neighborhood sampler. - g_sampling : DGLGraph, optional - The graph where neighborhood sampling and message passing is performed. - - Note that this is not necessarily the same as :attr:`g`. - - If None, assume to be the same as :attr:`g`. - exclude : str, optional - Whether and how to exclude dependencies related to the sampled edges in the - minibatch. Possible values are - - * None, which excludes nothing. 
- - * ``'self'``, which excludes the sampled edges themselves but nothing else. - - * ``'reverse_id'``, which excludes the reverse edges of the sampled edges. The said - reverse edges have the same edge type as the sampled edges. Only works - on edge types whose source node type is the same as its destination node type. - - * ``'reverse_types'``, which excludes the reverse edges of the sampled edges. The - said reverse edges have different edge types from the sampled edges. - - If ``g_sampling`` is given, ``exclude`` is ignored and will be always ``None``. - reverse_eids : Tensor or dict[etype, Tensor], optional - A tensor of reverse edge ID mapping. The i-th element indicates the ID of - the i-th edge's reverse edge. - - If the graph is heterogeneous, this argument requires a dictionary of edge - types and the reverse edge ID mapping tensors. - - Required and only used when ``exclude`` is set to ``reverse_id``. - - For heterogeneous graph this will be a dict of edge type and edge IDs. Note that - only the edge types whose source node type is the same as destination node type - are needed. - reverse_etypes : dict[etype, etype], optional - The mapping from the edge type to its reverse edge type. - - Required and only used when ``exclude`` is set to ``reverse_types``. - negative_sampler : callable, optional - The negative sampler. Can be omitted if no negative sampling is needed. - - The negative sampler must be a callable that takes in the following arguments: - - * The original (heterogeneous) graph. - - * The ID array of sampled edges in the minibatch, or the dictionary of edge - types and ID array of sampled edges in the minibatch if the graph is - heterogeneous. - - It should return - - * A pair of source and destination node ID arrays as negative samples, - or a dictionary of edge types and such pairs if the graph is heterogenenous. - - A set of builtin negative samplers are provided in - :ref:`the negative sampling module `. - - Examples - -------- - The following example shows how to train a 3-layer GNN for edge classification on a - set of edges ``train_eid`` on a homogeneous undirected graph. Each node takes - messages from all neighbors. - - Say that you have an array of source node IDs ``src`` and another array of destination - node IDs ``dst``. One can make it bidirectional by adding another set of edges - that connects from ``dst`` to ``src``: - - >>> g = dgl.graph((torch.cat([src, dst]), torch.cat([dst, src]))) - - One can then know that the ID difference of an edge and its reverse edge is ``|E|``, - where ``|E|`` is the length of your source/destination array. The reverse edge - mapping can be obtained by - - >>> E = len(src) - >>> reverse_eids = torch.cat([torch.arange(E, 2 * E), torch.arange(0, E)]) - - Note that the sampled edges as well as their reverse edges are removed from - computation dependencies of the incident nodes. This is a common trick to avoid - information leakage. - - >>> sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5]) - >>> collator = dgl.dataloading.EdgeCollator( - ... g, train_eid, sampler, exclude='reverse_id', - ... reverse_eids=reverse_eids) - >>> dataloader = torch.utils.data.DataLoader( - ... collator.dataset, collate_fn=collator.collate, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=4) - >>> for input_nodes, pair_graph, blocks in dataloader: - ... 
train_on(input_nodes, pair_graph, blocks) - - To train a 3-layer GNN for link prediction on a set of edges ``train_eid`` on a - homogeneous graph where each node takes messages from all neighbors (assume the - backend is PyTorch), with 5 uniformly chosen negative samples per edge: - - >>> sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5]) - >>> neg_sampler = dgl.dataloading.negative_sampler.Uniform(5) - >>> collator = dgl.dataloading.EdgeCollator( - ... g, train_eid, sampler, exclude='reverse_id', - ... reverse_eids=reverse_eids, negative_sampler=neg_sampler) - >>> dataloader = torch.utils.data.DataLoader( - ... collator.dataset, collate_fn=collator.collate, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=4) - >>> for input_nodes, pos_pair_graph, neg_pair_graph, blocks in dataloader: - ... train_on(input_nodse, pair_graph, neg_pair_graph, blocks) - - For heterogeneous graphs, the reverse of an edge may have a different edge type - from the original edge. For instance, consider that you have an array of - user-item clicks, representated by a user array ``user`` and an item array ``item``. - You may want to build a heterogeneous graph with a user-click-item relation and an - item-clicked-by-user relation. - - >>> g = dgl.heterograph({ - ... ('user', 'click', 'item'): (user, item), - ... ('item', 'clicked-by', 'user'): (item, user)}) - - To train a 3-layer GNN for edge classification on a set of edges ``train_eid`` with - type ``click``, you can write - - >>> sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5]) - >>> collator = dgl.dataloading.EdgeCollator( - ... g, {'click': train_eid}, sampler, exclude='reverse_types', - ... reverse_etypes={'click': 'clicked-by', 'clicked-by': 'click'}) - >>> dataloader = torch.utils.data.DataLoader( - ... collator.dataset, collate_fn=collator.collate, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=4) - >>> for input_nodes, pair_graph, blocks in dataloader: - ... train_on(input_nodes, pair_graph, blocks) - - To train a 3-layer GNN for link prediction on a set of edges ``train_eid`` with type - ``click``, you can write - - >>> sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5]) - >>> neg_sampler = dgl.dataloading.negative_sampler.Uniform(5) - >>> collator = dgl.dataloading.EdgeCollator( - ... g, train_eid, sampler, exclude='reverse_types', - ... reverse_etypes={'click': 'clicked-by', 'clicked-by': 'click'}, - ... negative_sampler=neg_sampler) - >>> dataloader = torch.utils.data.DataLoader( - ... collator.dataset, collate_fn=collator.collate, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=4) - >>> for input_nodes, pos_pair_graph, neg_pair_graph, blocks in dataloader: - ... train_on(input_nodes, pair_graph, neg_pair_graph, blocks) - - Notes - ----- - For the concept of MFGs, please refer to - :ref:`User Guide Section 6 ` and - :doc:`Minibatch Training Tutorials `. - """ - - def __init__( - self, - g, - eids, - graph_sampler, - g_sampling=None, - exclude=None, - reverse_eids=None, - reverse_etypes=None, - negative_sampler=None, - ): - self.g = g - if not isinstance(eids, Mapping): - assert ( - len(g.etypes) == 1 - ), "eids should be a dict of etype and ids for graph with multiple etypes" - self.graph_sampler = graph_sampler - - # One may wish to iterate over the edges in one graph while perform sampling in - # another graph. 
This may be the case for iterating over validation and test - # edge set while perform neighborhood sampling on the graph formed by only - # the training edge set. - # See GCMC for an example usage. - if g_sampling is not None: - self.g_sampling = g_sampling - self.exclude = None - else: - self.g_sampling = self.g - self.exclude = exclude - - self.reverse_eids = reverse_eids - self.reverse_etypes = reverse_etypes - self.negative_sampler = negative_sampler - - self.eids = utils.prepare_tensor_or_dict(g, eids, "eids") - self._dataset = utils.maybe_flatten_dict(self.eids) - - # Add prob/mask into graphbolt partition's edge attributes if needed. - Collator.add_edge_attribute_to_graph(self.g, self.graph_sampler.prob) - - @property - def dataset(self): - return self._dataset - - def _collate(self, items): - if isinstance(items[0], tuple): - # returns a list of pairs: group them by node types into a dict - items = utils.group_as_dict(items) - items = utils.prepare_tensor_or_dict(self.g_sampling, items, "items") - - pair_graph = self.g.edge_subgraph(items) - seed_nodes = pair_graph.ndata[NID] - - exclude_eids = _find_exclude_eids( - self.g_sampling, - self.exclude, - items, - reverse_eid_map=self.reverse_eids, - reverse_etype_map=self.reverse_etypes, - ) - - input_nodes, _, blocks = self.graph_sampler.sample_blocks( - self.g_sampling, seed_nodes, exclude_eids=exclude_eids - ) - - return input_nodes, pair_graph, blocks - - def _collate_with_negative_sampling(self, items): - if isinstance(items[0], tuple): - # returns a list of pairs: group them by node types into a dict - items = utils.group_as_dict(items) - items = utils.prepare_tensor_or_dict(self.g_sampling, items, "items") - - pair_graph = self.g.edge_subgraph(items, relabel_nodes=False) - induced_edges = pair_graph.edata[EID] - - neg_srcdst = self.negative_sampler(self.g, items) - if not isinstance(neg_srcdst, Mapping): - assert len(self.g.etypes) == 1, ( - "graph has multiple or no edge types; " - "please return a dict in negative sampler." - ) - neg_srcdst = {self.g.canonical_etypes[0]: neg_srcdst} - # Get dtype from a tuple of tensors - dtype = F.dtype(list(neg_srcdst.values())[0][0]) - ctx = F.context(pair_graph) - neg_edges = { - etype: neg_srcdst.get( - etype, - ( - F.copy_to(F.tensor([], dtype), ctx), - F.copy_to(F.tensor([], dtype), ctx), - ), - ) - for etype in self.g.canonical_etypes - } - neg_pair_graph = heterograph( - neg_edges, - {ntype: self.g.num_nodes(ntype) for ntype in self.g.ntypes}, - ) - - pair_graph, neg_pair_graph = transforms.compact_graphs( - [pair_graph, neg_pair_graph] - ) - pair_graph.edata[EID] = induced_edges - - seed_nodes = pair_graph.ndata[NID] - - exclude_eids = _find_exclude_eids( - self.g_sampling, - self.exclude, - items, - reverse_eid_map=self.reverse_eids, - reverse_etype_map=self.reverse_etypes, - ) - - input_nodes, _, blocks = self.graph_sampler.sample_blocks( - self.g_sampling, seed_nodes, exclude_eids=exclude_eids - ) - - return input_nodes, pair_graph, neg_pair_graph, blocks - - def collate(self, items): - """Combines the sampled edges into a minibatch for edge classification, edge - regression, and link prediction tasks. - - Parameters - ---------- - items : list[int] or list[tuple[str, int]] - Either a list of edge IDs (for homogeneous graphs), or a list of edge type-ID - pairs (for heterogeneous graphs). - - Returns - ------- - Either ``(input_nodes, pair_graph, blocks)``, or - ``(input_nodes, pair_graph, negative_pair_graph, blocks)`` if negative sampling is - enabled. 
- - input_nodes : Tensor or dict[ntype, Tensor] - The input nodes necessary for computation in this minibatch. - - If the original graph has multiple node types, return a dictionary of - node type names and node ID tensors. Otherwise, return a single tensor. - pair_graph : DGLGraph - The graph that contains only the edges in the minibatch as well as their incident - nodes. - - Note that the metagraph of this graph will be identical to that of the original - graph. - negative_pair_graph : DGLGraph - The graph that contains only the edges connecting the source and destination nodes - yielded from the given negative sampler, if negative sampling is enabled. - - Note that the metagraph of this graph will be identical to that of the original - graph. - blocks : list[DGLGraph] - The list of MFGs necessary for computing the representation of the edges. - """ - if self.negative_sampler is None: - return self._collate(items) - else: - return self._collate_with_negative_sampling(items) - - -def _remove_kwargs_dist(kwargs): - if "num_workers" in kwargs: - del kwargs["num_workers"] - if "pin_memory" in kwargs: - del kwargs["pin_memory"] - print("Distributed DataLoaders do not support pin_memory.") - return kwargs - - -class DistNodeDataLoader(DistDataLoader): - """Sampled graph data loader over nodes for distributed graph storage. - - It wraps an iterable over a set of nodes, generating the list - of message flow graphs (MFGs) as computation dependency of the said minibatch, on - a distributed graph. - - All the arguments have the same meaning as the single-machine counterpart - :class:`dgl.dataloading.DataLoader` except the first argument - :attr:`g` which must be a :class:`dgl.distributed.DistGraph`. - - Parameters - ---------- - g : DistGraph - The distributed graph. - - nids, graph_sampler, device, kwargs : - See :class:`dgl.dataloading.DataLoader`. - - See also - -------- - dgl.dataloading.DataLoader - """ - - def __init__(self, g, nids, graph_sampler, device=None, **kwargs): - collator_kwargs = {} - dataloader_kwargs = {} - _collator_arglist = inspect.getfullargspec(NodeCollator).args - for k, v in kwargs.items(): - if k in _collator_arglist: - collator_kwargs[k] = v - else: - dataloader_kwargs[k] = v - if device is None: - # for the distributed case default to the CPU - device = "cpu" - assert ( - device == "cpu" - ), "Only cpu is supported in the case of a DistGraph." - # Distributed DataLoader currently does not support heterogeneous graphs - # and does not copy features. Fallback to normal solution - self.collator = NodeCollator(g, nids, graph_sampler, **collator_kwargs) - _remove_kwargs_dist(dataloader_kwargs) - super().__init__( - self.collator.dataset, - collate_fn=self.collator.collate, - **dataloader_kwargs - ) - self.device = device - - -class DistEdgeDataLoader(DistDataLoader): - """Sampled graph data loader over edges for distributed graph storage. - - It wraps an iterable over a set of edges, generating the list - of message flow graphs (MFGs) as computation dependency of the said minibatch for - edge classification, edge regression, and link prediction, on a distributed - graph. - - All the arguments have the same meaning as the single-machine counterpart - :class:`dgl.dataloading.DataLoader` except the first argument - :attr:`g` which must be a :class:`dgl.distributed.DistGraph`. - - Parameters - ---------- - g : DistGraph - The distributed graph. - - eids, graph_sampler, device, kwargs : - See :class:`dgl.dataloading.DataLoader`. 
- - See also - -------- - dgl.dataloading.DataLoader - """ - - def __init__(self, g, eids, graph_sampler, device=None, **kwargs): - collator_kwargs = {} - dataloader_kwargs = {} - _collator_arglist = inspect.getfullargspec(EdgeCollator).args - for k, v in kwargs.items(): - if k in _collator_arglist: - collator_kwargs[k] = v - else: - dataloader_kwargs[k] = v - - if device is None: - # for the distributed case default to the CPU - device = "cpu" - assert ( - device == "cpu" - ), "Only cpu is supported in the case of a DistGraph." - # Distributed DataLoader currently does not support heterogeneous graphs - # and does not copy features. Fallback to normal solution - self.collator = EdgeCollator(g, eids, graph_sampler, **collator_kwargs) - _remove_kwargs_dist(dataloader_kwargs) - super().__init__( - self.collator.dataset, - collate_fn=self.collator.collate, - **dataloader_kwargs - ) - - self.device = device diff --git a/python/dgl/distributed/__init__.py b/python/dgl/distributed/__init__.py index 1e841e46cbd2..b99abf350fa7 100644 --- a/python/dgl/distributed/__init__.py +++ b/python/dgl/distributed/__init__.py @@ -1,7 +1,12 @@ """DGL distributed module""" + from . import optim from .dist_context import exit_client, initialize -from .dist_dataloader import DistDataLoader +from .dist_dataloader import ( + DistDataLoader, + DistEdgeDataLoader, + DistNodeDataLoader, +) from .dist_graph import DistGraph, DistGraphServer, edge_split, node_split from .dist_tensor import DistTensor from .graph_partition_book import GraphPartitionBook, PartitionPolicy diff --git a/python/dgl/distributed/dist_dataloader.py b/python/dgl/distributed/dist_dataloader.py index 19c7d31e638b..e225b3818deb 100644 --- a/python/dgl/distributed/dist_dataloader.py +++ b/python/dgl/distributed/dist_dataloader.py @@ -1,9 +1,15 @@ # pylint: disable=global-variable-undefined, invalid-name """Multiprocess dataloader for distributed training""" -from .. import backend as F +import inspect +from abc import ABC, abstractmethod +from collections.abc import Mapping + +from .. import backend as F, transforms, utils +from ..base import EID, NID +from ..convert import heterograph from .dist_context import get_sampler_pool -__all__ = ["DistDataLoader"] +__all__ = ["DistDataLoader", "DistNodeDataLoader", "DistEdgeDataLoader"] DATALOADER_ID = 0 @@ -170,3 +176,696 @@ def _next_data(self): ret = [F.as_scalar(id) for id in ret] self.current_pos = end_pos return ret + + +# [Note] As implementation of ``dgl.distributed.DistDataLoader`` is independent +# of ``dgl.dataloading.DataLoader`` currently, dedicated collators are defined +# here instead of using ``dgl.dataloading.CollateWrapper``. + + +def _find_exclude_eids_with_reverse_id(g, eids, reverse_eid_map): + if isinstance(eids, Mapping): + eids = {g.to_canonical_etype(k): v for k, v in eids.items()} + exclude_eids = { + k: F.cat([v, F.gather_row(reverse_eid_map[k], v)], 0) + for k, v in eids.items() + } + else: + exclude_eids = F.cat([eids, F.gather_row(reverse_eid_map, eids)], 0) + return exclude_eids + + +def _find_exclude_eids_with_reverse_types(g, eids, reverse_etype_map): + exclude_eids = {g.to_canonical_etype(k): v for k, v in eids.items()} + reverse_etype_map = { + g.to_canonical_etype(k): g.to_canonical_etype(v) + for k, v in reverse_etype_map.items() + } + exclude_eids.update( + {reverse_etype_map[k]: v for k, v in exclude_eids.items()} + ) + return exclude_eids + + +def _find_exclude_eids(g, exclude_mode, eids, **kwargs): + """Find all edge IDs to exclude according to :attr:`exclude_mode`. 
+ + Parameters + ---------- + g : DGLGraph + The graph. + exclude_mode : str, optional + Can be either of the following, + + None (default) + Does not exclude any edge. + + 'self' + Exclude the given edges themselves but nothing else. + + 'reverse_id' + Exclude all edges specified in ``eids``, as well as their reverse edges + of the same edge type. + + The mapping from each edge ID to its reverse edge ID is specified in + the keyword argument ``reverse_eid_map``. + + This mode assumes that the reverse of an edge with ID ``e`` and type + ``etype`` will have ID ``reverse_eid_map[e]`` and type ``etype``. + + 'reverse_types' + Exclude all edges specified in ``eids``, as well as their reverse + edges of the corresponding edge types. + + The mapping from each edge type to its reverse edge type is specified + in the keyword argument ``reverse_etype_map``. + + This mode assumes that the reverse of an edge with ID ``e`` and type ``etype`` + will have ID ``e`` and type ``reverse_etype_map[etype]``. + eids : Tensor or dict[etype, Tensor] + The edge IDs. + reverse_eid_map : Tensor or dict[etype, Tensor] + The mapping from edge ID to its reverse edge ID. + reverse_etype_map : dict[etype, etype] + The mapping from edge etype to its reverse edge type. + """ + if exclude_mode is None: + return None + elif exclude_mode == "self": + if isinstance(eids, Mapping): + eids = {g.to_canonical_etype(k): v for k, v in eids.items()} + return eids + elif exclude_mode == "reverse_id": + return _find_exclude_eids_with_reverse_id( + g, eids, kwargs["reverse_eid_map"] + ) + elif exclude_mode == "reverse_types": + return _find_exclude_eids_with_reverse_types( + g, eids, kwargs["reverse_etype_map"] + ) + else: + raise ValueError("unsupported mode {}".format(exclude_mode)) + + +class Collator(ABC): + """Abstract DGL collator for training GNNs on downstream tasks stochastically. + + Provides a :attr:`dataset` object containing the collection of all nodes or edges, + as well as a :attr:`collate` method that combines a set of items from + :attr:`dataset` and obtains the message flow graphs (MFGs). + + Notes + ----- + For the concept of MFGs, please refer to + :ref:`User Guide Section 6 ` and + :doc:`Minibatch Training Tutorials `. + """ + + @property + @abstractmethod + def dataset(self): + """Returns the dataset object of the collator.""" + raise NotImplementedError + + @abstractmethod + def collate(self, items): + """Combines the items from the dataset object and obtains the list of MFGs. + + Parameters + ---------- + items : list[str, int] + The list of node or edge IDs or type-ID pairs. + + Notes + ----- + For the concept of MFGs, please refer to + :ref:`User Guide Section 6 ` and + :doc:`Minibatch Training Tutorials `. + """ + raise NotImplementedError + + @staticmethod + def add_edge_attribute_to_graph(g, data_name): + """Add data into the graph as an edge attribute. + + For some cases such as prob/mask-based sampling on GraphBolt partitions, + we need to prepare such data beforehand. This is because data are + usually saved in DistGraph.ndata/edata, but such data is not in the + format that GraphBolt partitions require. And in GraphBolt, such data + are saved as edge attributes. So we need to add such data into the graph + before any sampling is kicked off. + + Parameters + ---------- + g : DistGraph + The graph. + data_name : str + The name of data that's stored in DistGraph.ndata/edata. 
+ """ + if g._use_graphbolt and data_name: + g.add_edge_attribute(data_name) + + +class NodeCollator(Collator): + """DGL collator to combine nodes and their computation dependencies within a minibatch for + training node classification or regression on a single graph with neighborhood sampling. + + Parameters + ---------- + g : DGLGraph + The graph. + nids : Tensor or dict[ntype, Tensor] + The node set to compute outputs. + graph_sampler : dgl.dataloading.BlockSampler + The neighborhood sampler. + + Examples + -------- + To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on + a homogeneous graph where each node takes messages from all neighbors (assume + the backend is PyTorch): + + >>> sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5]) + >>> collator = dgl.dataloading.NodeCollator(g, train_nid, sampler) + >>> dataloader = torch.utils.data.DataLoader( + ... collator.dataset, collate_fn=collator.collate, + ... batch_size=1024, shuffle=True, drop_last=False, num_workers=4) + >>> for input_nodes, output_nodes, blocks in dataloader: + ... train_on(input_nodes, output_nodes, blocks) + + Notes + ----- + For the concept of MFGs, please refer to + :ref:`User Guide Section 6 ` and + :doc:`Minibatch Training Tutorials `. + """ + + def __init__(self, g, nids, graph_sampler): + self.g = g + if not isinstance(nids, Mapping): + assert ( + len(g.ntypes) == 1 + ), "nids should be a dict of node type and ids for graph with multiple node types" + self.graph_sampler = graph_sampler + + self.nids = utils.prepare_tensor_or_dict(g, nids, "nids") + self._dataset = utils.maybe_flatten_dict(self.nids) + + # Add prob/mask into graphbolt partition's edge attributes if needed. + Collator.add_edge_attribute_to_graph(self.g, self.graph_sampler.prob) + + @property + def dataset(self): + return self._dataset + + def collate(self, items): + """Find the list of MFGs necessary for computing the representation of given + nodes for a node classification/regression task. + + Parameters + ---------- + items : list[int] or list[tuple[str, int]] + Either a list of node IDs (for homogeneous graphs), or a list of node type-ID + pairs (for heterogeneous graphs). + + Returns + ------- + input_nodes : Tensor or dict[ntype, Tensor] + The input nodes necessary for computation in this minibatch. + + If the original graph has multiple node types, return a dictionary of + node type names and node ID tensors. Otherwise, return a single tensor. + output_nodes : Tensor or dict[ntype, Tensor] + The nodes whose representations are to be computed in this minibatch. + + If the original graph has multiple node types, return a dictionary of + node type names and node ID tensors. Otherwise, return a single tensor. + MFGs : list[DGLGraph] + The list of MFGs necessary for computing the representation. + """ + if isinstance(items[0], tuple): + # returns a list of pairs: group them by node types into a dict + items = utils.group_as_dict(items) + items = utils.prepare_tensor_or_dict(self.g, items, "items") + + input_nodes, output_nodes, blocks = self.graph_sampler.sample_blocks( + self.g, items + ) + + return input_nodes, output_nodes, blocks + + +class EdgeCollator(Collator): + """DGL collator to combine edges and their computation dependencies within a minibatch for + training edge classification, edge regression, or link prediction on a single graph + with neighborhood sampling. 
+ + Given a set of edges, the collate function will yield + + * A tensor of input nodes necessary for computing the representation on edges, or + a dictionary of node type names and such tensors. + + * A subgraph that contains only the edges in the minibatch and their incident nodes. + Note that the graph has an identical metagraph with the original graph. + + * If a negative sampler is given, another graph that contains the "negative edges", + connecting the source and destination nodes yielded from the given negative sampler. + + * A list of MFGs necessary for computing the representation of the incident nodes + of the edges in the minibatch. + + Parameters + ---------- + g : DGLGraph + The graph from which the edges are iterated in minibatches and the subgraphs + are generated. + eids : Tensor or dict[etype, Tensor] + The edge set in graph :attr:`g` to compute outputs. + graph_sampler : dgl.dataloading.BlockSampler + The neighborhood sampler. + g_sampling : DGLGraph, optional + The graph where neighborhood sampling and message passing is performed. + + Note that this is not necessarily the same as :attr:`g`. + + If None, assume to be the same as :attr:`g`. + exclude : str, optional + Whether and how to exclude dependencies related to the sampled edges in the + minibatch. Possible values are + + * None, which excludes nothing. + + * ``'self'``, which excludes the sampled edges themselves but nothing else. + + * ``'reverse_id'``, which excludes the reverse edges of the sampled edges. The said + reverse edges have the same edge type as the sampled edges. Only works + on edge types whose source node type is the same as its destination node type. + + * ``'reverse_types'``, which excludes the reverse edges of the sampled edges. The + said reverse edges have different edge types from the sampled edges. + + If ``g_sampling`` is given, ``exclude`` is ignored and will be always ``None``. + reverse_eids : Tensor or dict[etype, Tensor], optional + A tensor of reverse edge ID mapping. The i-th element indicates the ID of + the i-th edge's reverse edge. + + If the graph is heterogeneous, this argument requires a dictionary of edge + types and the reverse edge ID mapping tensors. + + Required and only used when ``exclude`` is set to ``reverse_id``. + + For heterogeneous graph this will be a dict of edge type and edge IDs. Note that + only the edge types whose source node type is the same as destination node type + are needed. + reverse_etypes : dict[etype, etype], optional + The mapping from the edge type to its reverse edge type. + + Required and only used when ``exclude`` is set to ``reverse_types``. + negative_sampler : callable, optional + The negative sampler. Can be omitted if no negative sampling is needed. + + The negative sampler must be a callable that takes in the following arguments: + + * The original (heterogeneous) graph. + + * The ID array of sampled edges in the minibatch, or the dictionary of edge + types and ID array of sampled edges in the minibatch if the graph is + heterogeneous. + + It should return + + * A pair of source and destination node ID arrays as negative samples, + or a dictionary of edge types and such pairs if the graph is heterogenenous. + + A set of builtin negative samplers are provided in + :ref:`the negative sampling module `. + + Examples + -------- + The following example shows how to train a 3-layer GNN for edge classification on a + set of edges ``train_eid`` on a homogeneous undirected graph. Each node takes + messages from all neighbors. 
+ + Say that you have an array of source node IDs ``src`` and another array of destination + node IDs ``dst``. One can make it bidirectional by adding another set of edges + that connects from ``dst`` to ``src``: + + >>> g = dgl.graph((torch.cat([src, dst]), torch.cat([dst, src]))) + + One can then know that the ID difference of an edge and its reverse edge is ``|E|``, + where ``|E|`` is the length of your source/destination array. The reverse edge + mapping can be obtained by + + >>> E = len(src) + >>> reverse_eids = torch.cat([torch.arange(E, 2 * E), torch.arange(0, E)]) + + Note that the sampled edges as well as their reverse edges are removed from + computation dependencies of the incident nodes. This is a common trick to avoid + information leakage. + + >>> sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5]) + >>> collator = dgl.dataloading.EdgeCollator( + ... g, train_eid, sampler, exclude='reverse_id', + ... reverse_eids=reverse_eids) + >>> dataloader = torch.utils.data.DataLoader( + ... collator.dataset, collate_fn=collator.collate, + ... batch_size=1024, shuffle=True, drop_last=False, num_workers=4) + >>> for input_nodes, pair_graph, blocks in dataloader: + ... train_on(input_nodes, pair_graph, blocks) + + To train a 3-layer GNN for link prediction on a set of edges ``train_eid`` on a + homogeneous graph where each node takes messages from all neighbors (assume the + backend is PyTorch), with 5 uniformly chosen negative samples per edge: + + >>> sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5]) + >>> neg_sampler = dgl.dataloading.negative_sampler.Uniform(5) + >>> collator = dgl.dataloading.EdgeCollator( + ... g, train_eid, sampler, exclude='reverse_id', + ... reverse_eids=reverse_eids, negative_sampler=neg_sampler) + >>> dataloader = torch.utils.data.DataLoader( + ... collator.dataset, collate_fn=collator.collate, + ... batch_size=1024, shuffle=True, drop_last=False, num_workers=4) + >>> for input_nodes, pos_pair_graph, neg_pair_graph, blocks in dataloader: + ... train_on(input_nodse, pair_graph, neg_pair_graph, blocks) + + For heterogeneous graphs, the reverse of an edge may have a different edge type + from the original edge. For instance, consider that you have an array of + user-item clicks, representated by a user array ``user`` and an item array ``item``. + You may want to build a heterogeneous graph with a user-click-item relation and an + item-clicked-by-user relation. + + >>> g = dgl.heterograph({ + ... ('user', 'click', 'item'): (user, item), + ... ('item', 'clicked-by', 'user'): (item, user)}) + + To train a 3-layer GNN for edge classification on a set of edges ``train_eid`` with + type ``click``, you can write + + >>> sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5]) + >>> collator = dgl.dataloading.EdgeCollator( + ... g, {'click': train_eid}, sampler, exclude='reverse_types', + ... reverse_etypes={'click': 'clicked-by', 'clicked-by': 'click'}) + >>> dataloader = torch.utils.data.DataLoader( + ... collator.dataset, collate_fn=collator.collate, + ... batch_size=1024, shuffle=True, drop_last=False, num_workers=4) + >>> for input_nodes, pair_graph, blocks in dataloader: + ... train_on(input_nodes, pair_graph, blocks) + + To train a 3-layer GNN for link prediction on a set of edges ``train_eid`` with type + ``click``, you can write + + >>> sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5]) + >>> neg_sampler = dgl.dataloading.negative_sampler.Uniform(5) + >>> collator = dgl.dataloading.EdgeCollator( + ... 
g, train_eid, sampler, exclude='reverse_types', + ... reverse_etypes={'click': 'clicked-by', 'clicked-by': 'click'}, + ... negative_sampler=neg_sampler) + >>> dataloader = torch.utils.data.DataLoader( + ... collator.dataset, collate_fn=collator.collate, + ... batch_size=1024, shuffle=True, drop_last=False, num_workers=4) + >>> for input_nodes, pos_pair_graph, neg_pair_graph, blocks in dataloader: + ... train_on(input_nodes, pair_graph, neg_pair_graph, blocks) + + Notes + ----- + For the concept of MFGs, please refer to + :ref:`User Guide Section 6 ` and + :doc:`Minibatch Training Tutorials `. + """ + + def __init__( + self, + g, + eids, + graph_sampler, + g_sampling=None, + exclude=None, + reverse_eids=None, + reverse_etypes=None, + negative_sampler=None, + ): + self.g = g + if not isinstance(eids, Mapping): + assert ( + len(g.etypes) == 1 + ), "eids should be a dict of etype and ids for graph with multiple etypes" + self.graph_sampler = graph_sampler + + # One may wish to iterate over the edges in one graph while perform sampling in + # another graph. This may be the case for iterating over validation and test + # edge set while perform neighborhood sampling on the graph formed by only + # the training edge set. + # See GCMC for an example usage. + if g_sampling is not None: + self.g_sampling = g_sampling + self.exclude = None + else: + self.g_sampling = self.g + self.exclude = exclude + + self.reverse_eids = reverse_eids + self.reverse_etypes = reverse_etypes + self.negative_sampler = negative_sampler + + self.eids = utils.prepare_tensor_or_dict(g, eids, "eids") + self._dataset = utils.maybe_flatten_dict(self.eids) + + # Add prob/mask into graphbolt partition's edge attributes if needed. + Collator.add_edge_attribute_to_graph(self.g, self.graph_sampler.prob) + + @property + def dataset(self): + return self._dataset + + def _collate(self, items): + if isinstance(items[0], tuple): + # returns a list of pairs: group them by node types into a dict + items = utils.group_as_dict(items) + items = utils.prepare_tensor_or_dict(self.g_sampling, items, "items") + + pair_graph = self.g.edge_subgraph(items) + seed_nodes = pair_graph.ndata[NID] + + exclude_eids = _find_exclude_eids( + self.g_sampling, + self.exclude, + items, + reverse_eid_map=self.reverse_eids, + reverse_etype_map=self.reverse_etypes, + ) + + input_nodes, _, blocks = self.graph_sampler.sample_blocks( + self.g_sampling, seed_nodes, exclude_eids=exclude_eids + ) + + return input_nodes, pair_graph, blocks + + def _collate_with_negative_sampling(self, items): + if isinstance(items[0], tuple): + # returns a list of pairs: group them by node types into a dict + items = utils.group_as_dict(items) + items = utils.prepare_tensor_or_dict(self.g_sampling, items, "items") + + pair_graph = self.g.edge_subgraph(items, relabel_nodes=False) + induced_edges = pair_graph.edata[EID] + + neg_srcdst = self.negative_sampler(self.g, items) + if not isinstance(neg_srcdst, Mapping): + assert len(self.g.etypes) == 1, ( + "graph has multiple or no edge types; " + "please return a dict in negative sampler." 
+ ) + neg_srcdst = {self.g.canonical_etypes[0]: neg_srcdst} + # Get dtype from a tuple of tensors + dtype = F.dtype(list(neg_srcdst.values())[0][0]) + ctx = F.context(pair_graph) + neg_edges = { + etype: neg_srcdst.get( + etype, + ( + F.copy_to(F.tensor([], dtype), ctx), + F.copy_to(F.tensor([], dtype), ctx), + ), + ) + for etype in self.g.canonical_etypes + } + neg_pair_graph = heterograph( + neg_edges, + {ntype: self.g.num_nodes(ntype) for ntype in self.g.ntypes}, + ) + + pair_graph, neg_pair_graph = transforms.compact_graphs( + [pair_graph, neg_pair_graph] + ) + pair_graph.edata[EID] = induced_edges + + seed_nodes = pair_graph.ndata[NID] + + exclude_eids = _find_exclude_eids( + self.g_sampling, + self.exclude, + items, + reverse_eid_map=self.reverse_eids, + reverse_etype_map=self.reverse_etypes, + ) + + input_nodes, _, blocks = self.graph_sampler.sample_blocks( + self.g_sampling, seed_nodes, exclude_eids=exclude_eids + ) + + return input_nodes, pair_graph, neg_pair_graph, blocks + + def collate(self, items): + """Combines the sampled edges into a minibatch for edge classification, edge + regression, and link prediction tasks. + + Parameters + ---------- + items : list[int] or list[tuple[str, int]] + Either a list of edge IDs (for homogeneous graphs), or a list of edge type-ID + pairs (for heterogeneous graphs). + + Returns + ------- + Either ``(input_nodes, pair_graph, blocks)``, or + ``(input_nodes, pair_graph, negative_pair_graph, blocks)`` if negative sampling is + enabled. + + input_nodes : Tensor or dict[ntype, Tensor] + The input nodes necessary for computation in this minibatch. + + If the original graph has multiple node types, return a dictionary of + node type names and node ID tensors. Otherwise, return a single tensor. + pair_graph : DGLGraph + The graph that contains only the edges in the minibatch as well as their incident + nodes. + + Note that the metagraph of this graph will be identical to that of the original + graph. + negative_pair_graph : DGLGraph + The graph that contains only the edges connecting the source and destination nodes + yielded from the given negative sampler, if negative sampling is enabled. + + Note that the metagraph of this graph will be identical to that of the original + graph. + blocks : list[DGLGraph] + The list of MFGs necessary for computing the representation of the edges. + """ + if self.negative_sampler is None: + return self._collate(items) + else: + return self._collate_with_negative_sampling(items) + + +def _remove_kwargs_dist(kwargs): + if "num_workers" in kwargs: + del kwargs["num_workers"] + if "pin_memory" in kwargs: + del kwargs["pin_memory"] + print("Distributed DataLoaders do not support pin_memory.") + return kwargs + + +class DistNodeDataLoader(DistDataLoader): + """Sampled graph data loader over nodes for distributed graph storage. + + It wraps an iterable over a set of nodes, generating the list + of message flow graphs (MFGs) as computation dependency of the said minibatch, on + a distributed graph. + + All the arguments have the same meaning as the single-machine counterpart + :class:`dgl.dataloading.DataLoader` except the first argument + :attr:`g` which must be a :class:`dgl.distributed.DistGraph`. + + Parameters + ---------- + g : DistGraph + The distributed graph. + + nids, graph_sampler, device, kwargs : + See :class:`dgl.dataloading.DataLoader`. 
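A minimal usage sketch for this loader (doctest-style, assuming ``dgl.distributed.initialize()`` has already been called, ``g`` is a :class:`~dgl.distributed.DistGraph`, and ``train_nid`` is a node-ID tensor; fanouts and batch size are illustrative):

>>> sampler = dgl.dataloading.NeighborSampler([10, 25])
>>> dataloader = dgl.distributed.DistNodeDataLoader(
...     g, train_nid, sampler, batch_size=1024, shuffle=True)
>>> for input_nodes, output_nodes, blocks in dataloader:
...     pass  # run the forward/backward pass on the sampled blocks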
+ + See also + -------- + dgl.dataloading.DataLoader + """ + + def __init__(self, g, nids, graph_sampler, device=None, **kwargs): + collator_kwargs = {} + dataloader_kwargs = {} + _collator_arglist = inspect.getfullargspec(NodeCollator).args + for k, v in kwargs.items(): + if k in _collator_arglist: + collator_kwargs[k] = v + else: + dataloader_kwargs[k] = v + if device is None: + # for the distributed case default to the CPU + device = "cpu" + assert ( + device == "cpu" + ), "Only cpu is supported in the case of a DistGraph." + # Distributed DataLoader currently does not support heterogeneous graphs + # and does not copy features. Fallback to normal solution + self.collator = NodeCollator(g, nids, graph_sampler, **collator_kwargs) + _remove_kwargs_dist(dataloader_kwargs) + super().__init__( + self.collator.dataset, + collate_fn=self.collator.collate, + **dataloader_kwargs + ) + self.device = device + + +class DistEdgeDataLoader(DistDataLoader): + """Sampled graph data loader over edges for distributed graph storage. + + It wraps an iterable over a set of edges, generating the list + of message flow graphs (MFGs) as computation dependency of the said minibatch for + edge classification, edge regression, and link prediction, on a distributed + graph. + + All the arguments have the same meaning as the single-machine counterpart + :class:`dgl.dataloading.DataLoader` except the first argument + :attr:`g` which must be a :class:`dgl.distributed.DistGraph`. + + Parameters + ---------- + g : DistGraph + The distributed graph. + + eids, graph_sampler, device, kwargs : + See :class:`dgl.dataloading.DataLoader`. + + See also + -------- + dgl.dataloading.DataLoader + """ + + def __init__(self, g, eids, graph_sampler, device=None, **kwargs): + collator_kwargs = {} + dataloader_kwargs = {} + _collator_arglist = inspect.getfullargspec(EdgeCollator).args + for k, v in kwargs.items(): + if k in _collator_arglist: + collator_kwargs[k] = v + else: + dataloader_kwargs[k] = v + + if device is None: + # for the distributed case default to the CPU + device = "cpu" + assert ( + device == "cpu" + ), "Only cpu is supported in the case of a DistGraph." + # Distributed DataLoader currently does not support heterogeneous graphs + # and does not copy features. Fallback to normal solution + self.collator = EdgeCollator(g, eids, graph_sampler, **collator_kwargs) + _remove_kwargs_dist(dataloader_kwargs) + super().__init__( + self.collator.dataset, + collate_fn=self.collator.collate, + **dataloader_kwargs + ) + + self.device = device diff --git a/tests/distributed/test_mp_dataloader.py b/tests/distributed/test_mp_dataloader.py index 4d90cb7989be..e031481a8910 100644 --- a/tests/distributed/test_mp_dataloader.py +++ b/tests/distributed/test_mp_dataloader.py @@ -260,7 +260,7 @@ def start_dist_neg_dataloader( num_negs = 5 sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10]) negative_sampler = dgl.dataloading.negative_sampler.Uniform(num_negs) - dataloader = dgl.dataloading.DistEdgeDataLoader( + dataloader = dgl.distributed.DistEdgeDataLoader( dist_graph, train_eid, sampler, @@ -457,6 +457,7 @@ def start_node_dataloader( use_graphbolt=False, return_eids=False, prob_or_mask=None, + use_deprecated_dataloader=False, ): dgl.distributed.initialize(ip_config, use_graphbolt=use_graphbolt) gpb = None @@ -514,7 +515,12 @@ def start_node_dataloader( # We need to test creating DistDataLoader multiple times. 
for i in range(2): # Create DataLoader for constructing blocks - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader_cls = ( + dgl.dataloading.DistNodeDataLoader + if use_deprecated_dataloader + else dgl.distributed.DistNodeDataLoader + ) + dataloader = dataloader_cls( dist_graph, train_nid, sampler, @@ -577,6 +583,7 @@ def start_edge_dataloader( reverse_etypes, negative, prob_or_mask, + use_deprecated_dataloader=False, ): dgl.distributed.initialize(ip_config, use_graphbolt=use_graphbolt) gpb = None @@ -622,7 +629,12 @@ def start_edge_dataloader( # We need to test creating DistDataLoader multiple times. for i in range(2): # Create DataLoader for constructing blocks - dataloader = dgl.dataloading.DistEdgeDataLoader( + dataloader_cls = ( + dgl.dataloading.DistEdgeDataLoader + if use_deprecated_dataloader + else dgl.distributed.DistEdgeDataLoader + ) + dataloader = dataloader_cls( dist_graph, train_eid, sampler, @@ -766,6 +778,7 @@ def check_dataloader( reverse_etypes=None, negative=False, prob_or_mask=None, + use_deprecated_dataloader=False, ): with tempfile.TemporaryDirectory() as test_dir: ip_config = "ip_config.txt" @@ -827,6 +840,7 @@ def check_dataloader( use_graphbolt, return_eids, prob_or_mask, + use_deprecated_dataloader, ), ) p.start() @@ -849,6 +863,7 @@ def check_dataloader( reverse_etypes, negative, prob_or_mask, + use_deprecated_dataloader, ), ) p.start() @@ -1063,11 +1078,11 @@ def start_multiple_dataloaders( dl_iters = [] for _ in range(num_dataloaders): if dataloader_type == "node": - dataloader = dgl.dataloading.DistNodeDataLoader( + dataloader = dgl.distributed.DistNodeDataLoader( dist_g, train_ids, sampler, batch_size=batch_size ) else: - dataloader = dgl.dataloading.DistEdgeDataLoader( + dataloader = dgl.distributed.DistEdgeDataLoader( dist_g, train_ids, sampler, batch_size=batch_size ) dataloaders.append(dataloader) @@ -1150,3 +1165,16 @@ def test_multiple_dist_dataloaders( p.join() assert p.exitcode == 0 reset_envs() + + +@pytest.mark.parametrize("dataloader_type", ["node", "edge"]) +def test_deprecated_dataloader(dataloader_type): + reset_envs() + g = CitationGraphDataset("cora")[0] + check_dataloader( + g, + 1, + 0, + dataloader_type, + use_deprecated_dataloader=True, + ) From 03e83ac5a8a69a6528c91d57a940afd86c3a5cd1 Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:59:06 +0800 Subject: [PATCH 60/78] [dev] make sure graphbolt is not imported together with dgl (#7756) --- tests/python/test_dgl_import.py | 10 ++++++++++ tests/scripts/task_unit_test.sh | 1 + 2 files changed, 11 insertions(+) create mode 100644 tests/python/test_dgl_import.py diff --git a/tests/python/test_dgl_import.py b/tests/python/test_dgl_import.py new file mode 100644 index 000000000000..3c4b5ab84e0c --- /dev/null +++ b/tests/python/test_dgl_import.py @@ -0,0 +1,10 @@ +import sys + + +def test_graphbolt_is_not_imported(): + assert ( + "dgl.graphbolt" not in sys.modules + ), "dgl.graphbolt is already imported" + import dgl + + assert "dgl.graphbolt" not in sys.modules, "dgl.graphbolt is imported" diff --git a/tests/scripts/task_unit_test.sh b/tests/scripts/task_unit_test.sh index 2dae1594a4ba..5d74ffd84cc1 100644 --- a/tests/scripts/task_unit_test.sh +++ b/tests/scripts/task_unit_test.sh @@ -39,6 +39,7 @@ if [ $DGLBACKEND == "mxnet" ] then python3 -m pytest -v --junitxml=pytest_compute.xml --durations=100 --ignore=tests/python/common/test_ffi.py tests/python/common || fail "common" else + python3 -m pytest -v 
--junitxml=pytest_dgl_import.xml tests/python/test_dgl_import.py || fail "dgl_import" python3 -m pytest -v --junitxml=pytest_common.xml --durations=100 tests/python/common || fail "common" fi python3 -m pytest -v --junitxml=pytest_backend.xml --durations=100 tests/python/$DGLBACKEND || fail "backend-specific" From d6cf415cbb7c3a7a9771e0b5b4f67457f977c638 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 29 Aug 2024 19:11:34 -0400 Subject: [PATCH 61/78] [GraphBolt][CUDA] Eliminate synchronization from exclude edges. (#7757) --- examples/graphbolt/link_prediction.py | 5 +- examples/graphbolt/pyg/link_prediction.py | 5 +- graphbolt/include/graphbolt/cuda_ops.h | 14 ++++- graphbolt/include/graphbolt/isin.h | 19 ++++++- graphbolt/src/cuda/isin.cu | 22 +++++++ graphbolt/src/isin.cc | 17 ++++++ graphbolt/src/python_binding.cc | 2 + python/dgl/graphbolt/external_utils.py | 60 +++++++++++++++++++- python/dgl/graphbolt/sampled_subgraph.py | 40 +++++++++++-- tests/python/pytorch/graphbolt/test_utils.py | 21 +++++-- 10 files changed, 189 insertions(+), 16 deletions(-) diff --git a/examples/graphbolt/link_prediction.py b/examples/graphbolt/link_prediction.py index 60d48b57fc2d..cdd7440901f1 100644 --- a/examples/graphbolt/link_prediction.py +++ b/examples/graphbolt/link_prediction.py @@ -202,8 +202,9 @@ def create_dataloader(args, graph, features, itemset, is_train=True): # the negative samples. ############################################################################ if is_train and args.exclude_edges: - datapipe = datapipe.transform( - partial(gb.exclude_seed_edges, include_reverse_edges=True) + datapipe = datapipe.exclude_seed_edges( + include_reverse_edges=True, + asynchronous=args.storage_device != "cpu", ) ############################################################################ diff --git a/examples/graphbolt/pyg/link_prediction.py b/examples/graphbolt/pyg/link_prediction.py index 4c2b05fd410e..5dc782d9ff4d 100644 --- a/examples/graphbolt/pyg/link_prediction.py +++ b/examples/graphbolt/pyg/link_prediction.py @@ -163,8 +163,9 @@ def create_dataloader( asynchronous=args.graph_device != "cpu", ) if job == "train" and args.exclude_edges: - datapipe = datapipe.transform( - partial(gb.exclude_seed_edges, include_reverse_edges=True) + datapipe = datapipe.exclude_seed_edges( + include_reverse_edges=True, + asynchronous=args.graph_device != "cpu", ) # Copy the data to the specified device. if args.feature_device != "cpu" and need_copy: diff --git a/graphbolt/include/graphbolt/cuda_ops.h b/graphbolt/include/graphbolt/cuda_ops.h index e7f2f60721b4..91cd1a10c652 100644 --- a/graphbolt/include/graphbolt/cuda_ops.h +++ b/graphbolt/include/graphbolt/cuda_ops.h @@ -79,10 +79,22 @@ Sort(torch::Tensor input, int num_bits = 0); * @return * A boolean tensor of the same shape as elements that is True for elements * in test_elements and False otherwise. - * */ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements); +/** + * @brief Returns the indexes of the nonzero elements in the given boolean mask + * if logical_not is false. Otherwise, returns the indexes of the zero elements + * instead. + * + * @param mask Input boolean mask. + * @param logical_not Whether mask should be treated as ~mask. + * + * @return An int64_t tensor of the same shape as mask containing the indexes + * of the selected elements. + */ +torch::Tensor Nonzero(torch::Tensor mask, bool logical_not); + /** * @brief Select columns for a sparse matrix in a CSC format according to nodes * tensor. 
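For reference, the contract of the `Nonzero(mask, logical_not)` operator declared above can be sketched on the host side with plain PyTorch. This is an illustrative sketch only, not part of the patch; the helper name `nonzero_reference` and the sample tensor are assumptions made purely for the example.

    import torch

    def nonzero_reference(mask: torch.Tensor, logical_not: bool = False) -> torch.Tensor:
        # Indexes of the True elements of `mask` when logical_not is False,
        # and indexes of the False elements otherwise, as an int64 tensor.
        if logical_not:
            mask = ~mask
        return torch.nonzero(mask, as_tuple=True)[0]

    # Mirrors how IsNotInIndex later in this patch selects the positions of
    # elements that are NOT found in the exclusion set.
    mask = torch.tensor([True, False, True, False])
    assert nonzero_reference(mask, logical_not=True).tolist() == [1, 3]

The device implementation added below in isin.cu realizes the same contract with cub::DeviceSelect::Flagged / FlaggedIf.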
diff --git a/graphbolt/include/graphbolt/isin.h b/graphbolt/include/graphbolt/isin.h index 0b472858ecf6..4e52b429988f 100644 --- a/graphbolt/include/graphbolt/isin.h +++ b/graphbolt/include/graphbolt/isin.h @@ -7,6 +7,7 @@ #ifndef GRAPHBOLT_ISIN_H_ #define GRAPHBOLT_ISIN_H_ +#include #include namespace graphbolt { @@ -25,11 +26,27 @@ namespace sampling { * @return * A boolean tensor of the same shape as elements that is True for elements * in test_elements and False otherwise. - * */ torch::Tensor IsIn( const torch::Tensor& elements, const torch::Tensor& test_elements); +/** + * @brief Tests if each element of elements is not in test_elements. Returns an + * int64_t tensor of the same shape as elements containing the indexes of the + * elements not found in test_elements. + * + * @param elements Input elements + * @param test_elements Values against which to test for each input element. + * + * @return An int64_t tensor of the same shape as elements containing indexes of + * elements not found in test_elements. + */ +torch::Tensor IsNotInIndex( + const torch::Tensor& elements, const torch::Tensor& test_elements); + +c10::intrusive_ptr> IsNotInIndexAsync( + const torch::Tensor& elements, const torch::Tensor& test_elements); + } // namespace sampling } // namespace graphbolt diff --git a/graphbolt/src/cuda/isin.cu b/graphbolt/src/cuda/isin.cu index af773934415e..aa2c724d2535 100644 --- a/graphbolt/src/cuda/isin.cu +++ b/graphbolt/src/cuda/isin.cu @@ -20,6 +20,8 @@ #include #include +#include + #include "./common.h" namespace graphbolt { @@ -42,5 +44,25 @@ torch::Tensor IsIn(torch::Tensor elements, torch::Tensor test_elements) { return result; } +torch::Tensor Nonzero(torch::Tensor mask, bool logical_not) { + thrust::counting_iterator iota(0); + auto result = torch::empty_like(mask, torch::kInt64); + auto mask_ptr = mask.data_ptr(); + auto result_ptr = result.data_ptr(); + auto allocator = cuda::GetAllocator(); + auto num_copied = allocator.AllocateStorage(1); + if (logical_not) { + CUB_CALL( + DeviceSelect::FlaggedIf, iota, mask_ptr, result_ptr, num_copied.get(), + mask.numel(), thrust::logical_not{}); + } else { + CUB_CALL( + DeviceSelect::Flagged, iota, mask_ptr, result_ptr, num_copied.get(), + mask.numel()); + } + cuda::CopyScalar num_copied_cpu(num_copied.get()); + return result.slice(0, 0, static_cast(num_copied_cpu)); +} + } // namespace ops } // namespace graphbolt diff --git a/graphbolt/src/isin.cc b/graphbolt/src/isin.cc index c41b839b1651..76cbf1f8d0f1 100644 --- a/graphbolt/src/isin.cc +++ b/graphbolt/src/isin.cc @@ -56,5 +56,22 @@ torch::Tensor IsIn( return IsInCPU(elements, test_elements); } } + +torch::Tensor IsNotInIndex( + const torch::Tensor& elements, const torch::Tensor& test_elements) { + auto mask = IsIn(elements, test_elements); + if (utils::is_on_gpu(mask)) { + GRAPHBOLT_DISPATCH_CUDA_ONLY_DEVICE( + c10::DeviceType::CUDA, "NonzeroOperation", + { return ops::Nonzero(mask, true); }); + } + return torch::nonzero(torch::logical_not(mask)).squeeze(1); +} + +c10::intrusive_ptr> IsNotInIndexAsync( + const torch::Tensor& elements, const torch::Tensor& test_elements) { + return async([=] { return IsNotInIndex(elements, test_elements); }); +} + } // namespace sampling } // namespace graphbolt diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index 4df395b0f904..ea2b543761cf 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -181,6 +181,8 @@ TORCH_LIBRARY(graphbolt, m) { m.def("unique_and_compact_batched", 
&UniqueAndCompactBatched); m.def("unique_and_compact_batched_async", &UniqueAndCompactBatchedAsync); m.def("isin", &IsIn); + m.def("is_not_in_index", &IsNotInIndex); + m.def("is_not_in_index_async", &IsNotInIndexAsync); m.def("index_select", &ops::IndexSelect); m.def("index_select_async", &ops::IndexSelectAsync); m.def("scatter_async", &ops::ScatterAsync); diff --git a/python/dgl/graphbolt/external_utils.py b/python/dgl/graphbolt/external_utils.py index 98ddc310d213..89737fbe6dc3 100644 --- a/python/dgl/graphbolt/external_utils.py +++ b/python/dgl/graphbolt/external_utils.py @@ -1,10 +1,60 @@ """Utility functions for external use.""" - +from functools import partial from typing import Dict, Union import torch +from torch.utils.data import functional_datapipe + from .minibatch import MiniBatch +from .minibatch_transformer import MiniBatchTransformer + + +@functional_datapipe("exclude_seed_edges") +class SeedEdgesExcluder(MiniBatchTransformer): + """A mini-batch transformer used to manipulate mini-batch. + + Functional name: :obj:`transform`. + + Parameters + ---------- + datapipe : DataPipe + The datapipe. + include_reverse_edges : bool + Whether reverse edges should be excluded as well. Default is False. + reverse_etypes_mapping : Dict[str, str] = None + The mapping from the original edge types to their reverse edge types. + asynchronous: bool + Boolean indicating whether edge exclusion stages should run on + background threads to hide the latency of CPU GPU synchronization. + Should be enabled only when sampling on the GPU. + """ + + def __init__( + self, + datapipe, + include_reverse_edges: bool = False, + reverse_etypes_mapping: Dict[str, str] = None, + asynchronous=False, + ): + exclude_seed_edges_fn = partial( + exclude_seed_edges, + include_reverse_edges=include_reverse_edges, + reverse_etypes_mapping=reverse_etypes_mapping, + async_op=asynchronous, + ) + datapipe = datapipe.transform(exclude_seed_edges_fn) + if asynchronous: + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._wait_for_sampled_subgraphs) + super().__init__(datapipe) + + @staticmethod + def _wait_for_sampled_subgraphs(minibatch): + minibatch.sampled_subgraphs = [ + subgraph.wait() for subgraph in minibatch.sampled_subgraphs + ] + return minibatch def add_reverse_edges( @@ -79,6 +129,7 @@ def exclude_seed_edges( minibatch: MiniBatch, include_reverse_edges: bool = False, reverse_etypes_mapping: Dict[str, str] = None, + async_op: bool = False, ): """ Exclude seed edges with or without their reverse edges from the sampled @@ -88,8 +139,13 @@ def exclude_seed_edges( ---------- minibatch : MiniBatch The minibatch. + include_reverse_edges : bool + Whether reverse edges should be excluded as well. Default is False. reverse_etypes_mapping : Dict[str, str] = None The mapping from the original edge types to their reverse edge types. + async_op: bool + Boolean indicating whether the call is asynchronous. If so, the result + can be obtained by calling wait on the modified sampled_subgraphs. 
""" edges_to_exclude = minibatch.seeds if include_reverse_edges: @@ -97,7 +153,7 @@ def exclude_seed_edges( edges_to_exclude, reverse_etypes_mapping ) minibatch.sampled_subgraphs = [ - subgraph.exclude_edges(edges_to_exclude) + subgraph.exclude_edges(edges_to_exclude, async_op=async_op) for subgraph in minibatch.sampled_subgraphs ] return minibatch diff --git a/python/dgl/graphbolt/sampled_subgraph.py b/python/dgl/graphbolt/sampled_subgraph.py index 8bff77de90b3..bcbd8a2004a1 100644 --- a/python/dgl/graphbolt/sampled_subgraph.py +++ b/python/dgl/graphbolt/sampled_subgraph.py @@ -20,6 +20,27 @@ __all__ = ["SampledSubgraph"] +class _ExcludeEdgesWaiter: + def __init__(self, sampled_subgraph, index): + self.sampled_subgraph = sampled_subgraph + self.index = index + + def wait(self): + """Returns the stored value when invoked.""" + sampled_subgraph = self.sampled_subgraph + index = self.index + # Ensure there is no memory leak. + self.sampled_subgraph = self.index = None + + if isinstance(index, dict): + for k in list(index.keys()): + index[k] = index[k].wait() + else: + index = index.wait() + + return type(sampled_subgraph)(*_slice_subgraph(sampled_subgraph, index)) + + class PyGLayerData(NamedTuple): """A named tuple class to represent homogenous inputs to a PyG model layer. The fields are x (input features), edge_index and size @@ -142,6 +163,7 @@ def exclude_edges( torch.Tensor, ], assume_num_node_within_int32: bool = True, + async_op: bool = False, ): r"""Exclude edges from the sampled subgraph. @@ -163,6 +185,9 @@ def exclude_edges( If True, assumes the value of node IDs in the provided `edges` fall within the int32 range, which can significantly enhance computation speed. Default: True + async_op: bool + Boolean indicating whether the call is asynchronous. If so, the + result can be obtained by calling wait on the returned future. 
Returns ------- @@ -222,9 +247,8 @@ def exclude_edges( self.original_column_node_ids, ) index = _exclude_homo_edges( - reverse_edges, edges, assume_num_node_within_int32 + reverse_edges, edges, assume_num_node_within_int32, async_op ) - return calling_class(*_slice_subgraph(self, index)) else: index = {} for etype, pair in self.sampled_csc.items(): @@ -252,7 +276,11 @@ def exclude_edges( reverse_edges, edges[etype], assume_num_node_within_int32, + async_op, ) + if async_op: + return _ExcludeEdgesWaiter(self, index) + else: return calling_class(*_slice_subgraph(self, index)) def to_pyg( @@ -367,6 +395,7 @@ def _exclude_homo_edges( edges: Tuple[torch.Tensor, torch.Tensor], edges_to_exclude: torch.Tensor, assume_num_node_within_int32: bool, + async_op: bool, ): """Return the indices of edges to be included.""" if assume_num_node_within_int32: @@ -381,8 +410,11 @@ def _exclude_homo_edges( raise NotImplementedError( "Values out of range int32 are not supported yet" ) - mask = ~isin(val, val_to_exclude) - return torch.nonzero(mask, as_tuple=True)[0] + if async_op: + return torch.ops.graphbolt.is_not_in_index_async(val, val_to_exclude) + else: + mask = ~isin(val, val_to_exclude) + return torch.nonzero(mask, as_tuple=True)[0] def _slice_subgraph(subgraph: SampledSubgraph, index: torch.Tensor): diff --git a/tests/python/pytorch/graphbolt/test_utils.py b/tests/python/pytorch/graphbolt/test_utils.py index 3795c791ebde..149942f98ea9 100644 --- a/tests/python/pytorch/graphbolt/test_utils.py +++ b/tests/python/pytorch/graphbolt/test_utils.py @@ -72,7 +72,8 @@ def test_add_reverse_edges_hetero(): F._default_context_str == "gpu", reason="Fails due to different result on the GPU.", ) -def test_exclude_seed_edges_homo_cpu(): +@pytest.mark.parametrize("use_datapipe", [False, True]) +def test_exclude_seed_edges_homo_cpu(use_datapipe): graph = dgl.graph(([5, 0, 6, 7, 2, 2, 4], [0, 1, 2, 2, 3, 4, 4])) graph = gb.from_dglgraph(graph, True).to(F.ctx()) items = torch.LongTensor([[0, 3], [4, 4]]) @@ -83,7 +84,10 @@ def test_exclude_seed_edges_homo_cpu(): fanouts = [torch.LongTensor([2]) for _ in range(num_layer)] sampler = gb.NeighborSampler datapipe = sampler(datapipe, graph, fanouts) - datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) + if use_datapipe: + datapipe = datapipe.exclude_seed_edges() + else: + datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) original_row_node_ids = [ torch.tensor([0, 3, 4, 5, 2, 6, 7]).to(F.ctx()), torch.tensor([0, 3, 4, 5, 2]).to(F.ctx()), @@ -121,7 +125,9 @@ def test_exclude_seed_edges_homo_cpu(): F._default_context_str == "cpu", reason="Fails due to different result on the CPU.", ) -def test_exclude_seed_edges_gpu(): +@pytest.mark.parametrize("use_datapipe", [False, True]) +@pytest.mark.parametrize("async_op", [False, True]) +def test_exclude_seed_edges_gpu(use_datapipe, async_op): graph = dgl.graph(([5, 0, 7, 7, 2, 4], [0, 1, 2, 2, 3, 4])) graph = gb.from_dglgraph(graph, is_homogeneous=True).to(F.ctx()) items = torch.LongTensor([[0, 3], [4, 4]]) @@ -137,7 +143,12 @@ def test_exclude_seed_edges_gpu(): fanouts, deduplicate=True, ) - datapipe = datapipe.transform(partial(gb.exclude_seed_edges)) + if use_datapipe: + datapipe = datapipe.exclude_seed_edges(asynchronous=async_op) + else: + datapipe = datapipe.transform( + partial(gb.exclude_seed_edges, async_op=async_op) + ) if torch.cuda.get_device_capability()[0] < 7: original_row_node_ids = [ torch.tensor([0, 3, 4, 2, 5, 7]).to(F.ctx()), @@ -174,6 +185,8 @@ def test_exclude_seed_edges_gpu(): ] for data in 
datapipe: for step, sampled_subgraph in enumerate(data.sampled_subgraphs): + if async_op and not use_datapipe: + sampled_subgraph = sampled_subgraph.wait() assert torch.equal( sampled_subgraph.original_row_node_ids, original_row_node_ids[step], From b4bd5092266e8c6aeed7739ff2965dceac5e3ceb Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 29 Aug 2024 18:15:29 -0500 Subject: [PATCH 62/78] [Dataset] Contribute IGB-Homo dataset to node_classification.py (#7717) Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/node_classification.py | 12 ++++++++++-- python/dgl/graphbolt/impl/ondisk_dataset.py | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/examples/graphbolt/node_classification.py b/examples/graphbolt/node_classification.py index a4a8be298d2c..6b0dee719746 100644 --- a/examples/graphbolt/node_classification.py +++ b/examples/graphbolt/node_classification.py @@ -363,9 +363,17 @@ def parse_args(): "--dataset", type=str, default="ogbn-products", - choices=["ogbn-arxiv", "ogbn-products", "ogbn-papers100M"], + choices=[ + "ogbn-arxiv", + "ogbn-products", + "ogbn-papers100M", + "igb-hom-tiny", + "igb-hom-small", + "igb-hom-medium", + ], help="The dataset we can use for node classification example. Currently" - " ogbn-products, ogbn-arxiv, ogbn-papers100M datasets are supported.", + " ogbn-products, ogbn-arxiv, ogbn-papers100M and" + " igb-hom-[tiny|small|medium] datasets are supported.", ) parser.add_argument( "--mode", diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index d669dc825509..df3b51f8b074 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -979,6 +979,16 @@ class BuiltinDataset(OnDiskDataset): .. note:: Reverse edges are added to the original graph. + **igb-hom-[tiny|small|medium]** + The igb-hom-[tiny|small|medium] dataset is a homogeneous citation network, + which is designed for developers to train and evaluate GNN models with + high fidelity. See more details in `igb-hom-[tiny|small|medium] + `_. + + .. note:: + Self edges are added to the original graph. + Node features are stored as float32. + Parameters ---------- name : str @@ -1004,12 +1014,18 @@ class BuiltinDataset(OnDiskDataset): "ogbn-products-seeds", "ogbn-arxiv", "ogbn-arxiv-seeds", + "igb-hom-tiny", + "igb-hom-tiny-seeds", + "igb-hom-small", + "igb-hom-small-seeds", ] _large_datasets = [ "ogb-lsc-mag240m", "ogb-lsc-mag240m-seeds", "ogbn-papers100M", "ogbn-papers100M-seeds", + "igb-hom-medium", + "igb-hom-medium-seeds", ] _all_datasets = _datasets + _large_datasets From 647d8ea8159c24d65ed6268f6a9085107674ecf5 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 29 Aug 2024 19:52:56 -0400 Subject: [PATCH 63/78] [GraphBolt][PyG] Add igb datasets to the examples. 
(#7758) --- examples/graphbolt/pyg/labor/load_dataset.py | 9 +-------- examples/graphbolt/pyg/labor/node_classification.py | 3 +++ .../graphbolt/pyg/node_classification_advanced.py | 12 ++++++++++-- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/examples/graphbolt/pyg/labor/load_dataset.py b/examples/graphbolt/pyg/labor/load_dataset.py index 0f9f24da4fc4..c354b91f6692 100644 --- a/examples/graphbolt/pyg/labor/load_dataset.py +++ b/examples/graphbolt/pyg/labor/load_dataset.py @@ -39,12 +39,7 @@ def load_dataset(dataset_name, disk_based_feature_keys=None): "flickr", ]: dataset, multilabel = load_dgl(dataset_name) - elif dataset_name in [ - "ogbn-products", - "ogbn-arxiv", - "ogbn-papers100M", - "ogbn-mag240M", - ]: + else: if "mag240M" in dataset_name: dataset_name = "ogb-lsc-mag240m" dataset = gb.BuiltinDataset(dataset_name) @@ -56,7 +51,5 @@ def load_dataset(dataset_name, disk_based_feature_keys=None): if feature_key in disk_based_feature_keys: feature["in_memory"] = False dataset = dataset.load() - else: - raise ValueError("unknown dataset") return dataset, multilabel diff --git a/examples/graphbolt/pyg/labor/node_classification.py b/examples/graphbolt/pyg/labor/node_classification.py index f6636d2721b7..7129d517c8c6 100644 --- a/examples/graphbolt/pyg/labor/node_classification.py +++ b/examples/graphbolt/pyg/labor/node_classification.py @@ -363,6 +363,9 @@ def parse_args(): "ogbn-arxiv", "ogbn-products", "ogbn-papers100M", + "igb-hom-tiny", + "igb-hom-small", + "igb-hom-medium", "reddit", "yelp", "flickr", diff --git a/examples/graphbolt/pyg/node_classification_advanced.py b/examples/graphbolt/pyg/node_classification_advanced.py index 02df19f6fd38..576335b5bdf1 100644 --- a/examples/graphbolt/pyg/node_classification_advanced.py +++ b/examples/graphbolt/pyg/node_classification_advanced.py @@ -335,9 +335,17 @@ def parse_args(): "--dataset", type=str, default="ogbn-products", - choices=["ogbn-arxiv", "ogbn-products", "ogbn-papers100M"], + choices=[ + "ogbn-arxiv", + "ogbn-products", + "ogbn-papers100M", + "igb-hom-tiny", + "igb-hom-small", + "igb-hom-medium", + ], help="The dataset we can use for node classification example. 
Currently" - " ogbn-products, ogbn-arxiv, ogbn-papers100M datasets are supported.", + " ogbn-products, ogbn-arxiv, ogbn-papers100M and" + " igb-hom-[tiny|small|medium] datasets are supported.", ) parser.add_argument( "--fanout", From a71946539f6a890bdd7f0fd7606a44ce2c4ac241 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Fri, 30 Aug 2024 11:56:52 -0500 Subject: [PATCH 64/78] [Graphbolt] Add igb-hom-[tiny|small|medium] to disk_based_feature (#7759) Co-authored-by: Muhammed Fatih BALIN --- .../graphbolt/disk_based_feature/node_classification.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/graphbolt/disk_based_feature/node_classification.py b/examples/graphbolt/disk_based_feature/node_classification.py index 83b2f4af45d1..450a61136593 100644 --- a/examples/graphbolt/disk_based_feature/node_classification.py +++ b/examples/graphbolt/disk_based_feature/node_classification.py @@ -336,9 +336,9 @@ def parse_args(): "ogbn-arxiv", "ogbn-products", "ogbn-papers100M", - "reddit", - "yelp", - "flickr", + "igb-hom-tiny", + "igb-hom-small", + "igb-hom-medium", ], ) parser.add_argument("--root", type=str, default="datasets") From bb02829b6aa3b850e92dfd021df4b10692d5c1f6 Mon Sep 17 00:00:00 2001 From: Andrei Ivanov <32910461+drivanov@users.noreply.github.com> Date: Fri, 30 Aug 2024 10:18:16 -0700 Subject: [PATCH 65/78] Correcting the misleading reason for skipping the test. (#7749) --- tests/python/pytorch/graphbolt/impl/test_gpu_graph_cache.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/python/pytorch/graphbolt/impl/test_gpu_graph_cache.py b/tests/python/pytorch/graphbolt/impl/test_gpu_graph_cache.py index e6034cf77019..6c6d242f14c1 100644 --- a/tests/python/pytorch/graphbolt/impl/test_gpu_graph_cache.py +++ b/tests/python/pytorch/graphbolt/impl/test_gpu_graph_cache.py @@ -11,7 +11,9 @@ @unittest.skipIf( F._default_context_str != "gpu" or torch.cuda.get_device_capability()[0] < 7, - reason="GPUCachedFeature requires a Volta or later generation NVIDIA GPU.", + reason="GPUCachedFeature tests are available only on GPU." 
+ if F._default_context_str != "gpu" + else "GPUCachedFeature requires a Volta or later generation NVIDIA GPU.", ) @pytest.mark.parametrize( "indptr_dtype", From 694ef6518ccd5bb19a01312c47c96a517989003a Mon Sep 17 00:00:00 2001 From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> Date: Mon, 2 Sep 2024 10:55:48 +0800 Subject: [PATCH 66/78] [Dist] import dgl.distributed for examples (#7761) --- examples/distributed/graphsage/node_classification.py | 1 + .../distributed/graphsage/node_classification_unsupervised.py | 1 + examples/distributed/rgcn/node_classification.py | 1 + examples/pytorch/graphsage/dist/train_dist.py | 1 + examples/pytorch/graphsage/dist/train_dist_transductive.py | 2 +- examples/pytorch/graphsage/dist/train_dist_unsupervised.py | 1 + .../graphsage/dist/train_dist_unsupervised_transductive.py | 2 +- examples/pytorch/rgcn/experimental/entity_classify_dist.py | 1 + 8 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/distributed/graphsage/node_classification.py b/examples/distributed/graphsage/node_classification.py index 4ee21985d936..d7b15127711b 100644 --- a/examples/distributed/graphsage/node_classification.py +++ b/examples/distributed/graphsage/node_classification.py @@ -3,6 +3,7 @@ import time import dgl +import dgl.distributed import dgl.nn.pytorch as dglnn import numpy as np import torch as th diff --git a/examples/distributed/graphsage/node_classification_unsupervised.py b/examples/distributed/graphsage/node_classification_unsupervised.py index c940d1a9570a..4b71f599064b 100644 --- a/examples/distributed/graphsage/node_classification_unsupervised.py +++ b/examples/distributed/graphsage/node_classification_unsupervised.py @@ -3,6 +3,7 @@ from contextlib import contextmanager import dgl +import dgl.distributed import dgl.function as fn import dgl.nn.pytorch as dglnn diff --git a/examples/distributed/rgcn/node_classification.py b/examples/distributed/rgcn/node_classification.py index 7bf74f83ecd5..f4a6742db36a 100644 --- a/examples/distributed/rgcn/node_classification.py +++ b/examples/distributed/rgcn/node_classification.py @@ -19,6 +19,7 @@ from functools import partial import dgl +import dgl.distributed import torch as th import torch.multiprocessing as mp import torch.nn as nn diff --git a/examples/pytorch/graphsage/dist/train_dist.py b/examples/pytorch/graphsage/dist/train_dist.py index d2f192ac4f99..415ff192dbcb 100644 --- a/examples/pytorch/graphsage/dist/train_dist.py +++ b/examples/pytorch/graphsage/dist/train_dist.py @@ -4,6 +4,7 @@ from contextlib import contextmanager import dgl +import dgl.distributed import dgl.nn.pytorch as dglnn import numpy as np diff --git a/examples/pytorch/graphsage/dist/train_dist_transductive.py b/examples/pytorch/graphsage/dist/train_dist_transductive.py index 903b833b8de9..b401d943c099 100644 --- a/examples/pytorch/graphsage/dist/train_dist_transductive.py +++ b/examples/pytorch/graphsage/dist/train_dist_transductive.py @@ -2,7 +2,7 @@ import time import dgl - +import dgl.distributed import numpy as np import torch as th import torch.nn as nn diff --git a/examples/pytorch/graphsage/dist/train_dist_unsupervised.py b/examples/pytorch/graphsage/dist/train_dist_unsupervised.py index a28900076a6e..f3eb1a24305b 100644 --- a/examples/pytorch/graphsage/dist/train_dist_unsupervised.py +++ b/examples/pytorch/graphsage/dist/train_dist_unsupervised.py @@ -3,6 +3,7 @@ from contextlib import contextmanager import dgl +import dgl.distributed import dgl.function as fn import dgl.nn.pytorch as dglnn diff --git 
a/examples/pytorch/graphsage/dist/train_dist_unsupervised_transductive.py b/examples/pytorch/graphsage/dist/train_dist_unsupervised_transductive.py index 0eb4345501ee..2260cb61da02 100644 --- a/examples/pytorch/graphsage/dist/train_dist_unsupervised_transductive.py +++ b/examples/pytorch/graphsage/dist/train_dist_unsupervised_transductive.py @@ -2,7 +2,7 @@ import time import dgl - +import dgl.distributed import numpy as np import torch as th import torch.nn.functional as F diff --git a/examples/pytorch/rgcn/experimental/entity_classify_dist.py b/examples/pytorch/rgcn/experimental/entity_classify_dist.py index 9e1b74c02cf6..f47deb8bde44 100644 --- a/examples/pytorch/rgcn/experimental/entity_classify_dist.py +++ b/examples/pytorch/rgcn/experimental/entity_classify_dist.py @@ -19,6 +19,7 @@ from functools import partial import dgl +import dgl.distributed import torch as th import torch.multiprocessing as mp import torch.nn as nn From 6cd7fe99c9f21c4fe132a5ecc37ecd4151b65c5c Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 2 Sep 2024 01:21:09 -0400 Subject: [PATCH 67/78] [release] bump version to 2.5 for nightly (#7762) --- conda/dgl/meta.yaml | 2 +- include/dgl/runtime/c_runtime_api.h | 2 +- python/dgl/_ffi/libinfo.py | 2 +- python/update_version.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/dgl/meta.yaml b/conda/dgl/meta.yaml index 3edb734286ca..d1bbef1d5767 100644 --- a/conda/dgl/meta.yaml +++ b/conda/dgl/meta.yaml @@ -1,6 +1,6 @@ package: name: dgl{{ environ.get('DGL_PACKAGE_SUFFIX', '') }} - version: 2.4{{ environ.get('DGL_VERSION_SUFFIX', '') }} + version: 2.5{{ environ.get('DGL_VERSION_SUFFIX', '') }} source: git_rev: {{ environ.get('DGL_RELEASE_BRANCH', 'master') }} diff --git a/include/dgl/runtime/c_runtime_api.h b/include/dgl/runtime/c_runtime_api.h index c3098878362d..6b935f56f857 100644 --- a/include/dgl/runtime/c_runtime_api.h +++ b/include/dgl/runtime/c_runtime_api.h @@ -33,7 +33,7 @@ #endif // DGL version -#define DGL_VERSION "2.4" +#define DGL_VERSION "2.5" #ifdef __cplusplus extern "C" { diff --git a/python/dgl/_ffi/libinfo.py b/python/dgl/_ffi/libinfo.py index b8d560e207ce..720f07bdb417 100644 --- a/python/dgl/_ffi/libinfo.py +++ b/python/dgl/_ffi/libinfo.py @@ -105,4 +105,4 @@ def find_lib_path(name=None, search_path=None, optional=False): # We use the version of the incoming release for code # that is under development. # The following line is set by dgl/python/update_version.py -__version__ = "2.4" +__version__ = "2.5" diff --git a/python/update_version.py b/python/update_version.py index 185f6c74d196..34286939153d 100644 --- a/python/update_version.py +++ b/python/update_version.py @@ -16,7 +16,7 @@ # (usually "aYYMMDD") # The environment variable DGL_VERSION_SUFFIX is the local version label # suffix for indicating CPU and CUDA versions as in PEP 440 (e.g. 
"+cu102") -__version__ = "2.4" + os.getenv("DGL_PRERELEASE", "") +__version__ = "2.5" + os.getenv("DGL_PRERELEASE", "") __version__ += os.getenv("DGL_VERSION_SUFFIX", "") print(__version__) From 9a86a665ef712e66e3c55b842d463ee09c2425b8 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 2 Sep 2024 17:50:08 -0400 Subject: [PATCH 68/78] [GraphBolt][CUDA] Remove unused `num_bits` argument from `UniqueAndCompact` (#7764) --- graphbolt/include/graphbolt/cuda_ops.h | 4 ++-- graphbolt/src/cuda/unique_and_compact_impl.cu | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/graphbolt/include/graphbolt/cuda_ops.h b/graphbolt/include/graphbolt/cuda_ops.h index 91cd1a10c652..7ac22e178418 100644 --- a/graphbolt/include/graphbolt/cuda_ops.h +++ b/graphbolt/include/graphbolt/cuda_ops.h @@ -299,7 +299,7 @@ torch::Tensor IndptrEdgeIdsImpl( */ std::tuple UniqueAndCompact( const torch::Tensor src_ids, const torch::Tensor dst_ids, - const torch::Tensor unique_dst_ids, int num_bits = 0); + const torch::Tensor unique_dst_ids); /** * @brief Batched version of UniqueAndCompact. The ith element of the return @@ -310,7 +310,7 @@ std::vector> UniqueAndCompactBatched( const std::vector& src_ids, const std::vector& dst_ids, - const std::vector& unique_dst_ids, int num_bits = 0); + const std::vector& unique_dst_ids); } // namespace ops } // namespace graphbolt diff --git a/graphbolt/src/cuda/unique_and_compact_impl.cu b/graphbolt/src/cuda/unique_and_compact_impl.cu index 71ecfe3f553f..5407777daeea 100644 --- a/graphbolt/src/cuda/unique_and_compact_impl.cu +++ b/graphbolt/src/cuda/unique_and_compact_impl.cu @@ -61,7 +61,7 @@ std::vector> UniqueAndCompactBatchedSortBased( const std::vector& src_ids, const std::vector& dst_ids, - const std::vector& unique_dst_ids, int num_bits) { + const std::vector& unique_dst_ids, int num_bits = 0) { auto allocator = cuda::GetAllocator(); auto stream = cuda::GetCurrentStream(); auto scalar_type = src_ids.at(0).scalar_type(); @@ -276,7 +276,7 @@ std::vector> UniqueAndCompactBatched( const std::vector& src_ids, const std::vector& dst_ids, - const std::vector& unique_dst_ids, int num_bits) { + const std::vector& unique_dst_ids) { if (cuda::compute_capability() >= 70) { // Utilizes a hash table based implementation, the mapped id of a vertex // will be monotonically increasing as the first occurrence index of it in @@ -287,15 +287,13 @@ UniqueAndCompactBatched( // Utilizes a sort based algorithm, the mapped id of a vertex part of the // src_ids but not part of the unique_dst_ids will be monotonically increasing // as the actual vertex id increases. Thus, it is deterministic. - return UniqueAndCompactBatchedSortBased( - src_ids, dst_ids, unique_dst_ids, num_bits); + return UniqueAndCompactBatchedSortBased(src_ids, dst_ids, unique_dst_ids); } std::tuple UniqueAndCompact( const torch::Tensor src_ids, const torch::Tensor dst_ids, - const torch::Tensor unique_dst_ids, int num_bits) { - return UniqueAndCompactBatched( - {src_ids}, {dst_ids}, {unique_dst_ids}, num_bits)[0]; + const torch::Tensor unique_dst_ids) { + return UniqueAndCompactBatched({src_ids}, {dst_ids}, {unique_dst_ids})[0]; } } // namespace ops From d1161229e0f160c7116aff207e7f0ecf76362d8d Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 3 Sep 2024 17:45:03 -0400 Subject: [PATCH 69/78] [GraphBolt][CUDA] Cooperative Minibatching [1] - `UniqueAndCompact`. 
(#7765) --- graphbolt/include/graphbolt/cuda_ops.h | 15 +- .../include/graphbolt/unique_and_compact.h | 18 +- .../src/cuda/extension/gpu_graph_cache.cu | 5 +- .../src/cuda/extension/unique_and_compact.h | 3 +- .../cuda/extension/unique_and_compact_map.cu | 222 +++++++++++------- graphbolt/src/cuda/unique_and_compact_impl.cu | 15 +- graphbolt/src/unique_and_compact.cc | 30 ++- python/dgl/graphbolt/internal/sample_utils.py | 4 +- src/array/cuda/labor_sampling.cu | 1 + third_party/cccl | 2 +- third_party/cuco | 2 +- 11 files changed, 207 insertions(+), 110 deletions(-) diff --git a/graphbolt/include/graphbolt/cuda_ops.h b/graphbolt/include/graphbolt/cuda_ops.h index 7ac22e178418..07feaeb6b5a2 100644 --- a/graphbolt/include/graphbolt/cuda_ops.h +++ b/graphbolt/include/graphbolt/cuda_ops.h @@ -274,10 +274,19 @@ torch::Tensor IndptrEdgeIdsImpl( * 2. Compact Operation: Utilizes the reverse mapping derived from the unique * operation to transform 'src_ids' and 'dst_ids' into compacted IDs. * + * When world_size is greater than 1, then the given ids are partitioned between + * the available ranks. The ids corresponding to the given rank are guaranteed + * to come before the ids of other ranks. To do this, the partition ids are + * rotated backwards by the given rank so that the ids are ordered as: + * [rank, rank + 1, world_size, 0, ..., rank - 1]. This is supported only for + * Volta and later generation NVIDIA GPUs. + * * @param src_ids A tensor containing source IDs. * @param dst_ids A tensor containing destination IDs. * @param unique_dst_ids A tensor containing unique destination IDs, which is * exactly all the unique elements in 'dst_ids'. + * @param rank The rank of the current GPU. + * @param world_size The total # GPUs, world size. * * @return * - A tensor representing all unique elements in 'src_ids' and 'dst_ids' after @@ -299,7 +308,8 @@ torch::Tensor IndptrEdgeIdsImpl( */ std::tuple UniqueAndCompact( const torch::Tensor src_ids, const torch::Tensor dst_ids, - const torch::Tensor unique_dst_ids); + const torch::Tensor unique_dst_ids, const int64_t rank, + const int64_t world_size); /** * @brief Batched version of UniqueAndCompact. The ith element of the return @@ -310,7 +320,8 @@ std::vector> UniqueAndCompactBatched( const std::vector& src_ids, const std::vector& dst_ids, - const std::vector& unique_dst_ids); + const std::vector& unique_dst_ids, const int64_t rank, + const int64_t world_size); } // namespace ops } // namespace graphbolt diff --git a/graphbolt/include/graphbolt/unique_and_compact.h b/graphbolt/include/graphbolt/unique_and_compact.h index bf3679688c75..db61c2b6f9fe 100644 --- a/graphbolt/include/graphbolt/unique_and_compact.h +++ b/graphbolt/include/graphbolt/unique_and_compact.h @@ -24,10 +24,19 @@ namespace sampling { * 2. Compact Operation: Utilizes the reverse mapping derived from the unique * operation to transform 'src_ids' and 'dst_ids' into compacted IDs. * + * When world_size is greater than 1, then the given ids are partitioned between + * the available ranks. The ids corresponding to the given rank are guaranteed + * to come before the ids of other ranks. To do this, the partition ids are + * rotated backwards by the given rank so that the ids are ordered as: + * [rank, rank + 1, world_size, 0, ..., rank - 1]. This is supported only for + * Volta and later generation NVIDIA GPUs. + * * @param src_ids A tensor containing source IDs. * @param dst_ids A tensor containing destination IDs. 
* @param unique_dst_ids A tensor containing unique destination IDs, which is * exactly all the unique elements in 'dst_ids'. + * @param rank The rank of the current GPU. + * @param world_size The total # GPUs, world size. * * @return * - A tensor representing all unique elements in 'src_ids' and 'dst_ids' after @@ -49,20 +58,23 @@ namespace sampling { */ std::tuple UniqueAndCompact( const torch::Tensor& src_ids, const torch::Tensor& dst_ids, - const torch::Tensor unique_dst_ids); + const torch::Tensor unique_dst_ids, const int64_t rank, + const int64_t world_size); std::vector> UniqueAndCompactBatched( const std::vector& src_ids, const std::vector& dst_ids, - const std::vector unique_dst_ids); + const std::vector unique_dst_ids, const int64_t rank, + const int64_t world_size); c10::intrusive_ptr>>> UniqueAndCompactBatchedAsync( const std::vector& src_ids, const std::vector& dst_ids, - const std::vector unique_dst_ids); + const std::vector unique_dst_ids, const int64_t rank, + const int64_t world_size); } // namespace sampling } // namespace graphbolt diff --git a/graphbolt/src/cuda/extension/gpu_graph_cache.cu b/graphbolt/src/cuda/extension/gpu_graph_cache.cu index c0e70421bf44..6fae172920a9 100644 --- a/graphbolt/src/cuda/extension/gpu_graph_cache.cu +++ b/graphbolt/src/cuda/extension/gpu_graph_cache.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -138,7 +139,7 @@ GpuGraphCache::GpuGraphCache( {}, {}, allocator_t{}, - cuco::cuda_stream_ref{cuda::GetCurrentStream()}}; + ::cuda::stream_ref{cuda::GetCurrentStream()}}; map_ = new map_t{std::move(map_temp)}; })); C10_CUDA_KERNEL_LAUNCH_CHECK(); // Check the map constructor's success. @@ -185,7 +186,7 @@ std::tuple GpuGraphCache::Query( map_size_ + seeds.size(0) >= map->capacity() * kDoubleLoadFactor)) { map->rehash_async( map->capacity() * kIntGrowthFactor, - cuco::cuda_stream_ref{cuda::GetCurrentStream()}); + ::cuda::stream_ref{cuda::GetCurrentStream()}); } auto positions = torch::empty_like(seeds); CUDA_KERNEL_CALL( diff --git a/graphbolt/src/cuda/extension/unique_and_compact.h b/graphbolt/src/cuda/extension/unique_and_compact.h index 7079266878af..c68168e24fce 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact.h +++ b/graphbolt/src/cuda/extension/unique_and_compact.h @@ -32,7 +32,8 @@ std::vector > UniqueAndCompactBatchedHashMapBased( const std::vector& src_ids, const std::vector& dst_ids, - const std::vector& unique_dst_ids); + const std::vector& unique_dst_ids, const int64_t rank, + const int64_t world_size); } // namespace ops } // namespace graphbolt diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index cb0fdad24d2f..3806b272525a 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -17,11 +17,18 @@ * @file cuda/unique_and_compact_map.cu * @brief Unique and compact operator implementation on CUDA using hash table. */ +#include #include -#include +#include +#include +#include #include +#include #include +#include +#include +#include #include #include "../common.h" @@ -31,6 +38,20 @@ namespace graphbolt { namespace ops { +using part_t = uint8_t; +constexpr auto kPartDType = torch::kUInt8; + +// Returns the rotated part id so that current rank's part id is 0. +template +__device__ inline auto partition_assignment( + index_t id, uint32_t rank, uint32_t world_size) { + // Consider using a faster implementation in the future. 
+ constexpr uint64_t kCurandSeed = 999961; // Any random number. + curandStatePhilox4_32_10_t rng; + curand_init(kCurandSeed, 0, id, &rng); + return (curand(&rng) - rank) % world_size; +} + // Support graphs with up to 2^kNodeIdBits nodes. constexpr int kNodeIdBits = 40; @@ -42,10 +63,10 @@ __global__ void _InsertAndSetMinBatched( const int stride = gridDim.x * blockDim.x; while (i < num_edges) { - const int64_t tensor_index = indexes[i]; + const auto tensor_index = indexes[i]; const auto tensor_offset = i - offsets[tensor_index]; const int64_t node_id = pointers[tensor_index][tensor_offset]; - const auto batch_index = tensor_index / 2; + const int64_t batch_index = tensor_index / 2; const int64_t key = node_id | (batch_index << kNodeIdBits); auto [slot, is_new_key] = map.insert_and_find(cuco::pair{key, i}); @@ -60,66 +81,17 @@ __global__ void _InsertAndSetMinBatched( } } -template -__global__ void _IsInsertedBatched( - const int64_t num_edges, const int32_t* const indexes, index_t** pointers, - const int64_t* const offsets, map_t map, int64_t* valid) { - int64_t i = blockIdx.x * blockDim.x + threadIdx.x; - const int stride = gridDim.x * blockDim.x; - - while (i < num_edges) { - const int64_t tensor_index = indexes[i]; - const auto tensor_offset = i - offsets[tensor_index]; - const int64_t node_id = pointers[tensor_index][tensor_offset]; - const auto batch_index = tensor_index / 2; - const int64_t key = node_id | (batch_index << kNodeIdBits); - - auto slot = map.find(key); - valid[i] = slot->second == i; - - i += stride; - } -} - -template -__global__ void _GetInsertedBatched( - const int64_t num_edges, const int32_t* const indexes, index_t** pointers, - const int64_t* const offsets, map_t map, const int64_t* const valid, - index_t* unique_ids) { - int64_t i = blockIdx.x * blockDim.x + threadIdx.x; - const int stride = gridDim.x * blockDim.x; - - while (i < num_edges) { - const auto valid_i = valid[i]; - - if (valid_i + 1 == valid[i + 1]) { - const int64_t tensor_index = indexes[i]; - const auto tensor_offset = i - offsets[tensor_index]; - const int64_t node_id = pointers[tensor_index][tensor_offset]; - const auto batch_index = tensor_index / 2; - const int64_t key = node_id | (batch_index << kNodeIdBits); - - auto slot = map.find(key); - const auto batch_offset = offsets[batch_index * 2]; - const auto new_id = valid_i - valid[batch_offset]; - unique_ids[valid_i] = node_id; - slot->second = new_id; - } - - i += stride; - } -} - template __global__ void _MapIdsBatched( const int num_batches, const int64_t num_edges, const int32_t* const indexes, index_t** pointers, - const int64_t* const offsets, map_t map, index_t* mapped_ids) { + const int64_t* const offsets, const int64_t* const unique_ids_offsets, + const index_t* const index, map_t map, index_t* mapped_ids) { int64_t i = blockIdx.x * blockDim.x + threadIdx.x; const int stride = gridDim.x * blockDim.x; while (i < num_edges) { - const int64_t tensor_index = indexes[i]; + const auto tensor_index = indexes[i]; int64_t batch_index; if (tensor_index >= 2 * num_batches) { @@ -137,18 +109,24 @@ __global__ void _MapIdsBatched( const int64_t key = node_id | (batch_index << kNodeIdBits); auto slot = map.find(key); - mapped_ids[i] = slot->second; + auto new_id = slot->second; + if (index) new_id = index[new_id]; + mapped_ids[i] = new_id - unique_ids_offsets[batch_index]; } i += stride; } } -std::vector > +std::vector> UniqueAndCompactBatchedHashMapBased( const std::vector& src_ids, const std::vector& dst_ids, - const std::vector& unique_dst_ids) { 
+ const std::vector& unique_dst_ids, const int64_t rank, + const int64_t world_size) { + TORCH_CHECK( + rank < world_size, "rank needs to be smaller than the world_size."); + TORCH_CHECK(world_size <= std::numeric_limits::max()); auto allocator = cuda::GetAllocator(); auto stream = cuda::GetCurrentStream(); auto scalar_type = src_ids.at(0).scalar_type(); @@ -211,11 +189,11 @@ UniqueAndCompactBatchedHashMapBased( cuco::empty_key{static_cast(-1)}, cuco::empty_value{static_cast(-1)}, {}, - cuco::linear_probing<1, cuco::default_hash_function >{}, + cuco::linear_probing<1, cuco::default_hash_function>{}, {}, {}, - cuda::CUDAWorkspaceAllocator >{}, - cuco::cuda_stream_ref{stream}, + cuda::CUDAWorkspaceAllocator>{}, + ::cuda::stream_ref{stream}, }; C10_CUDA_KERNEL_LAUNCH_CHECK(); // Check the map constructor's success. const dim3 block(BLOCK_SIZE); @@ -225,42 +203,116 @@ UniqueAndCompactBatchedHashMapBased( _InsertAndSetMinBatched, grid, block, 0, offsets_ptr[2 * num_batches], indexes.data_ptr(), pointers_dev_ptr, offsets_dev_ptr, map.ref(cuco::insert_and_find)); - auto valid = torch::empty( - offsets_ptr[2 * num_batches] + 1, - src_ids[0].options().dtype(torch::kInt64)); - CUDA_KERNEL_CALL( - _IsInsertedBatched, grid, block, 0, offsets_ptr[2 * num_batches], - indexes.data_ptr(), pointers_dev_ptr, offsets_dev_ptr, - map.ref(cuco::find), valid.data_ptr()); - valid = ExclusiveCumSum(valid); + cub::ArgIndexInputIterator index_it(indexes.data_ptr()); + auto input_it = thrust::make_transform_iterator( + index_it, + ::cuda::proclaim_return_type< + ::cuda::std::tuple>( + [=, map = map.ref(cuco::find)] __device__(auto it) + -> ::cuda::std::tuple< + int64_t*, index_t, int32_t, bool, bool> { + const auto i = it.key; + const auto tensor_index = it.value; + const auto tensor_offset = i - offsets_dev_ptr[tensor_index]; + const int64_t node_id = + pointers_dev_ptr[tensor_index][tensor_offset]; + const auto batch_index = tensor_index / 2; + const int64_t key = + node_id | + (static_cast(batch_index) << kNodeIdBits); + const auto batch_offset = offsets_dev_ptr[batch_index * 2]; + + auto slot = map.find(key); + const auto valid = slot->second == i; + + return { + &slot->second, node_id, batch_index, valid, + i == batch_offset}; + })); + torch::optional part_ids; + if (world_size > 1) { + part_ids = torch::empty( + offsets_ptr[2 * num_batches], + src_ids[0].options().dtype(kPartDType)); + } + auto unique_ids = + torch::empty(offsets_ptr[2 * num_batches], src_ids[0].options()); + auto unique_ids_offsets_dev = torch::empty( + num_batches + 1, src_ids[0].options().dtype(torch::kInt64)); + auto unique_ids_offsets_dev_ptr = + unique_ids_offsets_dev.data_ptr(); + auto output_it = thrust::make_tabulate_output_iterator( + ::cuda::proclaim_return_type( + [=, unique_ids_ptr = unique_ids.data_ptr(), + part_ids_ptr = + part_ids ? 
part_ids->data_ptr() : nullptr, + rank = static_cast(rank), + world_size = static_cast( + world_size)] __device__(const int64_t i, const auto& t) { + *::cuda::std::get<0>(t) = i; + const auto node_id = ::cuda::std::get<1>(t); + const auto is_i_equal_batch_offset = ::cuda::std::get<4>(t); + unique_ids_ptr[i] = node_id; + if (part_ids_ptr) { + part_ids_ptr[i] = + partition_assignment(node_id, rank, world_size); + } + if (is_i_equal_batch_offset) { + const auto batch_index = ::cuda::std::get<2>(t); + unique_ids_offsets_dev_ptr[batch_index] = i; + } + })); + CUB_CALL( + DeviceSelect::If, input_it, output_it, + unique_ids_offsets_dev_ptr + num_batches, + offsets_ptr[2 * num_batches], + ::cuda::proclaim_return_type([] __device__(const auto& t) { + return ::cuda::std::get<3>(t); + })); auto unique_ids_offsets = torch::empty( num_batches + 1, c10::TensorOptions().dtype(torch::kInt64).pinned_memory(true)); auto unique_ids_offsets_ptr = unique_ids_offsets.data_ptr(); - for (int64_t i = 0; i <= num_batches; i++) { - unique_ids_offsets_ptr[i] = offsets_ptr[2 * i]; - } - THRUST_CALL( - gather, unique_ids_offsets_ptr, - unique_ids_offsets_ptr + unique_ids_offsets.size(0), - valid.data_ptr(), unique_ids_offsets_ptr); + CUDA_CALL(cudaMemcpyAsync( + unique_ids_offsets_ptr, unique_ids_offsets_dev_ptr, + sizeof(int64_t) * (num_batches + 1), cudaMemcpyDeviceToHost, + stream)); at::cuda::CUDAEvent unique_ids_offsets_event; unique_ids_offsets_event.record(); - auto unique_ids = - torch::empty(offsets_ptr[2 * num_batches], src_ids[0].options()); - CUDA_KERNEL_CALL( - _GetInsertedBatched, grid, block, 0, offsets_ptr[2 * num_batches], - indexes.data_ptr(), pointers_dev_ptr, offsets_dev_ptr, - map.ref(cuco::find), valid.data_ptr(), - unique_ids.data_ptr()); + torch::optional index; + if (part_ids) { + auto part_ids_sorted = torch::empty_like(*part_ids); + auto part_ids2 = part_ids->clone(); + auto part_ids2_sorted = torch::empty_like(part_ids2); + auto unique_ids_sorted = torch::empty_like(unique_ids); + index = torch::arange(unique_ids.size(0), unique_ids.options()); + auto index_sorted = torch::empty_like(*index); + CUB_CALL( + DeviceSegmentedRadixSort::SortPairs, part_ids->data_ptr(), + part_ids_sorted.data_ptr(), + unique_ids.data_ptr(), + unique_ids_sorted.data_ptr(), unique_ids.size(0), + num_batches, unique_ids_offsets_dev_ptr, + unique_ids_offsets_dev_ptr + 1, 0, + cuda::NumberOfBits(world_size)); + unique_ids = unique_ids_sorted; + CUB_CALL( + DeviceSegmentedRadixSort::SortPairs, part_ids2.data_ptr(), + part_ids2_sorted.data_ptr(), index->data_ptr(), + index_sorted.data_ptr(), unique_ids.size(0), num_batches, + unique_ids_offsets_dev_ptr, unique_ids_offsets_dev_ptr + 1, 0, + cuda::NumberOfBits(world_size)); + index = index_sorted; + } auto mapped_ids = torch::empty(offsets_ptr[3 * num_batches], unique_ids.options()); CUDA_KERNEL_CALL( _MapIdsBatched, grid, block, 0, num_batches, offsets_ptr[3 * num_batches], indexes.data_ptr(), - pointers_dev_ptr, offsets_dev_ptr, map.ref(cuco::find), + pointers_dev_ptr, offsets_dev_ptr, unique_ids_offsets_dev_ptr, + index ? 
index->data_ptr() : nullptr, map.ref(cuco::find), mapped_ids.data_ptr()); - std::vector > + std::vector> results; unique_ids_offsets_event.synchronize(); for (int64_t i = 0; i < num_batches; i++) { diff --git a/graphbolt/src/cuda/unique_and_compact_impl.cu b/graphbolt/src/cuda/unique_and_compact_impl.cu index 5407777daeea..c8b5775c5b47 100644 --- a/graphbolt/src/cuda/unique_and_compact_impl.cu +++ b/graphbolt/src/cuda/unique_and_compact_impl.cu @@ -276,14 +276,19 @@ std::vector> UniqueAndCompactBatched( const std::vector& src_ids, const std::vector& dst_ids, - const std::vector& unique_dst_ids) { + const std::vector& unique_dst_ids, const int64_t rank, + const int64_t world_size) { if (cuda::compute_capability() >= 70) { // Utilizes a hash table based implementation, the mapped id of a vertex // will be monotonically increasing as the first occurrence index of it in // torch.cat([unique_dst_ids, src_ids]). Thus, it is deterministic. return UniqueAndCompactBatchedHashMapBased( - src_ids, dst_ids, unique_dst_ids); + src_ids, dst_ids, unique_dst_ids, rank, world_size); } + TORCH_CHECK( + world_size <= 1, + "Cooperative Minibatching (arXiv:2310.12403) is not supported on " + "pre-Volta generation GPUs."); // Utilizes a sort based algorithm, the mapped id of a vertex part of the // src_ids but not part of the unique_dst_ids will be monotonically increasing // as the actual vertex id increases. Thus, it is deterministic. @@ -292,8 +297,10 @@ UniqueAndCompactBatched( std::tuple UniqueAndCompact( const torch::Tensor src_ids, const torch::Tensor dst_ids, - const torch::Tensor unique_dst_ids) { - return UniqueAndCompactBatched({src_ids}, {dst_ids}, {unique_dst_ids})[0]; + const torch::Tensor unique_dst_ids, const int64_t rank, + const int64_t world_size) { + return UniqueAndCompactBatched( + {src_ids}, {dst_ids}, {unique_dst_ids}, rank, world_size)[0]; } } // namespace ops diff --git a/graphbolt/src/unique_and_compact.cc b/graphbolt/src/unique_and_compact.cc index ba07ede8fc71..03fb8f514f2d 100644 --- a/graphbolt/src/unique_and_compact.cc +++ b/graphbolt/src/unique_and_compact.cc @@ -16,13 +16,19 @@ namespace graphbolt { namespace sampling { std::tuple UniqueAndCompact( const torch::Tensor& src_ids, const torch::Tensor& dst_ids, - const torch::Tensor unique_dst_ids) { + const torch::Tensor unique_dst_ids, const int64_t rank, + const int64_t world_size) { if (utils::is_on_gpu(src_ids) && utils::is_on_gpu(dst_ids) && utils::is_on_gpu(unique_dst_ids)) { GRAPHBOLT_DISPATCH_CUDA_ONLY_DEVICE( - c10::DeviceType::CUDA, "unique_and_compact", - { return ops::UniqueAndCompact(src_ids, dst_ids, unique_dst_ids); }); + c10::DeviceType::CUDA, "unique_and_compact", { + return ops::UniqueAndCompact( + src_ids, dst_ids, unique_dst_ids, rank, world_size); + }); } + TORCH_CHECK( + world_size <= 1, + "Cooperative Minibatching (arXiv:2310.12403) is supported only on GPUs."); auto num_dst = unique_dst_ids.size(0); torch::Tensor ids = torch::cat({unique_dst_ids, src_ids}); return AT_DISPATCH_INDEX_TYPES( @@ -38,7 +44,8 @@ std::vector> UniqueAndCompactBatched( const std::vector& src_ids, const std::vector& dst_ids, - const std::vector unique_dst_ids) { + const std::vector unique_dst_ids, const int64_t rank, + const int64_t world_size) { TORCH_CHECK( src_ids.size() == dst_ids.size() && dst_ids.size() == unique_dst_ids.size(), @@ -53,14 +60,15 @@ UniqueAndCompactBatched( if (all_on_gpu) { GRAPHBOLT_DISPATCH_CUDA_ONLY_DEVICE( c10::DeviceType::CUDA, "unique_and_compact", { - return ops::UniqueAndCompactBatched(src_ids, dst_ids, 
unique_dst_ids); + return ops::UniqueAndCompactBatched( + src_ids, dst_ids, unique_dst_ids, rank, world_size); }); } std::vector> results; results.reserve(src_ids.size()); for (std::size_t i = 0; i < src_ids.size(); i++) { - results.emplace_back( - UniqueAndCompact(src_ids[i], dst_ids[i], unique_dst_ids[i])); + results.emplace_back(UniqueAndCompact( + src_ids[i], dst_ids[i], unique_dst_ids[i], rank, world_size)); } return results; } @@ -70,9 +78,13 @@ c10::intrusive_ptr& src_ids, const std::vector& dst_ids, - const std::vector unique_dst_ids) { + const std::vector unique_dst_ids, const int64_t rank, + const int64_t world_size) { return async( - [=] { return UniqueAndCompactBatched(src_ids, dst_ids, unique_dst_ids); }, + [=] { + return UniqueAndCompactBatched( + src_ids, dst_ids, unique_dst_ids, rank, world_size); + }, utils::is_on_gpu(src_ids.at(0))); } diff --git a/python/dgl/graphbolt/internal/sample_utils.py b/python/dgl/graphbolt/internal/sample_utils.py index ba732ff20810..d840d5bf4ad5 100644 --- a/python/dgl/graphbolt/internal/sample_utils.py +++ b/python/dgl/graphbolt/internal/sample_utils.py @@ -44,7 +44,7 @@ def unique_and_compact_per_type(nodes): nodes = torch.cat(nodes) empty_tensor = nodes.new_empty(0) unique, compacted, _ = torch.ops.graphbolt.unique_and_compact( - nodes, empty_tensor, empty_tensor + nodes, empty_tensor, empty_tensor, 0, 1 ) compacted = compacted.split(nums) return unique, list(compacted) @@ -218,7 +218,7 @@ def unique_and_compact_csc_formats( if async_op else torch.ops.graphbolt.unique_and_compact_batched ) - results = unique_fn(indice_list, dst_list, unique_dst_list) + results = unique_fn(indice_list, dst_list, unique_dst_list, 0, 1) class _Waiter: def __init__(self, future, csc_formats): diff --git a/src/array/cuda/labor_sampling.cu b/src/array/cuda/labor_sampling.cu index 653e9de69cbb..c5076e62981d 100644 --- a/src/array/cuda/labor_sampling.cu +++ b/src/array/cuda/labor_sampling.cu @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/third_party/cccl b/third_party/cccl index c67b1c3257be..709ddec37ff8 160000 --- a/third_party/cccl +++ b/third_party/cccl @@ -1 +1 @@ -Subproject commit c67b1c3257be5115253f06d45a2d607b54234db4 +Subproject commit 709ddec37ff87e6087097ed6e49526dac21dcbc9 diff --git a/third_party/cuco b/third_party/cuco index 6c0d7ee190a1..4454de4b878f 160000 --- a/third_party/cuco +++ b/third_party/cuco @@ -1 +1 @@ -Subproject commit 6c0d7ee190a167c6976af0f7c628a43d13b78f38 +Subproject commit 4454de4b878f31d41c5b7578fe6ca24bba5ea3f4 From c4c00370475432152a709850a756b1b81ad76cb9 Mon Sep 17 00:00:00 2001 From: Renjie LIU Date: Wed, 4 Sep 2024 08:09:33 +0800 Subject: [PATCH 70/78] Include header file in utils.h (#7763) --- graphbolt/src/utils.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/graphbolt/src/utils.h b/graphbolt/src/utils.h index 886dca0a8f77..37a8c78489cb 100644 --- a/graphbolt/src/utils.h +++ b/graphbolt/src/utils.h @@ -9,6 +9,8 @@ #include +#include + namespace graphbolt { namespace utils { From 7aabe208b267cdc8067231ad00ea2680b3f272b4 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 4 Sep 2024 00:08:48 -0400 Subject: [PATCH 71/78] [GraphBolt][CUDA] Refine the added partition function for later reuse. 
(#7769) --- .../cuda/cooperative_minibatching_utils.cu | 115 ++++++++++++++++++ .../src/cuda/cooperative_minibatching_utils.h | 104 ++++++++++++++++ .../cuda/extension/unique_and_compact_map.cu | 46 +------ 3 files changed, 225 insertions(+), 40 deletions(-) create mode 100644 graphbolt/src/cuda/cooperative_minibatching_utils.cu create mode 100644 graphbolt/src/cuda/cooperative_minibatching_utils.h diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu new file mode 100644 index 000000000000..8a632bb809b4 --- /dev/null +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu @@ -0,0 +1,115 @@ +/** + * Copyright (c) 2024, mfbalin (Muhammed Fatih Balin) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file cuda/cooperative_minibatching_utils.cu + * @brief Cooperative Minibatching (arXiv:2310.12403) utility function + * implementations in CUDA. + */ +#include + +#include +#include + +#include "./common.h" +#include "./cooperative_minibatching_utils.h" +#include "./utils.h" + +namespace graphbolt { +namespace cuda { + +torch::Tensor RankAssignment( + torch::Tensor nodes, const int64_t rank, const int64_t world_size) { + auto part_ids = torch::empty_like(nodes, nodes.options().dtype(kPartDType)); + auto part_ids_ptr = part_ids.data_ptr(); + AT_DISPATCH_INDEX_TYPES( + nodes.scalar_type(), "RankAssignment", ([&] { + auto nodes_ptr = nodes.data_ptr(); + THRUST_CALL( + transform, nodes_ptr, nodes_ptr + nodes.numel(), part_ids_ptr, + ::cuda::proclaim_return_type( + [rank = static_cast(rank), + world_size = static_cast( + world_size)] __device__(index_t id) -> part_t { + return rank_assignment(id, rank, world_size); + })); + })); + return part_ids; +} + +std::pair RankSortImpl( + torch::Tensor nodes, torch::Tensor part_ids, torch::Tensor offsets_dev, + const int64_t world_size) { + const int num_bits = cuda::NumberOfBits(world_size); + auto offsets_dev_ptr = offsets_dev.data_ptr(); + auto part_ids_sorted = torch::empty_like(part_ids); + auto part_ids2 = part_ids.clone(); + auto part_ids2_sorted = torch::empty_like(part_ids2); + auto nodes_sorted = torch::empty_like(nodes); + auto index = torch::arange(nodes.numel(), nodes.options()); + auto index_sorted = torch::empty_like(index); + AT_DISPATCH_INDEX_TYPES( + nodes.scalar_type(), "RankSortImpl", ([&] { + CUB_CALL( + DeviceSegmentedRadixSort::SortPairs, + part_ids.data_ptr(), + part_ids_sorted.data_ptr(), nodes.data_ptr(), + nodes_sorted.data_ptr(), nodes.numel(), + offsets_dev.numel() - 1, offsets_dev_ptr, offsets_dev_ptr + 1, 0, + num_bits); + CUB_CALL( + DeviceSegmentedRadixSort::SortPairs, + part_ids2.data_ptr(), + part_ids2_sorted.data_ptr(), + index.data_ptr(), index_sorted.data_ptr(), + nodes.numel(), offsets_dev.numel() - 1, offsets_dev_ptr, + offsets_dev_ptr + 1, 0, num_bits); + })); + return {nodes_sorted, index_sorted}; +} + +std::vector> RankSort( + std::vector& nodes_list, const int64_t rank, + const int64_t world_size) { + const 
auto num_batches = nodes_list.size(); + auto nodes = torch::cat(nodes_list, 0); + auto offsets = torch::empty( + num_batches + 1, + c10::TensorOptions().dtype(torch::kInt64).pinned_memory(true)); + auto offsets_ptr = offsets.data_ptr(); + offsets_ptr[0] = 0; + for (int64_t i = 0; i < num_batches; i++) { + offsets_ptr[i + 1] = offsets_ptr[i] + nodes_list[i].numel(); + } + auto part_ids = RankAssignment(nodes, rank, world_size); + auto offsets_dev = + torch::empty_like(offsets, nodes.options().dtype(offsets.scalar_type())); + CUDA_CALL(cudaMemcpyAsync( + offsets_dev.data_ptr(), offsets_ptr, + sizeof(int64_t) * offsets.numel(), cudaMemcpyHostToDevice, + cuda::GetCurrentStream())); + auto [nodes_sorted, index_sorted] = + RankSortImpl(nodes, part_ids, offsets_dev, world_size); + std::vector> results; + for (int64_t i = 0; i < num_batches; i++) { + results.emplace_back( + nodes_sorted.slice(0, offsets_ptr[i], offsets_ptr[i + 1]), + index_sorted.slice(0, offsets_ptr[i], offsets_ptr[i + 1])); + } + return results; +} + +} // namespace cuda +} // namespace graphbolt diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.h b/graphbolt/src/cuda/cooperative_minibatching_utils.h new file mode 100644 index 000000000000..ba9357063b72 --- /dev/null +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.h @@ -0,0 +1,104 @@ +/** + * Copyright (c) 2024, mfbalin (Muhammed Fatih Balin) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file cuda/cooperative_minibatching_utils.h + * @brief Cooperative Minibatching (arXiv:2310.12403) utility function headers + * in CUDA. + */ +#ifndef GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_H_ +#define GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_H_ + +#include +#include + +namespace graphbolt { +namespace cuda { + +using part_t = uint8_t; +constexpr auto kPartDType = torch::kUInt8; + +/** + * @brief Given a vertex id, the rank of current GPU and the world size, returns + * the rank that this id belongs in a deterministic manner. + * + * @param id The node id that will mapped to a rank in [0, world_size). + * @param rank The rank of the current GPU. + * @param world_size The world size, the total number of cooperating GPUs. + * + * @return The rank of the GPU the given id is mapped to. + */ +template +__device__ inline auto rank_assignment( + index_t id, uint32_t rank, uint32_t world_size) { + // Consider using a faster implementation in the future. + constexpr uint64_t kCurandSeed = 999961; // Any random number. + curandStatePhilox4_32_10_t rng; + curand_init(kCurandSeed, 0, id, &rng); + return (curand(&rng) - rank) % world_size; +} + +/** + * @brief Given node ids, the rank of current GPU and the world size, returns + * the ranks that the given ids belong in a deterministic manner. + * + * @param nodes Node id tensor to be mapped to a rank in [0, world_size). + * @param rank Rank of the current GPU. + * @param world_size World size, the total number of cooperating GPUs. 
+ * + * @return The rank tensor of the GPU the given id tensor is mapped to. + */ +torch::Tensor RankAssignment( + torch::Tensor nodes, int64_t rank, int64_t world_size); + +/** + * @brief Given node ids, the ranks they belong, the offsets to separate + * different node types and num_bits indicating the world size is <= 2^num_bits, + * returns node ids sorted w.r.t. the ranks that the given ids belong along with + * the original positions. + * + * @param nodes Node id tensor to be mapped to a rank in [0, world_size). + * @param part_ids Rank tensor the nodes belong to. + * @param offsets_dev Offsets to separate different node types. + * @param world_size World size, the total number of cooperating GPUs. + * + * @return (sorted_nodes, original_positions), where the first + * one includes sorted nodes, the second contains original positions of the + * sorted nodes. + */ +std::pair RankSortImpl( + torch::Tensor nodes, torch::Tensor part_ids, torch::Tensor offsets_dev, + int64_t world_size); + +/** + * @brief Given a vector of node ids, the rank of current GPU and the world + * size, returns node ids sorted w.r.t. the ranks that the given ids belong + * along with the original positions. + * + * @param nodes_list Node id tensor to be mapped to a rank in [0, world_size). + * @param rank Rank of the current GPU. + * @param world_size World size, the total number of cooperating GPUs. + * + * @return vector of (sorted_nodes, original_positions), where the first + * one includes sorted nodes, the second contains original positions of the + * sorted nodes. + */ +std::vector> RankSort( + std::vector& nodes_list, int64_t rank, int64_t world_size); + +} // namespace cuda +} // namespace graphbolt + +#endif // GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_H_ diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index 3806b272525a..ed14127d7b6f 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -17,7 +17,6 @@ * @file cuda/unique_and_compact_map.cu * @brief Unique and compact operator implementation on CUDA using hash table. */ -#include #include #include #include @@ -32,26 +31,13 @@ #include #include "../common.h" +#include "../cooperative_minibatching_utils.h" #include "../utils.h" #include "./unique_and_compact.h" namespace graphbolt { namespace ops { -using part_t = uint8_t; -constexpr auto kPartDType = torch::kUInt8; - -// Returns the rotated part id so that current rank's part id is 0. -template -__device__ inline auto partition_assignment( - index_t id, uint32_t rank, uint32_t world_size) { - // Consider using a faster implementation in the future. - constexpr uint64_t kCurandSeed = 999961; // Any random number. - curandStatePhilox4_32_10_t rng; - curand_init(kCurandSeed, 0, id, &rng); - return (curand(&rng) - rank) % world_size; -} - // Support graphs with up to 2^kNodeIdBits nodes. constexpr int kNodeIdBits = 40; @@ -233,7 +219,7 @@ UniqueAndCompactBatchedHashMapBased( if (world_size > 1) { part_ids = torch::empty( offsets_ptr[2 * num_batches], - src_ids[0].options().dtype(kPartDType)); + src_ids[0].options().dtype(cuda::kPartDType)); } auto unique_ids = torch::empty(offsets_ptr[2 * num_batches], src_ids[0].options()); @@ -245,7 +231,7 @@ UniqueAndCompactBatchedHashMapBased( ::cuda::proclaim_return_type( [=, unique_ids_ptr = unique_ids.data_ptr(), part_ids_ptr = - part_ids ? part_ids->data_ptr() : nullptr, + part_ids ? 
part_ids->data_ptr() : nullptr, rank = static_cast(rank), world_size = static_cast( world_size)] __device__(const int64_t i, const auto& t) { @@ -255,7 +241,7 @@ UniqueAndCompactBatchedHashMapBased( unique_ids_ptr[i] = node_id; if (part_ids_ptr) { part_ids_ptr[i] = - partition_assignment(node_id, rank, world_size); + cuda::rank_assignment(node_id, rank, world_size); } if (is_i_equal_batch_offset) { const auto batch_index = ::cuda::std::get<2>(t); @@ -281,28 +267,8 @@ UniqueAndCompactBatchedHashMapBased( unique_ids_offsets_event.record(); torch::optional index; if (part_ids) { - auto part_ids_sorted = torch::empty_like(*part_ids); - auto part_ids2 = part_ids->clone(); - auto part_ids2_sorted = torch::empty_like(part_ids2); - auto unique_ids_sorted = torch::empty_like(unique_ids); - index = torch::arange(unique_ids.size(0), unique_ids.options()); - auto index_sorted = torch::empty_like(*index); - CUB_CALL( - DeviceSegmentedRadixSort::SortPairs, part_ids->data_ptr(), - part_ids_sorted.data_ptr(), - unique_ids.data_ptr(), - unique_ids_sorted.data_ptr(), unique_ids.size(0), - num_batches, unique_ids_offsets_dev_ptr, - unique_ids_offsets_dev_ptr + 1, 0, - cuda::NumberOfBits(world_size)); - unique_ids = unique_ids_sorted; - CUB_CALL( - DeviceSegmentedRadixSort::SortPairs, part_ids2.data_ptr(), - part_ids2_sorted.data_ptr(), index->data_ptr(), - index_sorted.data_ptr(), unique_ids.size(0), num_batches, - unique_ids_offsets_dev_ptr, unique_ids_offsets_dev_ptr + 1, 0, - cuda::NumberOfBits(world_size)); - index = index_sorted; + std::tie(unique_ids, index) = cuda::RankSortImpl( + unique_ids, *part_ids, unique_ids_offsets_dev, world_size); } auto mapped_ids = torch::empty(offsets_ptr[3 * num_batches], unique_ids.options()); From b60ad557f5e0fad7f46017408cdf7151a6b4b620 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Tue, 3 Sep 2024 23:55:58 -0500 Subject: [PATCH 72/78] [Graphbolt][Dataset] Add igb-hom-large to scripts (#7770) Co-authored-by: Muhammed Fatih BALIN --- .../graphbolt/disk_based_feature/node_classification.py | 1 + examples/graphbolt/node_classification.py | 1 + examples/graphbolt/pyg/labor/node_classification.py | 1 + examples/graphbolt/pyg/node_classification_advanced.py | 3 ++- python/dgl/graphbolt/impl/ondisk_dataset.py | 6 ++++-- 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/graphbolt/disk_based_feature/node_classification.py b/examples/graphbolt/disk_based_feature/node_classification.py index 450a61136593..144c5059a2f2 100644 --- a/examples/graphbolt/disk_based_feature/node_classification.py +++ b/examples/graphbolt/disk_based_feature/node_classification.py @@ -339,6 +339,7 @@ def parse_args(): "igb-hom-tiny", "igb-hom-small", "igb-hom-medium", + "igb-hom-large", ], ) parser.add_argument("--root", type=str, default="datasets") diff --git a/examples/graphbolt/node_classification.py b/examples/graphbolt/node_classification.py index 6b0dee719746..810f41b679dd 100644 --- a/examples/graphbolt/node_classification.py +++ b/examples/graphbolt/node_classification.py @@ -370,6 +370,7 @@ def parse_args(): "igb-hom-tiny", "igb-hom-small", "igb-hom-medium", + "igb-hom-large", ], help="The dataset we can use for node classification example. 
Currently" " ogbn-products, ogbn-arxiv, ogbn-papers100M and" diff --git a/examples/graphbolt/pyg/labor/node_classification.py b/examples/graphbolt/pyg/labor/node_classification.py index 7129d517c8c6..e42aed10c3a6 100644 --- a/examples/graphbolt/pyg/labor/node_classification.py +++ b/examples/graphbolt/pyg/labor/node_classification.py @@ -366,6 +366,7 @@ def parse_args(): "igb-hom-tiny", "igb-hom-small", "igb-hom-medium", + "igb-hom-large", "reddit", "yelp", "flickr", diff --git a/examples/graphbolt/pyg/node_classification_advanced.py b/examples/graphbolt/pyg/node_classification_advanced.py index 576335b5bdf1..743fc4d3cf02 100644 --- a/examples/graphbolt/pyg/node_classification_advanced.py +++ b/examples/graphbolt/pyg/node_classification_advanced.py @@ -342,10 +342,11 @@ def parse_args(): "igb-hom-tiny", "igb-hom-small", "igb-hom-medium", + "igb-hom-large", ], help="The dataset we can use for node classification example. Currently" " ogbn-products, ogbn-arxiv, ogbn-papers100M and" - " igb-hom-[tiny|small|medium] datasets are supported.", + " igb-hom-[tiny|small|medium|large] datasets are supported.", ) parser.add_argument( "--fanout", diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index df3b51f8b074..3318491d2888 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -979,10 +979,10 @@ class BuiltinDataset(OnDiskDataset): .. note:: Reverse edges are added to the original graph. - **igb-hom-[tiny|small|medium]** + **igb-hom-[tiny|small|medium|large]** The igb-hom-[tiny|small|medium] dataset is a homogeneous citation network, which is designed for developers to train and evaluate GNN models with - high fidelity. See more details in `igb-hom-[tiny|small|medium] + high fidelity. See more details in `igb-hom-[tiny|small|medium|large] `_. .. note:: @@ -1026,6 +1026,8 @@ class BuiltinDataset(OnDiskDataset): "ogbn-papers100M-seeds", "igb-hom-medium", "igb-hom-medium-seeds", + "igb-hom-large", + "igb-hom-large-seeds", ] _all_datasets = _datasets + _large_datasets From 8fd6c6803197c9b2c4cb2442113dd7a6fede70a0 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 4 Sep 2024 01:27:50 -0400 Subject: [PATCH 73/78] [GraphBolt][Doc] Clarify `gb.fused_csc_sampling_graph` `type_per_edge` arg. (#7771) --- python/dgl/graphbolt/impl/fused_csc_sampling_graph.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py index cc9137092cf6..5c06920e8cd4 100644 --- a/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py @@ -1589,7 +1589,11 @@ def fused_csc_sampling_graph( node_type_offset : Optional[torch.tensor], optional Offset of node types in the graph, by default None. type_per_edge : Optional[torch.tensor], optional - Type ids of each edge in the graph, by default None. + Type ids of each edge in the graph, by default None. If provided, it is + required that the edge types in each vertex neighborhood are in sorted + order. To be more precise, For each i in [0, csc_indptr.size(0) - 1), + `type_per_edge[indptr[i]: indptr[i + 1]]` is expected to be + monotonically nondecreasing. node_type_to_id : Optional[Dict[str, int]], optional Map node types to ids, by default None. 
edge_type_to_id : Optional[Dict[str, int]], optional From 48f47d50a73bb328af6681e943d4ac792e182bdb Mon Sep 17 00:00:00 2001 From: Stanley Yang Date: Wed, 4 Sep 2024 16:33:35 +0800 Subject: [PATCH 74/78] [UnitTest] Fix "[UnitTest] AttributeError: module 'numpy' has no attribute 'ComplexWarning'" in #7766 (#7768) Co-authored-by: Stanley Yang Co-authored-by: Muhammed Fatih BALIN --- tests/python/common/sampling/test_sampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/common/sampling/test_sampling.py b/tests/python/common/sampling/test_sampling.py index 74c2b000ba96..f52ea3c2e5dc 100644 --- a/tests/python/common/sampling/test_sampling.py +++ b/tests/python/common/sampling/test_sampling.py @@ -1774,7 +1774,7 @@ def contain_edge(g, sg, u, v): @pytest.mark.parametrize("dtype", ["int32", "int64"]) def test_global_uniform_negative_sampling(dtype): - warnings.simplefilter("ignore", np.ComplexWarning) + warnings.simplefilter("ignore", np.exceptions.ComplexWarning) g = dgl.graph(([], []), num_nodes=1000).to(F.ctx()) src, dst = dgl.sampling.global_uniform_negative_sampling( g, 2000, False, True From 4c8653339c948d8cad710c4013dd8e7a708798ed Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 5 Sep 2024 00:00:36 -0400 Subject: [PATCH 75/78] [GraphBolt][CUDA] Fix hetero `UniqueAndCompact` bug. (#7773) --- .../cuda/extension/unique_and_compact_map.cu | 53 +++++++++++++------ 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index ed14127d7b6f..48e9941a7f77 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -18,8 +18,10 @@ * @brief Unique and compact operator implementation on CUDA using hash table. 
*/ #include +#include #include #include +#include #include #include @@ -193,10 +195,9 @@ UniqueAndCompactBatchedHashMapBased( auto input_it = thrust::make_transform_iterator( index_it, ::cuda::proclaim_return_type< - ::cuda::std::tuple>( + ::cuda::std::tuple>( [=, map = map.ref(cuco::find)] __device__(auto it) - -> ::cuda::std::tuple< - int64_t*, index_t, int32_t, bool, bool> { + -> ::cuda::std::tuple { const auto i = it.key; const auto tensor_index = it.value; const auto tensor_offset = i - offsets_dev_ptr[tensor_index]; @@ -211,9 +212,7 @@ UniqueAndCompactBatchedHashMapBased( auto slot = map.find(key); const auto valid = slot->second == i; - return { - &slot->second, node_id, batch_index, valid, - i == batch_offset}; + return {&slot->second, node_id, batch_index, valid}; })); torch::optional part_ids; if (world_size > 1) { @@ -223,8 +222,9 @@ UniqueAndCompactBatchedHashMapBased( } auto unique_ids = torch::empty(offsets_ptr[2 * num_batches], src_ids[0].options()); - auto unique_ids_offsets_dev = torch::empty( - num_batches + 1, src_ids[0].options().dtype(torch::kInt64)); + auto unique_ids_offsets_dev = torch::full( + num_batches + 1, std::numeric_limits::max(), + src_ids[0].options().dtype(torch::kInt64)); auto unique_ids_offsets_dev_ptr = unique_ids_offsets_dev.data_ptr(); auto output_it = thrust::make_tabulate_output_iterator( @@ -237,16 +237,16 @@ UniqueAndCompactBatchedHashMapBased( world_size)] __device__(const int64_t i, const auto& t) { *::cuda::std::get<0>(t) = i; const auto node_id = ::cuda::std::get<1>(t); - const auto is_i_equal_batch_offset = ::cuda::std::get<4>(t); unique_ids_ptr[i] = node_id; if (part_ids_ptr) { part_ids_ptr[i] = cuda::rank_assignment(node_id, rank, world_size); } - if (is_i_equal_batch_offset) { - const auto batch_index = ::cuda::std::get<2>(t); - unique_ids_offsets_dev_ptr[batch_index] = i; - } + const auto batch_index = ::cuda::std::get<2>(t); + auto ref = + ::cuda::atomic_ref{ + unique_ids_offsets_dev_ptr[batch_index]}; + ref.fetch_min(i, ::cuda::memory_order_relaxed); })); CUB_CALL( DeviceSelect::If, input_it, output_it, @@ -259,10 +259,29 @@ UniqueAndCompactBatchedHashMapBased( num_batches + 1, c10::TensorOptions().dtype(torch::kInt64).pinned_memory(true)); auto unique_ids_offsets_ptr = unique_ids_offsets.data_ptr(); - CUDA_CALL(cudaMemcpyAsync( - unique_ids_offsets_ptr, unique_ids_offsets_dev_ptr, - sizeof(int64_t) * (num_batches + 1), cudaMemcpyDeviceToHost, - stream)); + { + auto unique_ids_offsets_dev2 = + torch::empty_like(unique_ids_offsets_dev); + CUB_CALL( + DeviceScan::InclusiveScan, + thrust::make_reverse_iterator( + num_batches + 1 + unique_ids_offsets_dev_ptr), + thrust::make_reverse_iterator( + num_batches + 1 + + thrust::make_transform_output_iterator( + thrust::make_zip_iterator( + unique_ids_offsets_dev2.data_ptr(), + unique_ids_offsets_ptr), + ::cuda::proclaim_return_type< + thrust::tuple>( + [=] __device__(const auto x) { + return thrust::make_tuple(x, x); + }))), + cub::Min{}, num_batches + 1); + unique_ids_offsets_dev = unique_ids_offsets_dev2; + unique_ids_offsets_dev_ptr = + unique_ids_offsets_dev.data_ptr(); + } at::cuda::CUDAEvent unique_ids_offsets_event; unique_ids_offsets_event.record(); torch::optional index; From 0734e33e0e0af8a6a54feb8e3a968e78d758af7c Mon Sep 17 00:00:00 2001 From: Wenxuan Cao <90617523+CfromBU@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:08:28 +0800 Subject: [PATCH 76/78] [DistGB] save as graphbolt graph directly after partition test case (#7724) Co-authored-by: Ubuntu Co-authored-by: Ubuntu --- 
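Note on the helper change that drives this test refactor: `_get_inner_node_mask` and `_get_inner_edge_mask` in python/dgl/distributed/partition.py now accept either a `DGLGraph` (data in `ndata`/`edata`) or a `gb.FusedCSCSamplingGraph` (data in `node_attributes`/`edge_attributes`), and the node variant can fall back to the partition book to recover node types when NTYPE is not stored. A simplified sketch of the node-mask logic, assuming plain torch tensors and a partition book `gpb` exposing `map_to_per_ntype` (illustrative only, not the exact helper):

    import torch
    from dgl import NID, NTYPE  # reserved node-ID / node-type field names

    def inner_node_mask(ndata, ntype_id, gpb=None):
        # `ndata` stands in for graph.ndata (DGLGraph) or
        # graph.node_attributes (FusedCSCSamplingGraph).
        assert "inner_node" in ndata
        if NTYPE in ndata or gpb is not None:
            # Recover per-node types either from stored data or by mapping
            # global node IDs through the partition book.
            ntype = (
                gpb.map_to_per_ntype(ndata[NID])[0]
                if gpb is not None
                else ndata[NTYPE]
            )
            inner = ndata["inner_node"].to(torch.int64)
            return inner * (ntype == ntype_id).to(torch.int64) == 1
        return ndata["inner_node"] == 1
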
python/dgl/distributed/partition.py | 51 +- tests/distributed/test_partition.py | 1072 ++++++++++++++++++++++----- 2 files changed, 905 insertions(+), 218 deletions(-) diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index 73ea48959597..07601fd5d2ca 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -109,28 +109,45 @@ def _save_graphs(filename, g_list, formats=None, sort_etypes=False): save_graphs(filename, g_list, formats=formats) -def _get_inner_node_mask(graph, ntype_id): - if NTYPE in graph.ndata: - dtype = F.dtype(graph.ndata["inner_node"]) - return ( - graph.ndata["inner_node"] - * F.astype(graph.ndata[NTYPE] == ntype_id, dtype) - == 1 +def _get_inner_node_mask(graph, ntype_id, gpb=None): + ndata = ( + graph.node_attributes + if isinstance(graph, gb.FusedCSCSamplingGraph) + else graph.ndata + ) + assert "inner_node" in ndata, "'inner_node' is not in nodes' data" + if NTYPE in ndata or gpb is not None: + ntype = ( + gpb.map_to_per_ntype(ndata[NID])[0] + if gpb is not None + else ndata[NTYPE] ) + dtype = F.dtype(ndata["inner_node"]) + return ndata["inner_node"] * F.astype(ntype == ntype_id, dtype) == 1 else: - return graph.ndata["inner_node"] == 1 + return ndata["inner_node"] == 1 -def _get_inner_edge_mask(graph, etype_id): - if ETYPE in graph.edata: - dtype = F.dtype(graph.edata["inner_edge"]) - return ( - graph.edata["inner_edge"] - * F.astype(graph.edata[ETYPE] == etype_id, dtype) - == 1 - ) +def _get_inner_edge_mask( + graph, + etype_id, +): + edata = ( + graph.edge_attributes + if isinstance(graph, gb.FusedCSCSamplingGraph) + else graph.edata + ) + assert "inner_edge" in edata, "'inner_edge' is not in edges' data" + etype = ( + graph.type_per_edge + if isinstance(graph, gb.FusedCSCSamplingGraph) + else (graph.edata[ETYPE] if ETYPE in graph.edata else None) + ) + if etype is not None: + dtype = F.dtype(edata["inner_edge"]) + return edata["inner_edge"] * F.astype(etype == etype_id, dtype) == 1 else: - return graph.edata["inner_edge"] == 1 + return edata["inner_edge"] == 1 def _get_part_ranges(id_ranges): diff --git a/tests/distributed/test_partition.py b/tests/distributed/test_partition.py index 5fb121750e01..32e2bdc4fea9 100644 --- a/tests/distributed/test_partition.py +++ b/tests/distributed/test_partition.py @@ -5,6 +5,7 @@ import dgl import dgl.backend as F +import dgl.graphbolt as gb import numpy as np import pytest import torch as th @@ -36,11 +37,26 @@ def _verify_partition_data_types(part_g): + """ + check list: + make sure nodes and edges have correct type. + """ + ndata = ( + part_g.node_attributes + if isinstance(part_g, gb.FusedCSCSamplingGraph) + else part_g.ndata + ) + edata = ( + part_g.edge_attributes + if isinstance(part_g, gb.FusedCSCSamplingGraph) + else part_g.edata + ) + for k, dtype in RESERVED_FIELD_DTYPE.items(): - if k in part_g.ndata: - assert part_g.ndata[k].dtype == dtype - if k in part_g.edata: - assert part_g.edata[k].dtype == dtype + if k in ndata: + assert ndata[k].dtype == dtype + if k in edata: + assert edata[k].dtype == dtype def _verify_partition_formats(part_g, formats): @@ -81,98 +97,220 @@ def create_random_hetero(): return dgl.heterograph(edges, num_nodes) -def verify_hetero_graph(g, parts): +def _verify_graphbolt_attributes( + parts, store_inner_node, store_inner_edge, store_eids +): + """ + check list: + make sure arguments work. 
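+        make sure each stored attribute ('inner_node', 'inner_edge', dgl.EID) matches its store_* flag.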
+ """ + for part in parts: + assert store_inner_edge == ("inner_edge" in part.edge_attributes) + assert store_inner_node == ("inner_node" in part.node_attributes) + assert store_eids == (dgl.EID in part.edge_attributes) + + +def _verify_hetero_graph_node_edge_num( + g, + parts, + store_inner_edge, + debug_mode, +): + """ + check list: + make sure edge type are correct. + make sure the number of nodes in each node type are correct. + make sure the number of nodes in each node type are correct. + """ num_nodes = {ntype: 0 for ntype in g.ntypes} num_edges = {etype: 0 for etype in g.canonical_etypes} for part in parts: - assert len(g.ntypes) == len(F.unique(part.ndata[dgl.NTYPE])) - assert len(g.canonical_etypes) == len(F.unique(part.edata[dgl.ETYPE])) - for ntype in g.ntypes: - ntype_id = g.get_ntype_id(ntype) - inner_node_mask = _get_inner_node_mask(part, ntype_id) - num_inner_nodes = F.sum(F.astype(inner_node_mask, F.int64), 0) - num_nodes[ntype] += num_inner_nodes - for etype in g.canonical_etypes: - etype_id = g.get_etype_id(etype) - inner_edge_mask = _get_inner_edge_mask(part, etype_id) - num_inner_edges = F.sum(F.astype(inner_edge_mask, F.int64), 0) - num_edges[etype] += num_inner_edges + edata = ( + part.edge_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.edata + ) + if dgl.ETYPE in edata: + assert len(g.canonical_etypes) == len(F.unique(edata[dgl.ETYPE])) + if debug_mode or isinstance(part, dgl.DGLGraph): + for ntype in g.ntypes: + ntype_id = g.get_ntype_id(ntype) + inner_node_mask = _get_inner_node_mask(part, ntype_id) + num_inner_nodes = F.sum(F.astype(inner_node_mask, F.int64), 0) + num_nodes[ntype] += num_inner_nodes + if store_inner_edge or isinstance(part, dgl.DGLGraph): + for etype in g.canonical_etypes: + etype_id = g.get_etype_id(etype) + inner_edge_mask = _get_inner_edge_mask(part, etype_id) + num_inner_edges = F.sum(F.astype(inner_edge_mask, F.int64), 0) + num_edges[etype] += num_inner_edges + # Verify the number of nodes are correct. - for ntype in g.ntypes: - print( - "node {}: {}, {}".format( - ntype, g.num_nodes(ntype), num_nodes[ntype] + if debug_mode or isinstance(part, dgl.DGLGraph): + for ntype in g.ntypes: + print( + "node {}: {}, {}".format( + ntype, g.num_nodes(ntype), num_nodes[ntype] + ) ) - ) - assert g.num_nodes(ntype) == num_nodes[ntype] + assert g.num_nodes(ntype) == num_nodes[ntype] # Verify the number of edges are correct. - for etype in g.canonical_etypes: - print( - "edge {}: {}, {}".format( - etype, g.num_edges(etype), num_edges[etype] + if store_inner_edge or isinstance(part, dgl.DGLGraph): + for etype in g.canonical_etypes: + print( + "edge {}: {}, {}".format( + etype, g.num_edges(etype), num_edges[etype] + ) ) + assert g.num_edges(etype) == num_edges[etype] + + +def _verify_edge_id_range_hetero( + g, + part, + eids, +): + """ + check list: + make sure inner_eids fall into a range. + make sure all edges are included. + """ + edata = ( + part.edge_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.edata + ) + etype = ( + part.type_per_edge + if isinstance(part, gb.FusedCSCSamplingGraph) + else edata[dgl.ETYPE] + ) + eid = th.arange(len(edata[dgl.EID])) + etype_arr = F.gather_row(etype, eid) + eid_arr = F.gather_row(edata[dgl.EID], eid) + for etype in g.canonical_etypes: + etype_id = g.get_etype_id(etype) + eids[etype].append(F.boolean_mask(eid_arr, etype_arr == etype_id)) + # Make sure edge Ids fall into a range. 
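+        # Inner edges of this type should form one contiguous block of global edge IDs.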
+ inner_edge_mask = _get_inner_edge_mask(part, etype_id) + inner_eids = np.sort( + F.asnumpy(F.boolean_mask(edata[dgl.EID], inner_edge_mask)) + ) + assert np.all( + inner_eids == np.arange(inner_eids[0], inner_eids[-1] + 1) ) - assert g.num_edges(etype) == num_edges[etype] + return eids - nids = {ntype: [] for ntype in g.ntypes} - eids = {etype: [] for etype in g.canonical_etypes} - for part in parts: - _, _, eid = part.edges(form="all") - etype_arr = F.gather_row(part.edata[dgl.ETYPE], eid) - eid_type = F.gather_row(part.edata[dgl.EID], eid) - for etype in g.canonical_etypes: - etype_id = g.get_etype_id(etype) - eids[etype].append(F.boolean_mask(eid_type, etype_arr == etype_id)) - # Make sure edge Ids fall into a range. - inner_edge_mask = _get_inner_edge_mask(part, etype_id) - inner_eids = np.sort( - F.asnumpy(F.boolean_mask(part.edata[dgl.EID], inner_edge_mask)) - ) - assert np.all( - inner_eids == np.arange(inner_eids[0], inner_eids[-1] + 1) - ) - for ntype in g.ntypes: - ntype_id = g.get_ntype_id(ntype) - # Make sure inner nodes have Ids fall into a range. - inner_node_mask = _get_inner_node_mask(part, ntype_id) - inner_nids = F.boolean_mask(part.ndata[dgl.NID], inner_node_mask) - assert np.all( - F.asnumpy( - inner_nids - == F.arange( - F.as_scalar(inner_nids[0]), - F.as_scalar(inner_nids[-1]) + 1, - ) +def _verify_node_id_range_hetero(g, part, nids): + """ + check list: + make sure inner nodes have Ids fall into a range. + """ + for ntype in g.ntypes: + ntype_id = g.get_ntype_id(ntype) + # Make sure inner nodes have Ids fall into a range. + inner_node_mask = _get_inner_node_mask(part, ntype_id) + inner_nids = F.boolean_mask( + part.node_attributes[dgl.NID], inner_node_mask + ) + assert np.all( + F.asnumpy( + inner_nids + == F.arange( + F.as_scalar(inner_nids[0]), + F.as_scalar(inner_nids[-1]) + 1, ) ) - nids[ntype].append(inner_nids) - - for ntype in nids: - nids_type = F.cat(nids[ntype], 0) - uniq_ids = F.unique(nids_type) - # We should get all nodes. - assert len(uniq_ids) == g.num_nodes(ntype) - for etype in eids: - eids_type = F.cat(eids[etype], 0) - uniq_ids = F.unique(eids_type) - assert len(uniq_ids) == g.num_edges(etype) - # TODO(zhengda) this doesn't check 'part_id' - - -def verify_graph_feats( - g, gpb, part, node_feats, edge_feats, orig_nids, orig_eids + ) + nids[ntype].append(inner_nids) + return nids + + +def _verify_graph_attributes_hetero( + g, + parts, + store_inner_edge, + store_inner_node, +): + """ + check list: + make sure edge ids fall into a range. + make sure inner nodes have Ids fall into a range. + make sure all nodes is included. + make sure all edges is included. + """ + nids = {ntype: [] for ntype in g.ntypes} + eids = {etype: [] for etype in g.canonical_etypes} + # check edge id. + if store_inner_edge or isinstance(parts[0], dgl.DGLGraph): + for part in parts: + # collect eids + eids = _verify_edge_id_range_hetero(g, part, eids) + for etype in eids: + eids_type = F.cat(eids[etype], 0) + uniq_ids = F.unique(eids_type) + # We should get all nodes. + assert len(uniq_ids) == g.num_edges(etype) + + # check node id. + if store_inner_node or isinstance(parts[0], dgl.DGLGraph): + for part in parts: + nids = _verify_node_id_range_hetero(g, part, nids) + for ntype in nids: + nids_type = F.cat(nids[ntype], 0) + uniq_ids = F.unique(nids_type) + # We should get all nodes. 
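+            # Inner node IDs gathered across partitions must cover every node of this type.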
+ assert len(uniq_ids) == g.num_nodes(ntype) + + +def _verify_hetero_graph( + g, + parts, + store_eids=False, + store_inner_edge=False, + store_inner_node=False, + debug_mode=False, ): + _verify_hetero_graph_node_edge_num( + g, + parts, + store_inner_edge=store_inner_edge, + debug_mode=debug_mode, + ) + if store_eids: + _verify_graph_attributes_hetero( + g, + parts, + store_inner_edge=store_inner_edge, + store_inner_node=store_inner_node, + ) + + +def _verify_node_feats(g, part, gpb, orig_nids, node_feats, is_homo=False): for ntype in g.ntypes: + ndata = ( + part.node_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.ndata + ) ntype_id = g.get_ntype_id(ntype) - inner_node_mask = _get_inner_node_mask(part, ntype_id) - inner_nids = F.boolean_mask(part.ndata[dgl.NID], inner_node_mask) + inner_node_mask = _get_inner_node_mask( + part, + ntype_id, + (gpb if isinstance(part, gb.FusedCSCSamplingGraph) else None), + ) + inner_nids = F.boolean_mask(ndata[dgl.NID], inner_node_mask) ntype_ids, inner_type_nids = gpb.map_to_per_ntype(inner_nids) partid = gpb.nid2partid(inner_type_nids, ntype) - assert np.all(F.asnumpy(ntype_ids) == ntype_id) - assert np.all(F.asnumpy(partid) == gpb.partid) - - orig_id = orig_nids[ntype][inner_type_nids] + if is_homo: + assert np.all(F.asnumpy(ntype_ids) == ntype_id) + assert np.all(F.asnumpy(partid) == gpb.partid) + + if is_homo: + orig_id = orig_nids[inner_type_nids] + else: + orig_id = orig_nids[ntype][inner_type_nids] local_nids = gpb.nid2localnid(inner_type_nids, gpb.partid, ntype) for name in g.nodes[ntype].data: @@ -182,16 +320,26 @@ def verify_graph_feats( ndata = F.gather_row(node_feats[ntype + "/" + name], local_nids) assert np.all(F.asnumpy(ndata == true_feats)) + +def _verify_edge_feats(g, part, gpb, orig_eids, edge_feats, is_homo=False): for etype in g.canonical_etypes: + edata = ( + part.edge_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.edata + ) etype_id = g.get_etype_id(etype) inner_edge_mask = _get_inner_edge_mask(part, etype_id) - inner_eids = F.boolean_mask(part.edata[dgl.EID], inner_edge_mask) + inner_eids = F.boolean_mask(edata[dgl.EID], inner_edge_mask) etype_ids, inner_type_eids = gpb.map_to_per_etype(inner_eids) partid = gpb.eid2partid(inner_type_eids, etype) assert np.all(F.asnumpy(etype_ids) == etype_id) assert np.all(F.asnumpy(partid) == gpb.partid) - orig_id = orig_eids[etype][inner_type_eids] + if is_homo: + orig_id = orig_eids[inner_type_eids] + else: + orig_id = orig_eids[etype][inner_type_eids] local_eids = gpb.eid2localeid(inner_type_eids, gpb.partid, etype) for name in g.edges[etype].data: @@ -199,11 +347,86 @@ def verify_graph_feats( continue true_feats = F.gather_row(g.edges[etype].data[name], orig_id) edata = F.gather_row( - edge_feats[_etype_tuple_to_str(etype) + "/" + name], local_eids + edge_feats[_etype_tuple_to_str(etype) + "/" + name], + local_eids, ) assert np.all(F.asnumpy(edata == true_feats)) +def verify_graph_feats_hetero_dgl( + g, + gpb, + part, + node_feats, + edge_feats, + orig_nids, + orig_eids, +): + """ + check list: + make sure the feats of nodes and edges are correct + """ + _verify_node_feats(g, part, gpb, orig_nids, node_feats) + + _verify_edge_feats(g, part, gpb, orig_eids, edge_feats) + + +def verify_graph_feats_gb( + g, + gpbs, + parts, + tot_node_feats, + tot_edge_feats, + orig_nids, + orig_eids, + shuffled_labels, + shuffled_edata, + test_ntype, + test_etype, + store_inner_node=False, + store_inner_edge=False, + store_eids=False, + is_homo=False, +): + """ + 
check list: + make sure the feats of nodes and edges are correct + """ + for part_id in range(len(parts)): + part = parts[part_id] + gpb = gpbs[part_id] + node_feats = tot_node_feats[part_id] + edge_feats = tot_edge_feats[part_id] + if store_inner_node: + _verify_node_feats( + g, + part, + gpb, + orig_nids, + node_feats, + is_homo=is_homo, + ) + if store_inner_edge and store_eids: + _verify_edge_feats( + g, + part, + gpb, + orig_eids, + edge_feats, + is_homo=is_homo, + ) + + _verify_shuffled_labels_gb( + g, + shuffled_labels, + shuffled_edata, + orig_nids, + orig_eids, + test_ntype, + test_etype, + ) + + def check_hetero_partition( hg, part_method, @@ -245,7 +468,7 @@ def check_hetero_partition( shuffled_labels = [] shuffled_elabels = [] for i in range(num_parts): - part_g, node_feats, edge_feats, gpb, _, ntypes, etypes = load_partition( + part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( "/tmp/partition/test.json", i, load_feats=load_feats ) _verify_partition_data_types(part_g) @@ -313,7 +536,7 @@ def check_hetero_partition( assert len(orig_eids1) == len(orig_eids2) assert np.all(F.asnumpy(orig_eids1) == F.asnumpy(orig_eids2)) parts.append(part_g) - verify_graph_feats( + verify_graph_feats_hetero_dgl( hg, gpb, part_g, node_feats, edge_feats, orig_nids, orig_eids ) @@ -321,8 +544,7 @@ def check_hetero_partition( shuffled_elabels.append( edge_feats[_etype_tuple_to_str(test_etype) + "/labels"] ) - verify_hetero_graph(hg, parts) - + _verify_hetero_graph(hg, parts) shuffled_labels = F.asnumpy(F.cat(shuffled_labels, 0)) shuffled_elabels = F.asnumpy(F.cat(shuffled_elabels, 0)) orig_labels = np.zeros(shuffled_labels.shape, dtype=shuffled_labels.dtype) @@ -789,8 +1011,6 @@ def test_dgl_partition_to_graphbolt_homo( orig_g.ndata["inner_node"], new_g.node_attributes["inner_node"], ) - else: - assert "inner_node" not in new_g.node_attributes if store_eids or debug_mode: assert orig_g.edata[dgl.EID].dtype == th.int64 assert new_g.edge_attributes[dgl.EID].dtype == th.int64 @@ -798,8 +1018,6 @@ def test_dgl_partition_to_graphbolt_homo( orig_g.edata[dgl.EID][orig_eids], new_g.edge_attributes[dgl.EID], ) - else: - assert dgl.EID not in new_g.edge_attributes if store_inner_edge or debug_mode: assert orig_g.edata["inner_edge"].dtype == th.uint8 assert new_g.edge_attributes["inner_edge"].dtype == th.uint8 @@ -807,8 +1025,6 @@ def test_dgl_partition_to_graphbolt_homo( orig_g.edata["inner_edge"][orig_eids], new_g.edge_attributes["inner_edge"], ) - else: - assert "inner_edge" not in new_g.edge_attributes assert new_g.type_per_edge is None assert new_g.node_type_to_id is None assert new_g.edge_type_to_id is None @@ -915,16 +1131,12 @@ def test_dgl_partition_to_graphbolt_hetero( orig_g.ndata["inner_node"], new_g.node_attributes["inner_node"], ) - else: - assert "inner_node" not in new_g.node_attributes if debug_mode: assert orig_g.ndata[dgl.NTYPE].dtype == th.int32 assert new_g.node_attributes[dgl.NTYPE].dtype == th.int8 assert th.equal( orig_g.ndata[dgl.NTYPE], new_g.node_attributes[dgl.NTYPE] ) - else: - assert dgl.NTYPE not in new_g.node_attributes if store_eids or debug_mode: assert orig_g.edata[dgl.EID].dtype == th.int64 assert new_g.edge_attributes[dgl.EID].dtype == th.int64 @@ -932,8 +1144,6 @@ def test_dgl_partition_to_graphbolt_hetero( orig_g.edata[dgl.EID][orig_eids], new_g.edge_attributes[dgl.EID], ) - else: - assert dgl.EID not in new_g.edge_attributes if store_inner_edge or debug_mode: assert orig_g.edata["inner_edge"].dtype == th.uint8 assert new_g.edge_attributes["inner_edge"].dtype == 
th.uint8 @@ -941,8 +1151,6 @@ def test_dgl_partition_to_graphbolt_hetero( orig_g.edata["inner_edge"], new_g.edge_attributes["inner_edge"], ) - else: - assert "inner_edge" not in new_g.edge_attributes if debug_mode: assert orig_g.edata[dgl.ETYPE].dtype == th.int32 assert new_g.edge_attributes[dgl.ETYPE].dtype == th.int8 @@ -950,8 +1158,6 @@ def test_dgl_partition_to_graphbolt_hetero( orig_g.edata[dgl.ETYPE][orig_eids], new_g.edge_attributes[dgl.ETYPE], ) - else: - assert dgl.ETYPE not in new_g.edge_attributes assert th.equal( orig_g.edata[dgl.ETYPE][orig_eids], new_g.type_per_edge ) @@ -1073,6 +1279,217 @@ def test_not_sorted_node_edge_map(): assert gpb.local_etype_offset == [0, 500, 1100, 1800, 2600] +def _get_part_IDs(part_g): + # These are partition-local IDs. + num_columns = part_g.csc_indptr.diff() + part_src_ids = part_g.indices + part_dst_ids = th.arange(part_g.total_num_nodes).repeat_interleave( + num_columns + ) + # These are reshuffled global homogeneous IDs. + part_src_ids = F.gather_row(part_g.node_attributes[dgl.NID], part_src_ids) + part_dst_ids = F.gather_row(part_g.node_attributes[dgl.NID], part_dst_ids) + return part_src_ids, part_dst_ids + + +def _verify_orig_edge_IDs_gb( + g, + orig_nids, + orig_eids, + part_eids, + part_src_ids, + part_dst_ids, + src_ntype=None, + dst_ntype=None, + etype=None, +): + """ + check list: + make sure orig edge id are correct after + """ + if src_ntype is not None and dst_ntype is not None: + orig_src_nid = orig_nids[src_ntype] + orig_dst_nid = orig_nids[dst_ntype] + else: + orig_src_nid = orig_nids + orig_dst_nid = orig_nids + orig_src_ids = F.gather_row(orig_src_nid, part_src_ids) + orig_dst_ids = F.gather_row(orig_dst_nid, part_dst_ids) + if etype is not None: + orig_eids = orig_eids[etype] + orig_eids1 = F.gather_row(orig_eids, part_eids) + orig_eids2 = g.edge_ids(orig_src_ids, orig_dst_ids, etype=etype) + assert len(orig_eids1) == len(orig_eids2) + assert np.all(F.asnumpy(orig_eids1) == F.asnumpy(orig_eids2)) + + +def _verify_metadata_gb(gpb, g, num_parts, part_id, part_sizes): + """ + check list: + make sure the number of nodes and edges is correct. + make sure the number of parts is correct. + make sure the number of nodes and edges in each parts os corrcet. + """ + assert gpb._num_nodes() == g.num_nodes() + assert gpb._num_edges() == g.num_edges() + + assert gpb.num_partitions() == num_parts + gpb_meta = gpb.metadata() + assert len(gpb_meta) == num_parts + assert len(gpb.partid2nids(part_id)) == gpb_meta[part_id]["num_nodes"] + assert len(gpb.partid2eids(part_id)) == gpb_meta[part_id]["num_edges"] + part_sizes.append( + (gpb_meta[part_id]["num_nodes"], gpb_meta[part_id]["num_edges"]) + ) + + +def _verify_local_id_gb(part_g, part_id, gpb): + """ + check list: + make sure the type of local id is correct. + make sure local id have a right order. 
+ """ + nid = F.boolean_mask( + part_g.node_attributes[dgl.NID], + part_g.node_attributes["inner_node"], + ) + local_nid = gpb.nid2localnid(nid, part_id) + assert F.dtype(local_nid) in (F.int64, F.int32) + assert np.all(F.asnumpy(local_nid) == np.arange(0, len(local_nid))) + eid = F.boolean_mask( + part_g.edge_attributes[dgl.EID], + part_g.edge_attributes["inner_edge"], + ) + local_eid = gpb.eid2localeid(eid, part_id) + assert F.dtype(local_eid) in (F.int64, F.int32) + assert np.all(np.sort(F.asnumpy(local_eid)) == np.arange(0, len(local_eid))) + return local_nid, local_eid + + +def _verify_map_gb( + part_g, + part_id, + gpb, +): + """ + check list: + make sure the map node and its data type is correct. + """ + # Check the node map. + local_nodes = F.boolean_mask( + part_g.node_attributes[dgl.NID], + part_g.node_attributes["inner_node"], + ) + inner_node_index = F.nonzero_1d(part_g.node_attributes["inner_node"]) + mapping_nodes = gpb.partid2nids(part_id) + assert F.dtype(mapping_nodes) in (F.int32, F.int64) + assert np.all( + np.sort(F.asnumpy(local_nodes)) == np.sort(F.asnumpy(mapping_nodes)) + ) + assert np.all( + F.asnumpy(inner_node_index) == np.arange(len(inner_node_index)) + ) + + # Check the edge map. + + local_edges = F.boolean_mask( + part_g.edge_attributes[dgl.EID], + part_g.edge_attributes["inner_edge"], + ) + inner_edge_index = F.nonzero_1d(part_g.edge_attributes["inner_edge"]) + mapping_edges = gpb.partid2eids(part_id) + assert F.dtype(mapping_edges) in (F.int32, F.int64) + assert np.all( + np.sort(F.asnumpy(local_edges)) == np.sort(F.asnumpy(mapping_edges)) + ) + assert np.all( + F.asnumpy(inner_edge_index) == np.arange(len(inner_edge_index)) + ) + return local_nodes, local_edges + + +def _verify_local_and_map_id_gb( + part_g, + part_id, + gpb, + store_inner_node, + store_inner_edge, + store_eids, +): + """ + check list: + make sure local id are correct. + make sure mapping id are correct. + """ + if store_inner_node and store_inner_edge and store_eids: + _verify_local_id_gb(part_g, part_id, gpb) + _verify_map_gb(part_g, part_id, gpb) + + +def _verify_orig_IDs_gb( + part_g, + gpb, + g, + is_homo=False, + part_src_ids=None, + part_dst_ids=None, + src_ntype_ids=None, + dst_ntype_ids=None, + orig_nids=None, + orig_eids=None, +): + """ + check list: + make sure orig edge id are correct. + make sure hetero ntype id are correct. + """ + part_eids = part_g.edge_attributes[dgl.EID] + if is_homo: + _verify_orig_edge_IDs_gb( + g, orig_nids, orig_eids, part_eids, part_src_ids, part_dst_ids + ) + local_orig_nids = orig_nids[part_g.node_attributes[dgl.NID]] + local_orig_eids = orig_eids[part_g.edge_attributes[dgl.EID]] + part_g.node_attributes["feats"] = F.gather_row( + g.ndata["feats"], local_orig_nids + ) + part_g.edge_attributes["feats"] = F.gather_row( + g.edata["feats"], local_orig_eids + ) + else: + etype_ids, part_eids = gpb.map_to_per_etype(part_eids) + # `IdMap` is in int64 by default. + assert etype_ids.dtype == F.int64 + + # These are original per-type IDs. 
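+        # Check each edge type separately and compare its reshuffled IDs with the original graph.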
+ for etype_id, etype in enumerate(g.canonical_etypes): + part_src_ids1 = F.boolean_mask(part_src_ids, etype_ids == etype_id) + src_ntype_ids1 = F.boolean_mask( + src_ntype_ids, etype_ids == etype_id + ) + part_dst_ids1 = F.boolean_mask(part_dst_ids, etype_ids == etype_id) + dst_ntype_ids1 = F.boolean_mask( + dst_ntype_ids, etype_ids == etype_id + ) + part_eids1 = F.boolean_mask(part_eids, etype_ids == etype_id) + assert np.all(F.asnumpy(src_ntype_ids1 == src_ntype_ids1[0])) + assert np.all(F.asnumpy(dst_ntype_ids1 == dst_ntype_ids1[0])) + src_ntype = g.ntypes[F.as_scalar(src_ntype_ids1[0])] + dst_ntype = g.ntypes[F.as_scalar(dst_ntype_ids1[0])] + + _verify_orig_edge_IDs_gb( + g, + orig_nids, + orig_eids, + part_eids1, + part_src_ids1, + part_dst_ids1, + src_ntype, + dst_ntype, + etype, + ) + + @pytest.mark.parametrize("part_method", ["metis", "random"]) @pytest.mark.parametrize("num_parts", [1, 4]) @pytest.mark.parametrize("store_eids", [True, False]) @@ -1093,7 +1510,15 @@ def test_partition_graph_graphbolt_homo( with tempfile.TemporaryDirectory() as test_dir: g = create_random_graph(1000) graph_name = "test" - partition_graph( + g.ndata["labels"] = F.arange(0, g.num_nodes()) + g.ndata["feats"] = F.tensor( + np.random.randn(g.num_nodes(), 10), F.float32 + ) + g.edata["feats"] = F.tensor( + np.random.randn(g.num_edges(), 10), F.float32 + ) + + orig_nids, orig_eids = partition_graph( g, graph_name, num_parts, @@ -1103,46 +1528,305 @@ def test_partition_graph_graphbolt_homo( store_eids=store_eids, store_inner_node=store_inner_node, store_inner_edge=store_inner_edge, + return_mapping=True, ) - part_config = os.path.join(test_dir, f"{graph_name}.json") - for part_id in range(num_parts): - orig_g = dgl.load_graphs( - os.path.join(test_dir, f"part{part_id}/graph.dgl") - )[0][0] - new_g = load_partition( - part_config, part_id, load_feats=False, use_graphbolt=True - )[0] - orig_indptr, orig_indices, orig_eids = orig_g.adj().csc() - assert th.equal(orig_indptr, new_g.csc_indptr) - assert th.equal(orig_indices, new_g.indices) - assert new_g.node_type_offset is None - assert th.equal( - orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] - ) - if store_inner_node or debug_mode: - assert th.equal( - orig_g.ndata["inner_node"], - new_g.node_attributes["inner_node"], - ) - else: - assert "inner_node" not in new_g.node_attributes - if store_eids or debug_mode: - assert th.equal( - orig_g.edata[dgl.EID][orig_eids], - new_g.edge_attributes[dgl.EID], - ) - else: - assert dgl.EID not in new_g.edge_attributes - if store_inner_edge or debug_mode: - assert th.equal( - orig_g.edata["inner_edge"][orig_eids], - new_g.edge_attributes["inner_edge"], - ) - else: - assert "inner_edge" not in new_g.edge_attributes - assert new_g.type_per_edge is None - assert new_g.node_type_to_id is None - assert new_g.edge_type_to_id is None + + if debug_mode: + store_eids = store_inner_node = store_inner_edge = True + + _verify_graphbolt_part( + g, + test_dir, + orig_nids, + orig_eids, + graph_name, + num_parts, + store_inner_node, + store_inner_edge, + store_eids, + is_homo=True, + ) + + +def _verify_constructed_id_gb(part_sizes, gpb): + """ + verify the part id of each node by constructed nids. 
+ check list: + make sure each node' part id and its type are corect + """ + node_map = [] + edge_map = [] + for part_i, (num_nodes, num_edges) in enumerate(part_sizes): + node_map.append(np.ones(num_nodes) * part_i) + edge_map.append(np.ones(num_edges) * part_i) + node_map = np.concatenate(node_map) + edge_map = np.concatenate(edge_map) + nid2pid = gpb.nid2partid(F.arange(0, len(node_map))) + assert F.dtype(nid2pid) in (F.int32, F.int64) + assert np.all(F.asnumpy(nid2pid) == node_map) + eid2pid = gpb.eid2partid(F.arange(0, len(edge_map))) + assert F.dtype(eid2pid) in (F.int32, F.int64) + assert np.all(F.asnumpy(eid2pid) == edge_map) + + +def _verify_shuffled_labels_gb( + g, + shuffled_labels, + shuffled_edata, + orig_nids, + orig_eids, + test_ntype=None, + test_etype=None, +): + """ + check list: + make sure node data are correct. + make sure edge data are correct. + """ + shuffled_labels = F.asnumpy(F.cat(shuffled_labels, 0)) + shuffled_edata = F.asnumpy(F.cat(shuffled_edata, 0)) + orig_labels = np.zeros(shuffled_labels.shape, dtype=shuffled_labels.dtype) + orig_edata = np.zeros(shuffled_edata.shape, dtype=shuffled_edata.dtype) + + orig_nid = orig_nids if test_ntype is None else orig_nids[test_ntype] + orig_eid = orig_eids if test_etype is None else orig_eids[test_etype] + nlabel = ( + g.ndata["labels"] + if test_ntype is None + else g.nodes[test_ntype].data["labels"] + ) + edata = ( + g.edata["feats"] + if test_etype is None + else g.edges[test_etype].data["labels"] + ) + + orig_labels[F.asnumpy(orig_nid)] = shuffled_labels + orig_edata[F.asnumpy(orig_eid)] = shuffled_edata + assert np.all(orig_labels == F.asnumpy(nlabel)) + assert np.all(orig_edata == F.asnumpy(edata)) + + +def _verify_node_type_ID_gb(part_g, gpb): + """ + check list: + make sure ntype id have correct data type + """ + part_src_ids, part_dst_ids = _get_part_IDs(part_g) + # These are reshuffled per-type IDs. + src_ntype_ids, part_src_ids = gpb.map_to_per_ntype(part_src_ids) + dst_ntype_ids, part_dst_ids = gpb.map_to_per_ntype(part_dst_ids) + # `IdMap` is in int64 by default. + assert src_ntype_ids.dtype == F.int64 + assert dst_ntype_ids.dtype == F.int64 + + with pytest.raises(dgl.utils.internal.InconsistentDtypeException): + gpb.map_to_per_ntype(F.tensor([0], F.int32)) + with pytest.raises(dgl.utils.internal.InconsistentDtypeException): + gpb.map_to_per_etype(F.tensor([0], F.int32)) + return ( + part_src_ids, + part_dst_ids, + src_ntype_ids, + part_src_ids, + dst_ntype_ids, + ) + + +def _verify_IDs_gb( + g, + part_g, + part_id, + gpb, + part_sizes, + orig_nids, + orig_eids, + store_inner_node, + store_inner_edge, + store_eids, + is_homo, +): + # verify local id and mapping id + _verify_local_and_map_id_gb( + part_g, + part_id, + gpb, + store_inner_node, + store_inner_edge, + store_eids, + ) + + # Verify the mapping between the reshuffled IDs and the original IDs. 
+ ( + part_src_ids, + part_dst_ids, + src_ntype_ids, + part_src_ids, + dst_ntype_ids, + ) = _verify_node_type_ID_gb(part_g, gpb) + + if store_eids: + _verify_orig_IDs_gb( + part_g, + gpb, + g, + part_src_ids=part_src_ids, + part_dst_ids=part_dst_ids, + src_ntype_ids=src_ntype_ids, + dst_ntype_ids=dst_ntype_ids, + orig_nids=orig_nids, + orig_eids=orig_eids, + is_homo=is_homo, + ) + _verify_constructed_id_gb(part_sizes, gpb) + + +def _collect_data_gb( + parts, + part_g, + gpbs, + gpb, + tot_node_feats, + node_feats, + tot_edge_feats, + edge_feats, + shuffled_labels, + shuffled_edata, + test_ntype, + test_etype, +): + if test_ntype != None: + shuffled_labels.append(node_feats[test_ntype + "/labels"]) + shuffled_edata.append( + edge_feats[_etype_tuple_to_str(test_etype) + "/labels"] + ) + else: + shuffled_labels.append(node_feats["_N/labels"]) + shuffled_edata.append(edge_feats["_N:_E:_N/feats"]) + parts.append(part_g) + gpbs.append(gpb) + tot_node_feats.append(node_feats) + tot_edge_feats.append(edge_feats) + + +def _verify_graphbolt_part( + g, + test_dir, + orig_nids, + orig_eids, + graph_name, + num_parts, + store_inner_node, + store_inner_edge, + store_eids, + test_ntype=None, + test_etype=None, + is_homo=False, +): + """ + check list: + _verify_metadata_gb: + data type, ID's order and ID's number of edges and nodes + _verify_IDs_gb: + local id, mapping id,node type id, orig edge, hetero ntype id + verify_graph_feats_gb: + nodes and edges' feats + _verify_graphbolt_attributes: + arguments + """ + parts = [] + tot_node_feats = [] + tot_edge_feats = [] + shuffled_labels = [] + shuffled_edata = [] + part_sizes = [] + gpbs = [] + part_config = os.path.join(test_dir, f"{graph_name}.json") + # test each part + for part_id in range(num_parts): + part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( + part_config, part_id, load_feats=True, use_graphbolt=True + ) + # verify metadata + _verify_metadata_gb( + gpb, + g, + num_parts, + part_id, + part_sizes, + ) + + # verify eid and nid + _verify_IDs_gb( + g, + part_g, + part_id, + gpb, + part_sizes, + orig_nids, + orig_eids, + store_inner_node, + store_inner_edge, + store_eids, + is_homo, + ) + + # collect shuffled data and parts + _collect_data_gb( + parts, + part_g, + gpbs, + gpb, + tot_node_feats, + node_feats, + tot_edge_feats, + edge_feats, + shuffled_labels, + shuffled_edata, + test_ntype, + test_etype, + ) + + # verify graph feats + verify_graph_feats_gb( + g, + gpbs, + parts, + tot_node_feats, + tot_edge_feats, + orig_nids, + orig_eids, + shuffled_labels=shuffled_labels, + shuffled_edata=shuffled_edata, + test_ntype=test_ntype, + test_etype=test_etype, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, + store_eids=store_eids, + is_homo=is_homo, + ) + + _verify_graphbolt_attributes( + parts, store_inner_node, store_inner_edge, store_eids + ) + + return parts + + +def _verify_original_IDs_type_hetero(hg, orig_nids, orig_eids): + """ + check list: + make sure type of nodes and edges' ids are correct. + make sure nodes and edges' number in each type is correct. 
+ """ + assert len(orig_nids) == len(hg.ntypes) + assert len(orig_eids) == len(hg.canonical_etypes) + for ntype in hg.ntypes: + assert len(orig_nids[ntype]) == hg.num_nodes(ntype) + assert F.dtype(orig_nids[ntype]) in (F.int64, F.int32) + for etype in hg.canonical_etypes: + assert len(orig_eids[etype]) == hg.num_edges(etype) + assert F.dtype(orig_eids[etype]) in (F.int64, F.int32) @pytest.mark.parametrize("part_method", ["metis", "random"]) @@ -1160,81 +1844,67 @@ def test_partition_graph_graphbolt_hetero( debug_mode, n_jobs=1, ): + test_ntype = "n1" + test_etype = ("n1", "r1", "n2") reset_envs() if debug_mode: os.environ["DGL_DIST_DEBUG"] = "1" with tempfile.TemporaryDirectory() as test_dir: - g = create_random_hetero() + hg = create_random_hetero() graph_name = "test" - partition_graph( - g, + hg.nodes[test_ntype].data["labels"] = F.arange( + 0, hg.num_nodes(test_ntype) + ) + hg.nodes[test_ntype].data["feats"] = F.tensor( + np.random.randn(hg.num_nodes(test_ntype), 10), F.float32 + ) + hg.edges[test_etype].data["feats"] = F.tensor( + np.random.randn(hg.num_edges(test_etype), 10), F.float32 + ) + hg.edges[test_etype].data["labels"] = F.arange( + 0, hg.num_edges(test_etype) + ) + orig_nids, orig_eids = partition_graph( + hg, graph_name, num_parts, test_dir, part_method=part_method, + return_mapping=True, + num_trainers_per_machine=1, use_graphbolt=True, store_eids=store_eids, store_inner_node=store_inner_node, store_inner_edge=store_inner_edge, n_jobs=n_jobs, ) - part_config = os.path.join(test_dir, f"{graph_name}.json") - for part_id in range(num_parts): - orig_g = dgl.load_graphs( - os.path.join(test_dir, f"part{part_id}/graph.dgl") - )[0][0] - new_g = load_partition( - part_config, part_id, load_feats=False, use_graphbolt=True - )[0] - orig_indptr, orig_indices, orig_eids = orig_g.adj().csc() - assert th.equal(orig_indptr, new_g.csc_indptr) - assert th.equal(orig_indices, new_g.indices) - assert th.equal( - orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] - ) - if store_inner_node or debug_mode: - assert th.equal( - orig_g.ndata["inner_node"], - new_g.node_attributes["inner_node"], - ) - else: - assert "inner_node" not in new_g.node_attributes - if debug_mode: - assert th.equal( - orig_g.ndata[dgl.NTYPE], new_g.node_attributes[dgl.NTYPE] - ) - else: - assert dgl.NTYPE not in new_g.node_attributes - if store_eids or debug_mode: - assert th.equal( - orig_g.edata[dgl.EID][orig_eids], - new_g.edge_attributes[dgl.EID], - ) - else: - assert dgl.EID not in new_g.edge_attributes - if store_inner_edge or debug_mode: - assert th.equal( - orig_g.edata["inner_edge"], - new_g.edge_attributes["inner_edge"], - ) - else: - assert "inner_edge" not in new_g.edge_attributes - if debug_mode: - assert th.equal( - orig_g.edata[dgl.ETYPE][orig_eids], - new_g.edge_attributes[dgl.ETYPE], - ) - else: - assert dgl.ETYPE not in new_g.edge_attributes - assert th.equal( - orig_g.edata[dgl.ETYPE][orig_eids], new_g.type_per_edge - ) - for node_type, type_id in new_g.node_type_to_id.items(): - assert g.get_ntype_id(node_type) == type_id - for edge_type, type_id in new_g.edge_type_to_id.items(): - assert g.get_etype_id(_etype_str_to_tuple(edge_type)) == type_id - assert new_g.node_type_offset is None + _verify_original_IDs_type_hetero(hg, orig_nids, orig_eids) + if debug_mode: + store_eids = store_inner_node = store_inner_edge = True + + parts = _verify_graphbolt_part( + hg, + test_dir, + orig_nids, + orig_eids, + graph_name, + num_parts, + store_inner_node, + store_inner_edge, + store_eids, + test_ntype, + 
test_etype, + is_homo=False, + ) + + _verify_hetero_graph( + hg, + parts, + store_eids=store_eids, + store_inner_edge=store_inner_edge, + debug_mode=debug_mode, + ) @pytest.mark.parametrize("part_method", ["metis", "random"]) From f71427f33f6026470509a30af170ac5d2315b2ba Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 5 Sep 2024 02:59:25 -0400 Subject: [PATCH 77/78] [GraphBolt][CUDA] Return partition offsets to be utilized in `all_to_all`. (#7775) --- .../cuda/cooperative_minibatching_utils.cu | 48 +++++++++++++----- .../src/cuda/cooperative_minibatching_utils.h | 22 +++++--- .../src/cuda/extension/unique_and_compact.h | 3 +- .../cuda/extension/unique_and_compact_map.cu | 26 ++++++---- graphbolt/src/cuda/sampling_utils.cu | 2 +- graphbolt/src/cuda/unique_and_compact_impl.cu | 9 +++- graphbolt/src/cuda/utils.h | 50 ------------------- 7 files changed, 78 insertions(+), 82 deletions(-) diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu index 8a632bb809b4..49403128a7f1 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu @@ -49,10 +49,12 @@ torch::Tensor RankAssignment( return part_ids; } -std::pair RankSortImpl( +std::tuple +RankSortImpl( torch::Tensor nodes, torch::Tensor part_ids, torch::Tensor offsets_dev, const int64_t world_size) { const int num_bits = cuda::NumberOfBits(world_size); + const auto num_batches = offsets_dev.numel() - 1; auto offsets_dev_ptr = offsets_dev.data_ptr(); auto part_ids_sorted = torch::empty_like(part_ids); auto part_ids2 = part_ids.clone(); @@ -60,27 +62,47 @@ std::pair RankSortImpl( auto nodes_sorted = torch::empty_like(nodes); auto index = torch::arange(nodes.numel(), nodes.options()); auto index_sorted = torch::empty_like(index); - AT_DISPATCH_INDEX_TYPES( + return AT_DISPATCH_INDEX_TYPES( nodes.scalar_type(), "RankSortImpl", ([&] { CUB_CALL( DeviceSegmentedRadixSort::SortPairs, part_ids.data_ptr(), part_ids_sorted.data_ptr(), nodes.data_ptr(), - nodes_sorted.data_ptr(), nodes.numel(), - offsets_dev.numel() - 1, offsets_dev_ptr, offsets_dev_ptr + 1, 0, - num_bits); + nodes_sorted.data_ptr(), nodes.numel(), num_batches, + offsets_dev_ptr, offsets_dev_ptr + 1, 0, num_bits); + auto offsets = torch::empty( + num_batches * world_size + 1, c10::TensorOptions() + .dtype(offsets_dev.scalar_type()) + .pinned_memory(true)); + CUB_CALL( + DeviceFor::Bulk, num_batches * world_size + 1, + [=, part_ids = part_ids_sorted.data_ptr(), + offsets = offsets.data_ptr()] __device__(int64_t i) { + const auto batch_id = i / world_size; + const auto rank = i % world_size; + const auto offset_begin = offsets_dev_ptr[batch_id]; + const auto offset_end = + offsets_dev_ptr[::cuda::std::min(batch_id + 1, num_batches)]; + offsets[i] = cub::LowerBound( + part_ids + offset_begin, + offset_end - offset_begin, rank) + + offset_begin; + }); + at::cuda::CUDAEvent offsets_event; + offsets_event.record(); CUB_CALL( DeviceSegmentedRadixSort::SortPairs, part_ids2.data_ptr(), part_ids2_sorted.data_ptr(), index.data_ptr(), index_sorted.data_ptr(), - nodes.numel(), offsets_dev.numel() - 1, offsets_dev_ptr, - offsets_dev_ptr + 1, 0, num_bits); + nodes.numel(), num_batches, offsets_dev_ptr, offsets_dev_ptr + 1, 0, + num_bits); + return std::make_tuple( + nodes_sorted, index_sorted, offsets, std::move(offsets_event)); })); - return {nodes_sorted, index_sorted}; } -std::vector> RankSort( +std::vector> RankSort( std::vector& nodes_list, const int64_t 
rank, const int64_t world_size) { const auto num_batches = nodes_list.size(); @@ -100,13 +122,15 @@ std::vector> RankSort( offsets_dev.data_ptr(), offsets_ptr, sizeof(int64_t) * offsets.numel(), cudaMemcpyHostToDevice, cuda::GetCurrentStream())); - auto [nodes_sorted, index_sorted] = + auto [nodes_sorted, index_sorted, rank_offsets, rank_offsets_event] = RankSortImpl(nodes, part_ids, offsets_dev, world_size); - std::vector> results; + std::vector> results; + rank_offsets_event.synchronize(); for (int64_t i = 0; i < num_batches; i++) { results.emplace_back( nodes_sorted.slice(0, offsets_ptr[i], offsets_ptr[i + 1]), - index_sorted.slice(0, offsets_ptr[i], offsets_ptr[i + 1])); + index_sorted.slice(0, offsets_ptr[i], offsets_ptr[i + 1]), + rank_offsets.slice(0, i * world_size, (i + 1) * world_size + 1)); } return results; } diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.h b/graphbolt/src/cuda/cooperative_minibatching_utils.h index ba9357063b72..cd20138a01c9 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.h +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.h @@ -74,11 +74,15 @@ torch::Tensor RankAssignment( * @param offsets_dev Offsets to separate different node types. * @param world_size World size, the total number of cooperating GPUs. * - * @return (sorted_nodes, original_positions), where the first - * one includes sorted nodes, the second contains original positions of the - * sorted nodes. + * @return (sorted_nodes, original_positions, rank_offsets, rank_offsets_event), + * where the first one includes sorted nodes, the second contains original + * positions of the sorted nodes and the third contains the offsets of the + * sorted_nodes indicating sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] + * contains nodes that belongs to the `i`th rank. Before accessing rank_offsets + * on the CPU, `rank_offsets_event.synchronize()` is required. */ -std::pair RankSortImpl( +std::tuple +RankSortImpl( torch::Tensor nodes, torch::Tensor part_ids, torch::Tensor offsets_dev, int64_t world_size); @@ -91,11 +95,13 @@ std::pair RankSortImpl( * @param rank Rank of the current GPU. * @param world_size World size, the total number of cooperating GPUs. * - * @return vector of (sorted_nodes, original_positions), where the first - * one includes sorted nodes, the second contains original positions of the - * sorted nodes. + * @return vector of (sorted_nodes, original_positions, rank_offsets), where the + * first one includes sorted nodes, the second contains original positions of + * the sorted nodes and the third contains the offsets of the sorted_nodes + * indicating sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] contains nodes + * that belongs to the `i`th rank. 
*/ -std::vector> RankSort( +std::vector> RankSort( std::vector& nodes_list, int64_t rank, int64_t world_size); } // namespace cuda diff --git a/graphbolt/src/cuda/extension/unique_and_compact.h b/graphbolt/src/cuda/extension/unique_and_compact.h index c68168e24fce..7a7173897758 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact.h +++ b/graphbolt/src/cuda/extension/unique_and_compact.h @@ -28,7 +28,8 @@ namespace graphbolt { namespace ops { -std::vector > +std::vector< + std::tuple> UniqueAndCompactBatchedHashMapBased( const std::vector& src_ids, const std::vector& dst_ids, diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index 48e9941a7f77..ed030fd98718 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -106,7 +106,8 @@ __global__ void _MapIdsBatched( } } -std::vector> +std::vector< + std::tuple> UniqueAndCompactBatchedHashMapBased( const std::vector& src_ids, const std::vector& dst_ids, @@ -258,7 +259,6 @@ UniqueAndCompactBatchedHashMapBased( auto unique_ids_offsets = torch::empty( num_batches + 1, c10::TensorOptions().dtype(torch::kInt64).pinned_memory(true)); - auto unique_ids_offsets_ptr = unique_ids_offsets.data_ptr(); { auto unique_ids_offsets_dev2 = torch::empty_like(unique_ids_offsets_dev); @@ -271,7 +271,7 @@ UniqueAndCompactBatchedHashMapBased( thrust::make_transform_output_iterator( thrust::make_zip_iterator( unique_ids_offsets_dev2.data_ptr(), - unique_ids_offsets_ptr), + unique_ids_offsets.data_ptr()), ::cuda::proclaim_return_type< thrust::tuple>( [=] __device__(const auto x) { @@ -283,11 +283,14 @@ UniqueAndCompactBatchedHashMapBased( unique_ids_offsets_dev.data_ptr(); } at::cuda::CUDAEvent unique_ids_offsets_event; - unique_ids_offsets_event.record(); torch::optional index; if (part_ids) { - std::tie(unique_ids, index) = cuda::RankSortImpl( - unique_ids, *part_ids, unique_ids_offsets_dev, world_size); + std::tie( + unique_ids, index, unique_ids_offsets, unique_ids_offsets_event) = + cuda::RankSortImpl( + unique_ids, *part_ids, unique_ids_offsets_dev, world_size); + } else { + unique_ids_offsets_event.record(); } auto mapped_ids = torch::empty(offsets_ptr[3 * num_batches], unique_ids.options()); @@ -297,18 +300,23 @@ UniqueAndCompactBatchedHashMapBased( pointers_dev_ptr, offsets_dev_ptr, unique_ids_offsets_dev_ptr, index ? 
index->data_ptr() : nullptr, map.ref(cuco::find), mapped_ids.data_ptr()); - std::vector> + std::vector> results; unique_ids_offsets_event.synchronize(); + auto unique_ids_offsets_ptr = unique_ids_offsets.data_ptr(); for (int64_t i = 0; i < num_batches; i++) { results.emplace_back( unique_ids.slice( - 0, unique_ids_offsets_ptr[i], unique_ids_offsets_ptr[i + 1]), + 0, unique_ids_offsets_ptr[i * world_size], + unique_ids_offsets_ptr[(i + 1) * world_size]), mapped_ids.slice( 0, offsets_ptr[2 * i + 1], offsets_ptr[2 * i + 2]), mapped_ids.slice( 0, offsets_ptr[2 * num_batches + i], - offsets_ptr[2 * num_batches + i + 1])); + offsets_ptr[2 * num_batches + i + 1]), + unique_ids_offsets.slice( + 0, i * world_size, (i + 1) * world_size + 1)); } return results; })); diff --git a/graphbolt/src/cuda/sampling_utils.cu b/graphbolt/src/cuda/sampling_utils.cu index f40de5e5656f..5df92f6e7649 100644 --- a/graphbolt/src/cuda/sampling_utils.cu +++ b/graphbolt/src/cuda/sampling_utils.cu @@ -106,7 +106,7 @@ struct EdgeTypeSearch { const auto indptr_i = sub_indptr[homo_i]; const auto degree = sub_indptr[homo_i + 1] - indptr_i; const etype_t etype = i % num_fanouts; - auto offset = cuda::LowerBound(etypes + indptr_i, degree, etype); + auto offset = cub::LowerBound(etypes + indptr_i, degree, etype); new_sub_indptr[i] = indptr_i + offset; new_sliced_indptr[i] = sliced_indptr[homo_i] + offset; if (i == num_rows - 1) new_sub_indptr[num_rows] = indptr_i + degree; diff --git a/graphbolt/src/cuda/unique_and_compact_impl.cu b/graphbolt/src/cuda/unique_and_compact_impl.cu index c8b5775c5b47..a630b78e149c 100644 --- a/graphbolt/src/cuda/unique_and_compact_impl.cu +++ b/graphbolt/src/cuda/unique_and_compact_impl.cu @@ -282,8 +282,15 @@ UniqueAndCompactBatched( // Utilizes a hash table based implementation, the mapped id of a vertex // will be monotonically increasing as the first occurrence index of it in // torch.cat([unique_dst_ids, src_ids]). Thus, it is deterministic. - return UniqueAndCompactBatchedHashMapBased( + auto results4 = UniqueAndCompactBatchedHashMapBased( src_ids, dst_ids, unique_dst_ids, rank, world_size); + std::vector> + results3; + // TODO @mfbalin: expose the `d` result in a later PR. + for (const auto& [a, b, c, d] : results4) { + results3.emplace_back(a, b, c); + } + return results3; } TORCH_CHECK( world_size <= 1, diff --git a/graphbolt/src/cuda/utils.h b/graphbolt/src/cuda/utils.h index 05f5ffbb2c8c..ed2078a05a24 100644 --- a/graphbolt/src/cuda/utils.h +++ b/graphbolt/src/cuda/utils.h @@ -61,56 +61,6 @@ int NumberOfBits(const T& range) { return bits; } -/** - * @brief Given a sorted array and a value this function returns the index - * of the first element which compares greater than or equal to value. - * - * This function assumes 0-based index - * @param A: ascending sorted array - * @param n: size of the A - * @param x: value to search in A - * @return index, i, of the first element st. A[i]>=x. If x>A[n-1] returns n. - * if x -__device__ indices_t LowerBound(const indptr_t* A, indices_t n, indptr_t x) { - indices_t l = 0, r = n; - while (l < r) { - const auto m = l + (r - l) / 2; - if (x > A[m]) { - l = m + 1; - } else { - r = m; - } - } - return l; -} - -/** - * @brief Given a sorted array and a value this function returns the index - * of the first element which compares greater than value. - * - * This function assumes 0-based index - * @param A: ascending sorted array - * @param n: size of the A - * @param x: value to search in A - * @return index, i, of the first element st. A[i]>x. 
If x>=A[n-1] returns n. - * if x -__device__ indices_t UpperBound(const indptr_t* A, indices_t n, indptr_t x) { - indices_t l = 0, r = n; - while (l < r) { - const auto m = l + (r - l) / 2; - if (x >= A[m]) { - l = m + 1; - } else { - r = m; - } - } - return l; -} - } // namespace cuda } // namespace graphbolt From 32b11c98e57692807564f084394c541e6ba812ac Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 5 Sep 2024 17:47:42 -0400 Subject: [PATCH 78/78] [GraphBolt][CUDA] Expose `RankSort` to python, reorganize and test. (#7776) --- .../cuda/cooperative_minibatching_utils.cu | 7 ++- .../cuda/cooperative_minibatching_utils.cuh | 55 +++++++++++++++++ .../src/cuda/cooperative_minibatching_utils.h | 28 +-------- .../cuda/extension/unique_and_compact_map.cu | 1 + graphbolt/src/python_binding.cc | 3 +- .../test_cooperative_minibatching_utils.py | 59 +++++++++++++++++++ 6 files changed, 125 insertions(+), 28 deletions(-) create mode 100644 graphbolt/src/cuda/cooperative_minibatching_utils.cuh create mode 100644 tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu index 49403128a7f1..fb9858f6d559 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu @@ -18,12 +18,14 @@ * @brief Cooperative Minibatching (arXiv:2310.12403) utility function * implementations in CUDA. */ +#include #include #include #include #include "./common.h" +#include "./cooperative_minibatching_utils.cuh" #include "./cooperative_minibatching_utils.h" #include "./utils.h" @@ -60,7 +62,8 @@ RankSortImpl( auto part_ids2 = part_ids.clone(); auto part_ids2_sorted = torch::empty_like(part_ids2); auto nodes_sorted = torch::empty_like(nodes); - auto index = torch::arange(nodes.numel(), nodes.options()); + auto index = ops::IndptrEdgeIdsImpl( + offsets_dev, nodes.scalar_type(), torch::nullopt, nodes.numel()); auto index_sorted = torch::empty_like(index); return AT_DISPATCH_INDEX_TYPES( nodes.scalar_type(), "RankSortImpl", ([&] { @@ -103,7 +106,7 @@ RankSortImpl( } std::vector> RankSort( - std::vector& nodes_list, const int64_t rank, + const std::vector& nodes_list, const int64_t rank, const int64_t world_size) { const auto num_batches = nodes_list.size(); auto nodes = torch::cat(nodes_list, 0); diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cuh b/graphbolt/src/cuda/cooperative_minibatching_utils.cuh new file mode 100644 index 000000000000..f5acc20a1f77 --- /dev/null +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cuh @@ -0,0 +1,55 @@ +/** + * Copyright (c) 2024, mfbalin (Muhammed Fatih Balin) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file cuda/cooperative_minibatching_utils.cuh + * @brief Cooperative Minibatching (arXiv:2310.12403) utility device functions + * in CUDA. 
+ */ +#ifndef GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_CUH_ +#define GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_CUH_ + +#include + +namespace graphbolt { +namespace cuda { + +using part_t = uint8_t; +constexpr auto kPartDType = torch::kUInt8; + +/** + * @brief Given a vertex id, the rank of current GPU and the world size, returns + * the rank that this id belongs in a deterministic manner. + * + * @param id The node id that will mapped to a rank in [0, world_size). + * @param rank The rank of the current GPU. + * @param world_size The world size, the total number of cooperating GPUs. + * + * @return The rank of the GPU the given id is mapped to. + */ +template +__device__ inline auto rank_assignment( + index_t id, uint32_t rank, uint32_t world_size) { + // Consider using a faster implementation in the future. + constexpr uint64_t kCurandSeed = 999961; // Any random number. + curandStatePhilox4_32_10_t rng; + curand_init(kCurandSeed, 0, id, &rng); + return (curand(&rng) - rank) % world_size; +} + +} // namespace cuda +} // namespace graphbolt + +#endif // GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_CUH_ diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.h b/graphbolt/src/cuda/cooperative_minibatching_utils.h index cd20138a01c9..45bd203f1f71 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.h +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.h @@ -21,35 +21,12 @@ #ifndef GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_H_ #define GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_H_ -#include +#include #include namespace graphbolt { namespace cuda { -using part_t = uint8_t; -constexpr auto kPartDType = torch::kUInt8; - -/** - * @brief Given a vertex id, the rank of current GPU and the world size, returns - * the rank that this id belongs in a deterministic manner. - * - * @param id The node id that will mapped to a rank in [0, world_size). - * @param rank The rank of the current GPU. - * @param world_size The world size, the total number of cooperating GPUs. - * - * @return The rank of the GPU the given id is mapped to. - */ -template -__device__ inline auto rank_assignment( - index_t id, uint32_t rank, uint32_t world_size) { - // Consider using a faster implementation in the future. - constexpr uint64_t kCurandSeed = 999961; // Any random number. - curandStatePhilox4_32_10_t rng; - curand_init(kCurandSeed, 0, id, &rng); - return (curand(&rng) - rank) % world_size; -} - /** * @brief Given node ids, the rank of current GPU and the world size, returns * the ranks that the given ids belong in a deterministic manner. @@ -102,7 +79,8 @@ RankSortImpl( * that belongs to the `i`th rank. 
*/ std::vector> RankSort( - std::vector& nodes_list, int64_t rank, int64_t world_size); + const std::vector& nodes_list, int64_t rank, + int64_t world_size); } // namespace cuda } // namespace graphbolt diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index ed030fd98718..a36c63925d7f 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -33,6 +33,7 @@ #include #include "../common.h" +#include "../cooperative_minibatching_utils.cuh" #include "../cooperative_minibatching_utils.h" #include "../utils.h" #include "./unique_and_compact.h" diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index ea2b543761cf..20c6d59be5d5 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -10,10 +10,10 @@ #include #ifdef GRAPHBOLT_USE_CUDA +#include "./cuda/cooperative_minibatching_utils.h" #include "./cuda/max_uva_threads.h" #endif #include "./cnumpy.h" -#include "./expand_indptr.h" #include "./feature_cache.h" #include "./index_select.h" #include "./io_uring.h" @@ -196,6 +196,7 @@ TORCH_LIBRARY(graphbolt, m) { m.def("set_seed", &RandomEngine::SetManualSeed); #ifdef GRAPHBOLT_USE_CUDA m.def("set_max_uva_threads", &cuda::set_max_uva_threads); + m.def("rank_sort", &cuda::RankSort); #endif #ifdef HAS_IMPL_ABSTRACT_PYSTUB m.impl_abstract_pystub("dgl.graphbolt.base", "//dgl.graphbolt.base"); diff --git a/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py new file mode 100644 index 000000000000..f85676578bd5 --- /dev/null +++ b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py @@ -0,0 +1,59 @@ +import unittest + +from functools import partial + +import backend as F +import dgl.graphbolt as gb +import pytest +import torch + +WORLD_SIZE = 7 + +assert_equal = partial(torch.testing.assert_close, rtol=0, atol=0) + + +@unittest.skipIf( + F._default_context_str != "gpu", + reason="This test requires an NVIDIA GPU.", +) +@pytest.mark.parametrize("dtype", [torch.int32, torch.int64]) +@pytest.mark.parametrize("rank", list(range(WORLD_SIZE))) +def test_gpu_cached_feature_read_async(dtype, rank): + nodes_list1 = [ + torch.randint(0, 11111111, [777], dtype=dtype, device=F.ctx()) + for i in range(10) + ] + nodes_list2 = [nodes.sort()[0] for nodes in nodes_list1] + + res1 = torch.ops.graphbolt.rank_sort(nodes_list1, rank, WORLD_SIZE) + res2 = torch.ops.graphbolt.rank_sort(nodes_list2, rank, WORLD_SIZE) + + for i, ((nodes1, idx1, offsets1), (nodes2, idx2, offsets2)) in enumerate( + zip(res1, res2) + ): + assert_equal(nodes_list1[i], nodes1[idx1.sort()[1]]) + assert_equal(nodes_list2[i], nodes2[idx2.sort()[1]]) + assert_equal(offsets1, offsets2) + assert offsets1.is_pinned() and offsets2.is_pinned() + + res3 = torch.ops.graphbolt.rank_sort(nodes_list1, rank, WORLD_SIZE) + + # This function is deterministic. Call with identical arguments and check. + for (nodes1, idx1, offsets1), (nodes3, idx3, offsets3) in zip(res1, res3): + assert_equal(nodes1, nodes3) + assert_equal(idx1, idx3) + assert_equal(offsets1, offsets3) + + # The dependency on the rank argument is simply a permutation. 
+ res4 = torch.ops.graphbolt.rank_sort(nodes_list1, 0, WORLD_SIZE) + for (nodes1, idx1, offsets1), (nodes4, idx4, offsets4) in zip(res1, res4): + off1 = offsets1.tolist() + off4 = offsets4.tolist() + for i in range(WORLD_SIZE): + j = (i - rank + WORLD_SIZE) % WORLD_SIZE + assert_equal( + nodes1[off1[j] : off1[j + 1]], nodes4[off4[i] : off4[i + 1]] + ) + assert_equal( + idx1[off1[j] : off1[j + 1]], idx4[off4[i] : off4[i + 1]] + )
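
The test above exercises torch.ops.graphbolt.rank_sort, which returns, for each input batch, a tuple (sorted_nodes, original_positions, rank_offsets) where sorted_nodes[rank_offsets[i] : rank_offsets[i + 1]] holds the node IDs assigned to rank i. A minimal sketch of how those per-rank offsets could feed an all_to_all exchange (the stated motivation of patch 77/78) follows; the helper name, the use of torch.distributed.all_to_all_single, and the assumption of an already initialized NCCL process group are illustrative only and are not part of the GraphBolt API.

import torch
import torch.distributed as dist


def exchange_nodes_by_rank(nodes, rank, world_size):
    # rank_sort takes a list of batches; a single batch is used here.
    sorted_nodes, positions, rank_offsets = torch.ops.graphbolt.rank_sort(
        [nodes], rank, world_size
    )[0]
    # rank_offsets is a pinned CPU tensor of length world_size + 1; its
    # consecutive differences are the number of IDs destined for each rank.
    send_counts = rank_offsets.diff().to(nodes.device)
    recv_counts = torch.empty_like(send_counts)
    # Exchange the counts first so every rank knows how much it will receive.
    dist.all_to_all_single(recv_counts, send_counts)
    received = torch.empty(
        int(recv_counts.sum()), dtype=sorted_nodes.dtype, device=nodes.device
    )
    # Ship each contiguous per-rank slice of sorted_nodes to its owner rank.
    dist.all_to_all_single(
        received,
        sorted_nodes,
        output_split_sizes=recv_counts.tolist(),
        input_split_sizes=send_counts.tolist(),
    )
    # positions maps the sorted order back to the original order of `nodes`,
    # so per-node results computed remotely can later be scattered back.
    return received, positions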