From 53e70c57fbbf17c2549feb4572ebf8cd7126c987 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 11 Sep 2024 17:46:50 -0400 Subject: [PATCH 01/10] [GraphBolt][CUDA] Cooperative Minibatching initial exchange. (#7795) --- python/dgl/graphbolt/internal/sample_utils.py | 1 + python/dgl/graphbolt/subgraph_sampler.py | 193 ++++++++++++++++-- .../graphbolt/impl/test_neighbor_sampler.py | 4 +- 3 files changed, 184 insertions(+), 14 deletions(-) diff --git a/python/dgl/graphbolt/internal/sample_utils.py b/python/dgl/graphbolt/internal/sample_utils.py index e88ea0193a55..f499694d6d72 100644 --- a/python/dgl/graphbolt/internal/sample_utils.py +++ b/python/dgl/graphbolt/internal/sample_utils.py @@ -349,6 +349,7 @@ def wait(self): if is_homogeneous: compacted_csc_formats = list(compacted_csc_formats.values())[0] unique_nodes = list(unique_nodes.values())[0] + offsets = list(offsets.values())[0] return unique_nodes, compacted_csc_formats, offsets diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index e26ec3336be6..88fdd3808714 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -5,10 +5,12 @@ from typing import Dict import torch +import torch.distributed as thd from torch.utils.data import functional_datapipe from .base import seed_type_str_to_ntypes from .internal import compact_temporal_nodes, unique_and_compact +from .minibatch import MiniBatch from .minibatch_transformer import MiniBatchTransformer __all__ = [ @@ -28,6 +30,25 @@ def wait(self): return result +def _shift(inputs: list, group=None): + cutoff = len(inputs) - thd.get_rank(group) + return inputs[cutoff:] + inputs[:cutoff] + + +def all_to_all(outputs, inputs, group=None, async_op=False): + """Wrapper for thd.all_to_all that permuted outputs and inputs before + calling it. The arguments have the permutation + `rank, ..., world_size - 1, 0, ..., rank - 1` and we make it + `0, world_size - 1` before calling `thd.all_to_all`.""" + shift_fn = partial(_shift, group=group) + return thd.all_to_all(shift_fn(outputs), shift_fn(inputs), group, async_op) + + +def _revert_to_homo(d: dict): + is_homogenous = len(d) == 1 and "_N" in d + return list(d.values())[0] if is_homogenous else d + + @functional_datapipe("sample_subgraph") class SubgraphSampler(MiniBatchTransformer): """A subgraph sampler used to sample a subgraph from a given set of nodes @@ -49,8 +70,8 @@ class SubgraphSampler(MiniBatchTransformer): Arguments to be passed into sampling_stages. kwargs : Keyword Arguments Arguments to be passed into sampling_stages. Preprocessing stage makes - use of the `asynchronous` parameter before it is passed to - the sampling stages. + use of the `asynchronous` and `cooperative` parameters before they are + passed to the sampling stages. 
""" def __init__( @@ -60,10 +81,22 @@ def __init__( **kwargs, ): async_op = kwargs.get("asynchronous", False) - preprocess_fn = partial(self._preprocess, async_op=async_op) + cooperative = kwargs.get("cooperative", False) + preprocess_fn = partial( + self._preprocess, cooperative=cooperative, async_op=async_op + ) datapipe = datapipe.transform(preprocess_fn) if async_op: - datapipe = datapipe.buffer().transform(self._wait_preprocess_future) + fn = partial(self._wait_preprocess_future, cooperative=cooperative) + datapipe = datapipe.buffer().transform(fn) + if cooperative: + datapipe = datapipe.transform(self._seeds_cooperative_exchange_1) + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._seeds_cooperative_exchange_2) + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._seeds_cooperative_exchange_3) + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._seeds_cooperative_exchange_4) datapipe = self.sampling_stages(datapipe, *args, **kwargs) datapipe = datapipe.transform(self._postprocess) super().__init__(datapipe) @@ -75,12 +108,16 @@ def _postprocess(minibatch): return minibatch @staticmethod - def _preprocess(minibatch, async_op: bool): + def _preprocess(minibatch, cooperative: bool, async_op: bool): if minibatch.seeds is None: raise ValueError( f"Invalid minibatch {minibatch}: `seeds` should have a value." ) - results = SubgraphSampler._seeds_preprocess(minibatch, async_op) + rank = thd.get_rank() if cooperative else 0 + world_size = thd.get_world_size() if cooperative else 1 + results = SubgraphSampler._seeds_preprocess( + minibatch, rank, world_size, async_op + ) if async_op: minibatch._preprocess_future = results else: @@ -88,17 +125,125 @@ def _preprocess(minibatch, async_op: bool): minibatch._seed_nodes, minibatch._seeds_timestamp, minibatch.compacted_seeds, + offsets, ) = results + if cooperative: + minibatch._seeds_offsets = offsets return minibatch @staticmethod - def _wait_preprocess_future(minibatch): + def _wait_preprocess_future(minibatch, cooperative: bool): ( minibatch._seed_nodes, minibatch._seeds_timestamp, minibatch.compacted_seeds, + offsets, ) = minibatch._preprocess_future.wait() delattr(minibatch, "_preprocess_future") + if cooperative: + minibatch._seeds_offsets = offsets + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_1(minibatch, group=None): + rank = thd.get_rank(group) + world_size = thd.get_world_size(group) + assert world_size > 1 + seeds = minibatch._seed_nodes + is_homogeneous = not isinstance(seeds, dict) + if is_homogeneous: + seeds = {"_N": seeds} + if minibatch._seeds_offsets is None: + seeds_list = list(seeds.values()) + ( + sorted_seeds_list, + index_list, + offsets_list, + ) = torch.ops.graphbolt.rank_sort(seeds_list, rank, world_size) + assert minibatch.compacted_seeds is None + sorted_seeds, sorted_compacted, sorted_offsets = {}, {}, {} + num_ntypes = len(seeds.keys()) + for i, ( + seed_type, + typed_sorted_seeds, + typed_index, + typed_offsets, + ) in enumerate( + zip( + seeds.keys(), + sorted_seeds_list, + index_list, + offsets_list, + ) + ): + sorted_seeds[seed_type] = typed_sorted_seeds + sorted_compacted[seed_type] = typed_index + sorted_offsets[seed_type] = typed_offsets.tolist() + + minibatch._seed_nodes = sorted_seeds + minibatch.compacted_seeds = sorted_compacted + minibatch._seeds_offsets = sorted_offsets + else: + minibatch._seeds_offsets = {"_N": minibatch._seeds_offsets} + counts_sent = torch.empty(world_size * num_ntypes, dtype=torch.int64) + for i, offsets in 
enumerate(minibatch._seeds_offsets[0].values()): + counts_sent[ + torch.arange(i, world_size * num_ntypes, num_ntypes) + ] = offsets.diff() + delattr(minibatch, "_seeds_offsets") + counts_received = torch.empty_like(counts_sent) + minibatch._counts_future = all_to_all( + counts_received.split(num_ntypes), + counts_sent.split(num_ntypes), + group=group, + async_op=True, + ) + minibatch._counts_sent = counts_sent + minibatch._counts_received = counts_received + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_2(minibatch, group=None): + world_size = thd.get_world_size(group) + seeds = minibatch._seed_nodes + minibatch._counts_future.wait() + delattr(minibatch, "_counts_future") + counts_received = minibatch._counts_received + num_ntypes = len(seeds.keys()) + seeds_received = {} + counts_sent = {} + counts_received = {} + for i, (ntype, typed_seeds) in enumerate(seeds.items()): + idx = torch.arange(i, world_size * num_ntypes, num_ntypes) + typed_counts_sent = minibatch._counts_sent[idx].tolist() + typed_counts_received = minibatch._counts_received[idx].tolist() + typed_seeds_received = typed_seeds.new_empty( + sum(typed_counts_received) + ) + all_to_all( + typed_seeds_received.split(typed_counts_received), + typed_seeds.split(typed_counts_sent), + group, + ) + seeds_received[ntype] = typed_seeds_received + minibatch._seed_nodes = _revert_to_homo(seeds_received) + minibatch._counts_sent = _revert_to_homo(counts_sent) + minibatch._counts_received = _revert_to_homo(counts_received) + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_3(minibatch): + minibatch._unique_future = unique_and_compact( + minibatch._seed_nodes, 0, 1, async_op=True + ) + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_4(minibatch): + unique_seeds, inverse_seeds, _ = minibatch._unique_future.wait() + delattr(minibatch, "_unique_future") + minibatch._seed_nodes = _revert_to_homo(unique_seeds) + minibatch._seed_inverse_ids = _revert_to_homo(inverse_seeds) return minibatch def _sample(self, minibatch): @@ -119,7 +264,12 @@ def sampling_stages(self, datapipe): return datapipe.transform(self._sample) @staticmethod - def _seeds_preprocess(minibatch, async_op): + def _seeds_preprocess( + minibatch: MiniBatch, + rank: int = 0, + world_size: int = 1, + async_op: bool = False, + ): """Preprocess `seeds` in a minibatch to construct `unique_seeds`, `node_timestamp` and `compacted_seeds` for further sampling. It optionally incorporates timestamps for temporal graphs, organizing and @@ -130,6 +280,11 @@ def _seeds_preprocess(minibatch, async_op): ---------- minibatch: MiniBatch The minibatch. + rank : int + The rank of the current process among cooperating processes. + world_size : int + The number of cooperating + (`arXiv:2210.13339`__) processes. async_op: bool Boolean indicating whether the call is asynchronous. If so, the result can be obtained by calling wait on the returned future. @@ -145,8 +300,16 @@ def _seeds_preprocess(minibatch, async_op): compacted_seeds: torch.tensor or a Dict[str, torch.Tensor] Representation of compacted seeds corresponding to 'seeds', where all node ids inside are compacted. + offsets: None or torch.Tensor or Dict[src, torch.Tensor] + The unique nodes offsets tensor partitions the unique_nodes tensor. + Has size `world_size + 1` and + `unique_nodes[offsets[i]: offsets[i + 1]]` belongs to the rank + `(rank + i) % world_size`. 
""" use_timestamp = hasattr(minibatch, "timestamp") + assert ( + not use_timestamp or world_size == 1 + ), "Temporal code path does not currently support Cooperative Minibatching" seeds = minibatch.seeds is_heterogeneous = isinstance(seeds, Dict) if is_heterogeneous: @@ -164,7 +327,7 @@ def _seeds_preprocess(minibatch, async_op): if hasattr(minibatch, "timestamp") else None ) - result = _NoOpWaiter((seeds, nodes_timestamp, None)) + result = _NoOpWaiter((seeds, nodes_timestamp, None, None)) break result = None assert typed_seeds.ndim == 2, ( @@ -200,7 +363,7 @@ def __init__(self, nodes, nodes_timestamp, seeds): ) else: self.future = unique_and_compact( - nodes, async_op=async_op + nodes, rank, world_size, async_op ) self.seeds = seeds @@ -208,8 +371,9 @@ def wait(self): """Returns the stored value when invoked.""" if use_timestamp: unique_seeds, nodes_timestamp, compacted = self.future + offsets = None else: - unique_seeds, compacted, _ = ( + unique_seeds, compacted, offsets = ( self.future.wait() if async_op else self.future ) nodes_timestamp = None @@ -234,6 +398,7 @@ def wait(self): unique_seeds, nodes_timestamp, compacted_seeds, + offsets, ) # When typed_seeds is not a one-dimensional tensor @@ -248,7 +413,7 @@ def wait(self): if hasattr(minibatch, "timestamp") else None ) - result = _NoOpWaiter((seeds, nodes_timestamp, None)) + result = _NoOpWaiter((seeds, nodes_timestamp, None, None)) else: # Collect nodes from all types of input. nodes = [seeds.view(-1)] @@ -289,8 +454,9 @@ def wait(self): nodes_timestamp, compacted, ) = self.future + offsets = None else: - unique_seeds, compacted, _ = ( + unique_seeds, compacted, offsets = ( self.future.wait() if async_op else self.future ) nodes_timestamp = None @@ -305,6 +471,7 @@ def wait(self): unique_seeds, nodes_timestamp, compacted_seeds, + offsets, ) result = _Waiter(nodes, nodes_timestamp, seeds) diff --git a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py index 4a095ac1b10d..5326e620b0c3 100644 --- a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py +++ b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py @@ -65,7 +65,9 @@ def test_NeighborSampler_GraphFetch( graph.type_per_edge = None item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) fanout = torch.LongTensor([2]) - preprocess_fn = partial(gb.SubgraphSampler._preprocess, async_op=False) + preprocess_fn = partial( + gb.SubgraphSampler._preprocess, cooperative=False, async_op=False + ) datapipe = item_sampler.map(preprocess_fn) datapipe = datapipe.map( partial(gb.NeighborSampler._prepare, graph.node_type_to_id) From 165e2507e78bb50760d6a9e16cf2dc31b986c27a Mon Sep 17 00:00:00 2001 From: Wenxuan Cao <90617523+CfromBU@users.noreply.github.com> Date: Thu, 12 Sep 2024 16:55:33 +0800 Subject: [PATCH 02/10] [DistGB] modify partition.py to save FusedCSCSamplingGraph directly (#7727) Co-authored-by: Ubuntu Co-authored-by: Ubuntu Co-authored-by: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> --- python/dgl/distributed/partition.py | 535 +++++++++++++++++++--------- 1 file changed, 371 insertions(+), 164 deletions(-) diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index 08343df5505a..079ed8806a96 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -88,24 +88,26 @@ def _dump_part_config(part_config, part_metadata): json.dump(part_metadata, outfile, sort_keys=False, indent=4) -def 
_save_graphs(filename, g_list, formats=None, sort_etypes=False): +def process_partitions(g, formats=None, sort_etypes=False): """Preprocess partitions before saving: 1. format data types. 2. sort csc/csr by tag. """ - for g in g_list: - for k, dtype in RESERVED_FIELD_DTYPE.items(): - if k in g.ndata: - g.ndata[k] = F.astype(g.ndata[k], dtype) - if k in g.edata: - g.edata[k] = F.astype(g.edata[k], dtype) - for g in g_list: - if (not sort_etypes) or (formats is None): - continue + for k, dtype in RESERVED_FIELD_DTYPE.items(): + if k in g.ndata: + g.ndata[k] = F.astype(g.ndata[k], dtype) + if k in g.edata: + g.edata[k] = F.astype(g.edata[k], dtype) + + if (sort_etypes) and (formats is not None): if "csr" in formats: g = sort_csr_by_tag(g, tag=g.edata[ETYPE], tag_type="edge") if "csc" in formats: g = sort_csc_by_tag(g, tag=g.edata[ETYPE], tag_type="edge") + return g + + +def _save_dgl_graphs(filename, g_list, formats=None): save_graphs(filename, g_list, formats=formats) @@ -332,9 +334,10 @@ def load_partition(part_config, part_id, load_feats=True, use_graphbolt=False): "part-{}".format(part_id) in part_metadata ), "part-{} does not exist".format(part_id) part_files = part_metadata["part-{}".format(part_id)] - part_graph_field = "part_graph" if use_graphbolt: part_graph_field = "part_graph_graphbolt" + else: + part_graph_field = "part_graph" assert ( part_graph_field in part_files ), f"the partition does not contain graph structure: {part_graph_field}" @@ -461,7 +464,7 @@ def load_partition_feats( return node_feats, edge_feats -def load_partition_book(part_config, part_id): +def load_partition_book(part_config, part_id, part_metadata=None): """Load a graph partition book from the partition config file. Parameters @@ -470,6 +473,8 @@ def load_partition_book(part_config, part_id): The path of the partition config file. part_id : int The partition ID. + part_metadata : dict + The meta data of partition. Returns ------- @@ -482,7 +487,8 @@ def load_partition_book(part_config, part_id): dict The edge types """ - part_metadata = _load_part_config(part_config) + if part_metadata is None: + part_metadata = _load_part_config(part_config) assert "num_parts" in part_metadata, "num_parts does not exist." 
assert ( part_metadata["num_parts"] > part_id @@ -666,6 +672,38 @@ def _set_trainer_ids(g, sim_g, node_parts): g.edges[c_etype].data["trainer_id"] = trainer_id +def _partition_to_graphbolt( + parts, + part_i, + part_config, + part_metadata, + *, + store_eids=True, + store_inner_node=False, + store_inner_edge=False, + graph_formats=None, +): + gpb, _, ntypes, etypes = load_partition_book( + part_config=part_config, part_id=part_i, part_metadata=part_metadata + ) + graph = parts[part_i] + csc_graph = gb_convert_single_dgl_partition( + ntypes=ntypes, + etypes=etypes, + gpb=gpb, + part_meta=part_metadata, + graph=graph, + store_eids=store_eids, + store_inner_edge=store_inner_edge, + store_inner_node=store_inner_node, + graph_formats=graph_formats, + ) + rel_path_result = _save_graph_gb( + part_config=part_config, part_id=part_i, csc_graph=csc_graph + ) + part_metadata[f"part-{part_i}"]["part_graph_graphbolt"] = rel_path_result + + def _update_node_edge_map(node_map_val, edge_map_val, g, num_parts): """ If the original graph contains few nodes or edges for specific node/edge @@ -1303,6 +1341,7 @@ def get_homogeneous(g, balance_ntypes): "ntypes": ntypes, "etypes": etypes, } + part_config = os.path.join(out_path, graph_name + ".json") for part_id in range(num_parts): part = parts[part_id] @@ -1425,30 +1464,54 @@ def get_homogeneous(g, balance_ntypes): part_dir = os.path.join(out_path, "part" + str(part_id)) node_feat_file = os.path.join(part_dir, "node_feat.dgl") edge_feat_file = os.path.join(part_dir, "edge_feat.dgl") - part_graph_file = os.path.join(part_dir, "graph.dgl") - part_metadata["part-{}".format(part_id)] = { - "node_feats": os.path.relpath(node_feat_file, out_path), - "edge_feats": os.path.relpath(edge_feat_file, out_path), - "part_graph": os.path.relpath(part_graph_file, out_path), - } + os.makedirs(part_dir, mode=0o775, exist_ok=True) save_tensors(node_feat_file, node_feats) save_tensors(edge_feat_file, edge_feats) + part_metadata["part-{}".format(part_id)] = { + "node_feats": os.path.relpath(node_feat_file, out_path), + "edge_feats": os.path.relpath(edge_feat_file, out_path), + } sort_etypes = len(g.etypes) > 1 - _save_graphs( - part_graph_file, - [part], - formats=graph_formats, - sort_etypes=sort_etypes, - ) - print( - "Save partitions: {:.3f} seconds, peak memory: {:.3f} GB".format( - time.time() - start, get_peak_mem() - ) - ) + part = process_partitions(part, graph_formats, sort_etypes) + + # transmit to graphbolt and save graph + if use_graphbolt: + # save FusedCSCSamplingGraph + kwargs["graph_formats"] = graph_formats + n_jobs = kwargs.pop("n_jobs", 1) + mp_ctx = mp.get_context("spawn") + with concurrent.futures.ProcessPoolExecutor( # pylint: disable=unexpected-keyword-arg + max_workers=min(num_parts, n_jobs), + mp_context=mp_ctx, + ) as executor: + for part_id in range(num_parts): + executor.submit( + _partition_to_graphbolt( + part_i=part_id, + part_config=part_config, + part_metadata=part_metadata, + parts=parts, + **kwargs, + ) + ) + part_metadata["node_map_dtype"] = "int64" + part_metadata["edge_map_dtype"] = "int64" + else: + for part_id, part in parts.items(): + part_dir = os.path.join(out_path, "part" + str(part_id)) + part_graph_file = os.path.join(part_dir, "graph.dgl") + part_metadata["part-{}".format(part_id)][ + "part_graph" + ] = os.path.relpath(part_graph_file, out_path) + # save DGLGraph + _save_dgl_graphs( + part_graph_file, + [part], + formats=graph_formats, + ) - part_config = os.path.join(out_path, graph_name + ".json") _dump_part_config(part_config, 
part_metadata) num_cuts = sim_g.num_edges() - tot_num_inner_edges @@ -1460,12 +1523,11 @@ def get_homogeneous(g, balance_ntypes): ) ) - if use_graphbolt: - kwargs["graph_formats"] = graph_formats - dgl_partition_to_graphbolt( - part_config, - **kwargs, + print( + "Save partitions: {:.3f} seconds, peak memory: {:.3f} GB".format( + time.time() - start, get_peak_mem() ) + ) if return_mapping: return orig_nids, orig_eids @@ -1513,20 +1575,142 @@ def init_type_per_edge(graph, gpb): return etype_ids -def gb_convert_single_dgl_partition( - part_id, - graph_formats, - part_config, - store_eids, +def _load_part(part_config, part_id, parts=None): + """load parts from variable or dist.""" + if parts is None: + graph, _, _, _, _, _, _ = load_partition( + part_config, part_id, load_feats=False + ) + else: + graph = parts[part_id] + return graph + + +def _save_graph_gb(part_config, part_id, csc_graph): + csc_graph_save_dir = os.path.join( + os.path.dirname(part_config), + f"part{part_id}", + ) + csc_graph_path = os.path.join( + csc_graph_save_dir, "fused_csc_sampling_graph.pt" + ) + torch.save(csc_graph, csc_graph_path) + + return os.path.relpath(csc_graph_path, os.path.dirname(part_config)) + + +def cast_various_to_minimum_dtype_gb( + graph, + part_meta, + num_parts, + indptr, + indices, + type_per_edge, + etypes, + ntypes, + node_attributes, + edge_attributes, +): + """Cast various data to minimum dtype.""" + # Cast 1: indptr. + indptr = _cast_to_minimum_dtype(graph.num_edges(), indptr) + # Cast 2: indices. + indices = _cast_to_minimum_dtype(graph.num_nodes(), indices) + # Cast 3: type_per_edge. + type_per_edge = _cast_to_minimum_dtype( + len(etypes), type_per_edge, field=ETYPE + ) + # Cast 4: node/edge_attributes. + predicates = { + NID: part_meta["num_nodes"], + "part_id": num_parts, + NTYPE: len(ntypes), + EID: part_meta["num_edges"], + ETYPE: len(etypes), + DGL2GB_EID: part_meta["num_edges"], + GB_DST_ID: part_meta["num_nodes"], + } + for attributes in [node_attributes, edge_attributes]: + for key in attributes: + if key not in predicates: + continue + attributes[key] = _cast_to_minimum_dtype( + predicates[key], attributes[key], field=key + ) + return indptr, indices, type_per_edge + + +def _create_attributes_gb( + graph, + gpb, + edge_ids, + is_homo, store_inner_node, store_inner_edge, + store_eids, + debug_mode, +): + # Save node attributes. Detailed attributes are shown below. + # DGL_GB\Attributes dgl.NID("_ID") dgl.NTYPE("_TYPE") "inner_node" "part_id" + # DGL_Homograph ✅ 🚫 ✅ ✅ + # GB_Homograph ✅ 🚫 optional 🚫 + # DGL_Heterograph ✅ ✅ ✅ ✅ + # GB_Heterograph ✅ 🚫 optional 🚫 + required_node_attrs = [NID] + if store_inner_node: + required_node_attrs.append("inner_node") + if debug_mode: + required_node_attrs = list(graph.ndata.keys()) + node_attributes = {attr: graph.ndata[attr] for attr in required_node_attrs} + + # Save edge attributes. Detailed attributes are shown below. 
+ # DGL_GB\Attributes dgl.EID("_ID") dgl.ETYPE("_TYPE") "inner_edge" + # DGL_Homograph ✅ 🚫 ✅ + # GB_Homograph optional 🚫 optional + # DGL_Heterograph ✅ ✅ ✅ + # GB_Heterograph optional ✅ optional + type_per_edge = None + if not is_homo: + type_per_edge = init_type_per_edge(graph, gpb)[edge_ids] + type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE]) + required_edge_attrs = [] + if store_eids: + required_edge_attrs.append(EID) + if store_inner_edge: + required_edge_attrs.append("inner_edge") + if debug_mode: + required_edge_attrs = list(graph.edata.keys()) + edge_attributes = { + attr: graph.edata[attr][edge_ids] for attr in required_edge_attrs + } + return node_attributes, edge_attributes, type_per_edge + + +def gb_convert_single_dgl_partition( + ntypes, + etypes, + gpb, + part_meta, + graph, + graph_formats=None, + store_eids=False, + store_inner_node=False, + store_inner_edge=False, ): """Converts a single DGL partition to GraphBolt. Parameters ---------- - part_id : int - The numerical ID of the partition to convert. + node types : dict + The node types + edge types : dict + The edge types + gpb : GraphPartitionBook + The global partition information. + part_meta : dict + Contain the meta data of the partition. + graph : DGLGraph + The graph to be converted to graphbolt graph. graph_formats : str or list[str], optional Save partitions in specified formats. It could be any combination of `coo`, `csc`. As `csc` format is mandatory for `FusedCSCSamplingGraph`, @@ -1547,14 +1731,8 @@ def gb_convert_single_dgl_partition( "Running in debug mode which means all attributes of DGL partitions" " will be saved to the new format." ) - - part_meta = _load_part_config(part_config) num_parts = part_meta["num_parts"] - graph, _, _, gpb, _, _, _ = load_partition( - part_config, part_id, load_feats=False - ) - _, _, ntypes, etypes = load_partition_book(part_config, part_id) is_homo = is_homogeneous(ntypes, etypes) node_type_to_id = ( None if is_homo else {ntype: ntid for ntid, ntype in enumerate(ntypes)} @@ -1569,39 +1747,16 @@ def gb_convert_single_dgl_partition( # Obtain CSC indtpr and indices. indptr, indices, edge_ids = graph.adj_tensors("csc") - # Save node attributes. Detailed attributes are shown below. - # DGL_GB\Attributes dgl.NID("_ID") dgl.NTYPE("_TYPE") "inner_node" "part_id" - # DGL_Homograph ✅ 🚫 ✅ ✅ - # GB_Homograph ✅ 🚫 optional 🚫 - # DGL_Heterograph ✅ ✅ ✅ ✅ - # GB_Heterograph ✅ 🚫 optional 🚫 - required_node_attrs = [NID] - if store_inner_node: - required_node_attrs.append("inner_node") - if debug_mode: - required_node_attrs = list(graph.ndata.keys()) - node_attributes = {attr: graph.ndata[attr] for attr in required_node_attrs} - - # Save edge attributes. Detailed attributes are shown below. 
- # DGL_GB\Attributes dgl.EID("_ID") dgl.ETYPE("_TYPE") "inner_edge" - # DGL_Homograph ✅ 🚫 ✅ - # GB_Homograph optional 🚫 optional - # DGL_Heterograph ✅ ✅ ✅ - # GB_Heterograph optional ✅ optional - type_per_edge = None - if not is_homo: - type_per_edge = init_type_per_edge(graph, gpb)[edge_ids] - type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE]) - required_edge_attrs = [] - if store_eids: - required_edge_attrs.append(EID) - if store_inner_edge: - required_edge_attrs.append("inner_edge") - if debug_mode: - required_edge_attrs = list(graph.edata.keys()) - edge_attributes = { - attr: graph.edata[attr][edge_ids] for attr in required_edge_attrs - } + node_attributes, edge_attributes, type_per_edge = _create_attributes_gb( + graph, + gpb, + edge_ids, + is_homo, + store_inner_node, + store_inner_edge, + store_eids, + debug_mode, + ) # When converting DGLGraph to FusedCSCSamplingGraph, edge IDs are # re-ordered(actually FusedCSCSamplingGraph does not have edge IDs # in nature). So we need to save such re-order info for any @@ -1623,32 +1778,18 @@ def gb_convert_single_dgl_partition( indptr, dtype=indices.dtype ) - # Cast various data to minimum dtype. - # Cast 1: indptr. - indptr = _cast_to_minimum_dtype(graph.num_edges(), indptr) - # Cast 2: indices. - indices = _cast_to_minimum_dtype(graph.num_nodes(), indices) - # Cast 3: type_per_edge. - type_per_edge = _cast_to_minimum_dtype( - len(etypes), type_per_edge, field=ETYPE + indptr, indices, type_per_edge = cast_various_to_minimum_dtype_gb( + graph, + part_meta, + num_parts, + indptr, + indices, + type_per_edge, + etypes, + ntypes, + node_attributes, + edge_attributes, ) - # Cast 4: node/edge_attributes. - predicates = { - NID: part_meta["num_nodes"], - "part_id": num_parts, - NTYPE: len(ntypes), - EID: part_meta["num_edges"], - ETYPE: len(etypes), - DGL2GB_EID: part_meta["num_edges"], - GB_DST_ID: part_meta["num_nodes"], - } - for attributes in [node_attributes, edge_attributes]: - for key in attributes: - if key not in predicates: - continue - attributes[key] = _cast_to_minimum_dtype( - predicates[key], attributes[key], field=key - ) csc_graph = gb.fused_csc_sampling_graph( indptr, @@ -1660,17 +1801,128 @@ def gb_convert_single_dgl_partition( node_type_to_id=node_type_to_id, edge_type_to_id=edge_type_to_id, ) - orig_graph_path = os.path.join( - os.path.dirname(part_config), - part_meta[f"part-{part_id}"]["part_graph"], + return csc_graph + + +def _convert_partition_to_graphbolt( + part_config, + part_id, + graph_formats=None, + store_eids=False, + store_inner_node=False, + store_inner_edge=False, +): + """ + The pipeline converting signle partition to graphbolt. + + Parameters + ---------- + part_config : str + The path of the partition config file. + part_id : int + The partition ID. + graph_formats : str or list[str], optional + Save partitions in specified formats. It could be any combination of + `coo`, `csc`. As `csc` format is mandatory for `FusedCSCSamplingGraph`, + it is not necessary to specify this argument. It's mainly for + specifying `coo` format to save edge ID mapping and destination node + IDs. If not specified, whether to save `coo` format is determined by + the availability of the format in DGL partitions. Default: None. + store_eids : bool, optional + Whether to store edge IDs in the new graph. Default: True. + store_inner_node : bool, optional + Whether to store inner node mask in the new graph. Default: False. + store_inner_edge : bool, optional + Whether to store inner edge mask in the new graph. Default: False. 
+ + Returns + ------- + str + The path csc_graph to save. + """ + gpb, _, ntypes, etypes = load_partition_book( + part_config=part_config, part_id=part_id ) - csc_graph_path = os.path.join( - os.path.dirname(orig_graph_path), "fused_csc_sampling_graph.pt" + part = _load_part(part_config, part_id) + part_meta = copy.deepcopy(_load_part_config(part_config)) + csc_graph = gb_convert_single_dgl_partition( + graph=part, + ntypes=ntypes, + etypes=etypes, + gpb=gpb, + part_meta=part_meta, + graph_formats=graph_formats, + store_eids=store_eids, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, ) - torch.save(csc_graph, csc_graph_path) + rel_path = _save_graph_gb(part_config, part_id, csc_graph) + return rel_path - return os.path.relpath(csc_graph_path, os.path.dirname(part_config)) - # Update graph path. + +def _convert_partition_to_graphbolt_wrapper( + graph_formats, + part_config, + store_eids, + store_inner_node, + store_inner_edge, + n_jobs, + num_parts, +): + # [Rui] DGL partitions are always saved as homogeneous graphs even though + # the original graph is heterogeneous. But heterogeneous information like + # node/edge types are saved as node/edge data alongside with partitions. + # What needs more attention is that due to the existence of HALO nodes in + # each partition, the local node IDs are not sorted according to the node + # types. So we fail to assign ``node_type_offset`` as required by GraphBolt. + # But this is not a problem since such information is not used in sampling. + # We can simply pass None to it. + + # Iterate over partitions. + convert_with_format = partial( + _convert_partition_to_graphbolt, + part_config=part_config, + graph_formats=graph_formats, + store_eids=store_eids, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, + ) + # Need to create entirely new interpreters, because we call C++ downstream + # See https://docs.python.org/3.12/library/multiprocessing.html#contexts-and-start-methods + # and https://pybind11.readthedocs.io/en/stable/advanced/misc.html#global-interpreter-lock-gil + rel_path_results = [] + if n_jobs > 1 and num_parts > 1: + mp_ctx = mp.get_context("spawn") + with concurrent.futures.ProcessPoolExecutor( # pylint: disable=unexpected-keyword-arg + max_workers=min(num_parts, n_jobs), + mp_context=mp_ctx, + ) as executor: + for part_id in range(num_parts): + rel_path_results.append( + executor.submit( + convert_with_format, part_id=part_id + ).result() + ) + + else: + # If running single-threaded, avoid spawning new interpreter, which is slow + for part_id in range(num_parts): + rel_path = convert_with_format(part_id=part_id) + rel_path_results.append(rel_path) + part_meta = _load_part_config(part_config) + for part_id in range(num_parts): + # Update graph path. + part_meta[f"part-{part_id}"]["part_graph_graphbolt"] = rel_path_results[ + part_id + ] + + # Save dtype info into partition config. + # [TODO][Rui] Always use int64_t for node/edge IDs in GraphBolt. See more + # details in #7175. + part_meta["node_map_dtype"] = "int64" + part_meta["edge_map_dtype"] = "int64" + + return part_meta def dgl_partition_to_graphbolt( @@ -1719,59 +1971,14 @@ def dgl_partition_to_graphbolt( " will be saved to the new format." ) part_meta = _load_part_config(part_config) - new_part_meta = copy.deepcopy(part_meta) num_parts = part_meta["num_parts"] - - # [Rui] DGL partitions are always saved as homogeneous graphs even though - # the original graph is heterogeneous. 
But heterogeneous information like - # node/edge types are saved as node/edge data alongside with partitions. - # What needs more attention is that due to the existence of HALO nodes in - # each partition, the local node IDs are not sorted according to the node - # types. So we fail to assign ``node_type_offset`` as required by GraphBolt. - # But this is not a problem since such information is not used in sampling. - # We can simply pass None to it. - - # Iterate over partitions. - convert_with_format = partial( - gb_convert_single_dgl_partition, + part_meta = _convert_partition_to_graphbolt_wrapper( graph_formats=graph_formats, part_config=part_config, store_eids=store_eids, store_inner_node=store_inner_node, store_inner_edge=store_inner_edge, + n_jobs=n_jobs, + num_parts=num_parts, ) - # Need to create entirely new interpreters, because we call C++ downstream - # See https://docs.python.org/3.12/library/multiprocessing.html#contexts-and-start-methods - # and https://pybind11.readthedocs.io/en/stable/advanced/misc.html#global-interpreter-lock-gil - rel_path_results = [] - if n_jobs > 1 and num_parts > 1: - mp_ctx = mp.get_context("spawn") - with concurrent.futures.ProcessPoolExecutor( # pylint: disable=unexpected-keyword-arg - max_workers=min(num_parts, n_jobs), - mp_context=mp_ctx, - ) as executor: - futures = [] - for part_id in range(num_parts): - futures.append(executor.submit(convert_with_format, part_id)) - - for part_id in range(num_parts): - rel_path_results.append(futures[part_id].result()) - else: - # If running single-threaded, avoid spawning new interpreter, which is slow - for part_id in range(num_parts): - rel_path_results.append(convert_with_format(part_id)) - - for part_id in range(num_parts): - # Update graph path. - new_part_meta[f"part-{part_id}"][ - "part_graph_graphbolt" - ] = rel_path_results[part_id] - - # Save dtype info into partition config. - # [TODO][Rui] Always use int64_t for node/edge IDs in GraphBolt. See more - # details in #7175. - new_part_meta["node_map_dtype"] = "int64" - new_part_meta["edge_map_dtype"] = "int64" - - _dump_part_config(part_config, new_part_meta) - print(f"Converted partitions to GraphBolt format into {part_config}") + _dump_part_config(part_config, part_meta) From 189b83c28c702aecce512629d4c1d6d4a538f032 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 12 Sep 2024 17:12:31 -0400 Subject: [PATCH 03/10] [GraphBolt][CUDA] Get `world_size=1` somewhat for cooperative sampling. 
(#7796) --- python/dgl/graphbolt/impl/neighbor_sampler.py | 134 ++++++++++++++++-- python/dgl/graphbolt/subgraph_sampler.py | 94 ++++++++---- .../pytorch/graphbolt/test_dataloader.py | 27 +++- 3 files changed, 214 insertions(+), 41 deletions(-) diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 6ba83941bbde..352dedc067f2 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -3,6 +3,7 @@ from functools import partial import torch +import torch.distributed as thd from torch.utils.data import functional_datapipe from torch.utils.data.datapipes.iter import Mapper @@ -12,10 +13,14 @@ index_select, ORIGINAL_EDGE_ID, ) -from ..internal import compact_csc_format, unique_and_compact_csc_formats +from ..internal import ( + compact_csc_format, + unique_and_compact, + unique_and_compact_csc_formats, +) from ..minibatch_transformer import MiniBatchTransformer -from ..subgraph_sampler import SubgraphSampler +from ..subgraph_sampler import all_to_all, revert_to_homo, SubgraphSampler from .fused_csc_sampling_graph import fused_csc_sampling_graph from .sampled_subgraph_impl import SampledSubgraphImpl @@ -455,12 +460,32 @@ def _subtract_hetero_indices_offset( class CompactPerLayer(MiniBatchTransformer): """Compact the sampled edges for a single layer.""" - def __init__(self, datapipe, deduplicate, asynchronous=False): + def __init__( + self, datapipe, deduplicate, cooperative=False, asynchronous=False + ): self.deduplicate = deduplicate + self.cooperative = cooperative if asynchronous and deduplicate: datapipe = datapipe.transform(self._compact_per_layer_async) datapipe = datapipe.buffer() - super().__init__(datapipe, self._compact_per_layer_wait_future) + datapipe = datapipe.transform(self._compact_per_layer_wait_future) + if cooperative: + datapipe = datapipe.transform( + self._seeds_cooperative_exchange_1 + ) + datapipe = datapipe.buffer() + datapipe = datapipe.transform( + self._seeds_cooperative_exchange_2 + ) + datapipe = datapipe.buffer() + datapipe = datapipe.transform( + self._seeds_cooperative_exchange_3 + ) + datapipe = datapipe.buffer() + datapipe = datapipe.transform( + self._seeds_cooperative_exchange_4 + ) + super().__init__(datapipe) else: super().__init__(datapipe, self._compact_per_layer) @@ -498,19 +523,20 @@ def _compact_per_layer_async(self, minibatch): subgraph = minibatch.sampled_subgraphs[0] seeds = minibatch._seed_nodes assert self.deduplicate + rank = thd.get_rank() if self.cooperative else 0 + world_size = thd.get_world_size() if self.cooperative else 1 minibatch._future = unique_and_compact_csc_formats( - subgraph.sampled_csc, seeds, async_op=True + subgraph.sampled_csc, seeds, rank, world_size, async_op=True ) return minibatch - @staticmethod - def _compact_per_layer_wait_future(minibatch): + def _compact_per_layer_wait_future(self, minibatch): subgraph = minibatch.sampled_subgraphs[0] seeds = minibatch._seed_nodes ( original_row_node_ids, compacted_csc_format, - _, + seeds_offsets, ) = minibatch._future.wait() delattr(minibatch, "_future") subgraph = SampledSubgraphImpl( @@ -521,6 +547,87 @@ def _compact_per_layer_wait_future(minibatch): ) minibatch._seed_nodes = original_row_node_ids minibatch.sampled_subgraphs[0] = subgraph + if self.cooperative: + subgraph._seeds_offsets = seeds_offsets + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_1(minibatch): + world_size = thd.get_world_size() + subgraph = minibatch.sampled_subgraphs[0] + 
seeds_offsets = subgraph._seeds_offsets + is_homogeneous = not isinstance(seeds_offsets, dict) + if is_homogeneous: + seeds_offsets = {"_N": seeds_offsets} + num_ntypes = len(seeds_offsets) + counts_sent = torch.empty(world_size * num_ntypes, dtype=torch.int64) + for i, offsets in enumerate(seeds_offsets.values()): + counts_sent[ + torch.arange(i, world_size * num_ntypes, num_ntypes) + ] = offsets.diff() + counts_received = torch.empty_like(counts_sent) + subgraph._counts_future = all_to_all( + counts_received.split(num_ntypes), + counts_sent.split(num_ntypes), + async_op=True, + ) + subgraph._counts_sent = counts_sent + subgraph._counts_received = counts_received + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_2(minibatch): + world_size = thd.get_world_size() + seeds = minibatch._seed_nodes + is_homogenous = not isinstance(seeds, dict) + if is_homogenous: + seeds = {"_N": seeds} + subgraph = minibatch.sampled_subgraphs[0] + subgraph._counts_future.wait() + delattr(subgraph, "_counts_future") + num_ntypes = len(seeds.keys()) + seeds_received = {} + counts_sent = {} + counts_received = {} + for i, (ntype, typed_seeds) in enumerate(seeds.items()): + idx = torch.arange(i, world_size * num_ntypes, num_ntypes) + typed_counts_sent = subgraph._counts_sent[idx].tolist() + typed_counts_received = subgraph._counts_received[idx].tolist() + typed_seeds_received = typed_seeds.new_empty( + sum(typed_counts_received) + ) + all_to_all( + typed_seeds_received.split(typed_counts_received), + typed_seeds.split(typed_counts_sent), + ) + seeds_received[ntype] = typed_seeds_received + subgraph._seeds_received = seeds_received + subgraph._counts_sent = revert_to_homo(counts_sent) + subgraph._counts_received = revert_to_homo(counts_received) + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_3(minibatch): + subgraph = minibatch.sampled_subgraphs[0] + nodes = { + ntype: [typed_seeds] + for ntype, typed_seeds in subgraph._seeds_received.items() + } + minibatch._unique_future = unique_and_compact( + nodes, 0, 1, async_op=True + ) + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_4(minibatch): + unique_seeds, inverse_seeds, _ = minibatch._unique_future.wait() + delattr(minibatch, "_unique_future") + inverse_seeds = { + ntype: typed_inv[0] for ntype, typed_inv in inverse_seeds.items() + } + minibatch._seed_nodes = revert_to_homo(unique_seeds) + subgraph = minibatch.sampled_subgraphs[0] + subgraph._seed_inverse_ids = revert_to_homo(inverse_seeds) return minibatch @@ -541,6 +648,7 @@ def __init__( overlap_fetch, num_gpu_cached_edges, gpu_cache_threshold, + cooperative, asynchronous, layer_dependency=None, batch_dependency=None, @@ -561,6 +669,7 @@ def __init__( deduplicate, sampler, overlap_fetch, + cooperative=cooperative, asynchronous=asynchronous, layer_dependency=layer_dependency, ) @@ -637,6 +746,7 @@ def sampling_stages( deduplicate, sampler, overlap_fetch, + cooperative, asynchronous, layer_dependency, ): @@ -653,7 +763,9 @@ def sampling_stages( datapipe = datapipe.sample_per_layer( sampler, fanout, replace, prob_name, overlap_fetch, asynchronous ) - datapipe = datapipe.compact_per_layer(deduplicate, asynchronous) + datapipe = datapipe.compact_per_layer( + deduplicate, cooperative, asynchronous + ) if is_labor and not layer_dependency: datapipe = datapipe.transform(self._increment_seed) if is_labor: @@ -775,6 +887,7 @@ def __init__( overlap_fetch=False, num_gpu_cached_edges=0, gpu_cache_threshold=1, + cooperative=False, asynchronous=False, ): 
super().__init__( @@ -788,6 +901,7 @@ def __init__( overlap_fetch, num_gpu_cached_edges, gpu_cache_threshold, + cooperative, asynchronous, ) @@ -937,6 +1051,7 @@ def __init__( overlap_fetch=False, num_gpu_cached_edges=0, gpu_cache_threshold=1, + cooperative=False, asynchronous=False, ): super().__init__( @@ -950,6 +1065,7 @@ def __init__( overlap_fetch, num_gpu_cached_edges, gpu_cache_threshold, + cooperative, asynchronous, layer_dependency, batch_dependency, diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index 88fdd3808714..556950982fb7 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -15,6 +15,8 @@ __all__ = [ "SubgraphSampler", + "all_to_all", + "revert_to_homo", ] @@ -41,10 +43,48 @@ def all_to_all(outputs, inputs, group=None, async_op=False): `rank, ..., world_size - 1, 0, ..., rank - 1` and we make it `0, world_size - 1` before calling `thd.all_to_all`.""" shift_fn = partial(_shift, group=group) - return thd.all_to_all(shift_fn(outputs), shift_fn(inputs), group, async_op) - - -def _revert_to_homo(d: dict): + outputs = shift_fn(list(outputs)) + inputs = shift_fn(list(inputs)) + if outputs[0].is_cuda: + return thd.all_to_all(outputs, inputs, group, async_op) + # gloo backend will be used. + outputs_single = torch.cat(outputs) + output_split_sizes = [o.size(0) for o in outputs] + handle = thd.all_to_all_single( + outputs_single, + torch.cat(inputs), + output_split_sizes, + [i.size(0) for i in inputs], + group, + async_op, + ) + temp_outputs = outputs_single.split(output_split_sizes) + + class _Waiter: + def __init__(self, handle, outputs, temp_outputs): + self.handle = handle + self.outputs = outputs + self.temp_outputs = temp_outputs + + def wait(self): + """Returns the stored value when invoked.""" + handle = self.handle + outputs = self.outputs + temp_outputs = self.temp_outputs + # Ensure that there is no leak + self.handle = self.outputs = self.temp_outputs = None + + if handle is not None: + handle.wait() + for output, temp_output in zip(outputs, temp_outputs): + output.copy_(temp_output) + + post_processor = _Waiter(handle, outputs, temp_outputs) + return post_processor if async_op else post_processor.wait() + + +def revert_to_homo(d: dict): + """Utility function to convert a dictionary that stores homogenous data.""" is_homogenous = len(d) == 1 and "_N" in d return list(d.values())[0] if is_homogenous else d @@ -148,45 +188,31 @@ def _wait_preprocess_future(minibatch, cooperative: bool): def _seeds_cooperative_exchange_1(minibatch, group=None): rank = thd.get_rank(group) world_size = thd.get_world_size(group) - assert world_size > 1 seeds = minibatch._seed_nodes is_homogeneous = not isinstance(seeds, dict) if is_homogeneous: seeds = {"_N": seeds} if minibatch._seeds_offsets is None: seeds_list = list(seeds.values()) - ( - sorted_seeds_list, - index_list, - offsets_list, - ) = torch.ops.graphbolt.rank_sort(seeds_list, rank, world_size) + result = torch.ops.graphbolt.rank_sort(seeds_list, rank, world_size) assert minibatch.compacted_seeds is None sorted_seeds, sorted_compacted, sorted_offsets = {}, {}, {} num_ntypes = len(seeds.keys()) for i, ( seed_type, - typed_sorted_seeds, - typed_index, - typed_offsets, - ) in enumerate( - zip( - seeds.keys(), - sorted_seeds_list, - index_list, - offsets_list, - ) - ): + (typed_sorted_seeds, typed_index, typed_offsets), + ) in enumerate(zip(seeds.keys(), result)): sorted_seeds[seed_type] = typed_sorted_seeds sorted_compacted[seed_type] = 
typed_index - sorted_offsets[seed_type] = typed_offsets.tolist() + sorted_offsets[seed_type] = typed_offsets minibatch._seed_nodes = sorted_seeds - minibatch.compacted_seeds = sorted_compacted + minibatch.compacted_seeds = revert_to_homo(sorted_compacted) minibatch._seeds_offsets = sorted_offsets else: minibatch._seeds_offsets = {"_N": minibatch._seeds_offsets} counts_sent = torch.empty(world_size * num_ntypes, dtype=torch.int64) - for i, offsets in enumerate(minibatch._seeds_offsets[0].values()): + for i, offsets in enumerate(minibatch._seeds_offsets.values()): counts_sent[ torch.arange(i, world_size * num_ntypes, num_ntypes) ] = offsets.diff() @@ -208,7 +234,6 @@ def _seeds_cooperative_exchange_2(minibatch, group=None): seeds = minibatch._seed_nodes minibatch._counts_future.wait() delattr(minibatch, "_counts_future") - counts_received = minibatch._counts_received num_ntypes = len(seeds.keys()) seeds_received = {} counts_sent = {} @@ -226,15 +251,19 @@ def _seeds_cooperative_exchange_2(minibatch, group=None): group, ) seeds_received[ntype] = typed_seeds_received - minibatch._seed_nodes = _revert_to_homo(seeds_received) - minibatch._counts_sent = _revert_to_homo(counts_sent) - minibatch._counts_received = _revert_to_homo(counts_received) + minibatch._seed_nodes = seeds_received + minibatch._counts_sent = revert_to_homo(counts_sent) + minibatch._counts_received = revert_to_homo(counts_received) return minibatch @staticmethod def _seeds_cooperative_exchange_3(minibatch): + nodes = { + ntype: [typed_seeds] + for ntype, typed_seeds in minibatch._seed_nodes.items() + } minibatch._unique_future = unique_and_compact( - minibatch._seed_nodes, 0, 1, async_op=True + nodes, 0, 1, async_op=True ) return minibatch @@ -242,8 +271,11 @@ def _seeds_cooperative_exchange_3(minibatch): def _seeds_cooperative_exchange_4(minibatch): unique_seeds, inverse_seeds, _ = minibatch._unique_future.wait() delattr(minibatch, "_unique_future") - minibatch._seed_nodes = _revert_to_homo(unique_seeds) - minibatch._seed_inverse_ids = _revert_to_homo(inverse_seeds) + inverse_seeds = { + ntype: typed_inv[0] for ntype, typed_inv in inverse_seeds.items() + } + minibatch._seed_nodes = revert_to_homo(unique_seeds) + minibatch._seed_inverse_ids = revert_to_homo(inverse_seeds) return minibatch def _sample(self, minibatch): diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index 666ab352d29c..b02c820dd60d 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -1,4 +1,6 @@ +import os import unittest +from sys import platform import backend as F @@ -6,6 +8,7 @@ import dgl.graphbolt import pytest import torch +import torch.distributed as thd from dgl.graphbolt.datapipes import find_dps, traverse_dps @@ -63,6 +66,7 @@ def test_DataLoader(overlap_feature_fetch): @pytest.mark.parametrize("enable_feature_fetch", [True, False]) @pytest.mark.parametrize("overlap_feature_fetch", [True, False]) @pytest.mark.parametrize("overlap_graph_fetch", [True, False]) +@pytest.mark.parametrize("cooperative", [True, False]) @pytest.mark.parametrize("asynchronous", [True, False]) @pytest.mark.parametrize("num_gpu_cached_edges", [0, 1024]) @pytest.mark.parametrize("gpu_cache_threshold", [1, 3]) @@ -71,10 +75,23 @@ def test_gpu_sampling_DataLoader( enable_feature_fetch, overlap_feature_fetch, overlap_graph_fetch, + cooperative, asynchronous, num_gpu_cached_edges, gpu_cache_threshold, ): + if cooperative and not 
thd.is_initialized(): + # On Windows, the init method can only be file. + init_method = ( + f"file:///{os.path.join(os.getcwd(), 'dis_tempfile')}" + if platform == "win32" + else "tcp://127.0.0.1:12345" + ) + thd.init_process_group( + init_method=init_method, + world_size=1, + rank=0, + ) N = 40 B = 4 num_layers = 2 @@ -110,6 +127,7 @@ def test_gpu_sampling_DataLoader( "overlap_fetch": overlap_graph_fetch, "num_gpu_cached_edges": num_gpu_cached_edges, "gpu_cache_threshold": gpu_cache_threshold, + "cooperative": cooperative, "asynchronous": asynchronous, } if i != 0: @@ -118,7 +136,7 @@ def test_gpu_sampling_DataLoader( datapipe, graph, fanouts=[torch.LongTensor([2]) for _ in range(num_layers)], - **kwargs + **kwargs, ) if enable_feature_fetch: datapipe = dgl.graphbolt.FeatureFetcher( @@ -138,6 +156,11 @@ def test_gpu_sampling_DataLoader( bufferer_cnt += 2 * num_layers if asynchronous: bufferer_cnt += 2 * num_layers + 1 # _preprocess stage has 1. + if cooperative: + bufferer_cnt += 3 * num_layers + if cooperative: + # _preprocess stage and each sampling layer. + bufferer_cnt += 3 datapipe_graph = traverse_dps(dataloader) bufferers = find_dps( datapipe_graph, @@ -171,3 +194,5 @@ def test_gpu_sampling_DataLoader( if sampler_name == "LayerNeighborSampler": assert torch.equal(edge_feature, edge_feature_ref) assert len(list(dataloader)) == N // B + if thd.is_initialized(): + thd.destroy_process_group() From 864b023c47f9f43fb7c9219602ba1072278bc0e0 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 13 Sep 2024 00:46:41 -0400 Subject: [PATCH 04/10] [GraphBolt][CUDA] Add `CooperativeConv` and minor fixes. (#7797) --- python/dgl/graphbolt/impl/__init__.py | 1 + python/dgl/graphbolt/impl/cooperative_conv.py | 109 ++++++++++++++++++ python/dgl/graphbolt/impl/neighbor_sampler.py | 32 ++++- python/dgl/graphbolt/subgraph_sampler.py | 15 +++ .../pytorch/graphbolt/test_dataloader.py | 19 +++ 5 files changed, 173 insertions(+), 3 deletions(-) create mode 100644 python/dgl/graphbolt/impl/cooperative_conv.py diff --git a/python/dgl/graphbolt/impl/__init__.py b/python/dgl/graphbolt/impl/__init__.py index 19fef44e462c..f4e53327c3ae 100644 --- a/python/dgl/graphbolt/impl/__init__.py +++ b/python/dgl/graphbolt/impl/__init__.py @@ -15,3 +15,4 @@ from .gpu_graph_cache import * from .cpu_feature_cache import * from .cpu_cached_feature import * +from .cooperative_conv import * diff --git a/python/dgl/graphbolt/impl/cooperative_conv.py b/python/dgl/graphbolt/impl/cooperative_conv.py new file mode 100644 index 000000000000..28f11bc8b317 --- /dev/null +++ b/python/dgl/graphbolt/impl/cooperative_conv.py @@ -0,0 +1,109 @@ +"""Graphbolt cooperative convolution.""" +from typing import Dict, Union + +import torch + +from ..sampled_subgraph import SampledSubgraph +from ..subgraph_sampler import all_to_all, convert_to_hetero, revert_to_homo + +__all__ = ["CooperativeConvFunction", "CooperativeConv"] + + +class CooperativeConvFunction(torch.autograd.Function): + """Cooperative convolution operation from Cooperative Minibatching. + + Implements the `all-to-all` message passing algorithm + in Cooperative Minibatching, which was initially proposed in + `Deep Graph Library PR#4337`__ and + was later first fully described in + `Cooperative Minibatching in Graph Neural Networks + `__. + Cooperation between the GPUs eliminates duplicate work performed across the + GPUs due to the overlapping sampled k-hop neighborhoods of seed nodes when + performing GNN minibatching. 
This reduces the redundant computations across + GPUs at the expense of communication. + """ + + @staticmethod + def forward( + ctx, + subgraph: SampledSubgraph, + tensor: Union[torch.Tensor, Dict[str, torch.Tensor]], + ): + """Implements the forward pass.""" + counts_sent = convert_to_hetero(subgraph._counts_sent) + counts_received = convert_to_hetero(subgraph._counts_received) + seed_inverse_ids = convert_to_hetero(subgraph._seed_inverse_ids) + seed_sizes = convert_to_hetero(subgraph._seed_sizes) + ctx.save_for_backward( + counts_sent, counts_received, seed_inverse_ids, seed_sizes + ) + outs = {} + for ntype, typed_tensor in convert_to_hetero(tensor).items(): + out = typed_tensor.new_empty( + (sum(counts_sent[ntype]),) + typed_tensor.shape[1:] + ) + all_to_all( + torch.split(out, counts_sent[ntype]), + torch.split( + typed_tensor[seed_inverse_ids[ntype]], + counts_received[ntype], + ), + ) + outs[ntype] = out + return revert_to_homo(outs) + + @staticmethod + def backward( + ctx, grad_output: Union[torch.Tensor, Dict[str, torch.Tensor]] + ): + """Implements the backward pass.""" + ( + counts_sent, + counts_received, + seed_inverse_ids, + seed_sizes, + ) = ctx.saved_tensors + outs = {} + for ntype, typed_grad_output in convert_to_hetero(grad_output).items(): + out = typed_grad_output.new_empty( + (sum(counts_received[ntype]),) + typed_grad_output.shape[1:] + ) + all_to_all( + torch.split(out, counts_received[ntype]), + torch.split(typed_grad_output, counts_sent[ntype]), + ) + i = out.new_empty(2, out.shape[0], dtype=torch.int64) + i[0] = torch.arange( + out.shape[0], device=typed_grad_output.device + ) # src + i[1] = seed_inverse_ids[ntype] # dst + coo = torch.sparse_coo_tensor( + i, 1, size=(seed_sizes[ntype], i.shape[1]) + ) + outs[ntype] = torch.sparse.mm(coo, out) + return None, revert_to_homo(outs) + + +class CooperativeConv(torch.nn.Module): + """Cooperative convolution operation from Cooperative Minibatching. + + Implements the `all-to-all` message passing algorithm + in Cooperative Minibatching, which was initially proposed in + `Deep Graph Library PR#4337`__ and + was later first fully described in + `Cooperative Minibatching in Graph Neural Networks + `__. + Cooperation between the GPUs eliminates duplicate work performed across the + GPUs due to the overlapping sampled k-hop neighborhoods of seed nodes when + performing GNN minibatching. This reduces the redundant computations across + GPUs at the expense of communication.
+ """ + + def forward( + self, + subgraph: SampledSubgraph, + x: Union[torch.Tensor, Dict[str, torch.Tensor]], + ): + """Implements the forward pass.""" + return CooperativeConvFunction.apply(subgraph, x) diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 352dedc067f2..7ddba6d7ccac 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -601,17 +601,18 @@ def _seeds_cooperative_exchange_2(minibatch): typed_seeds.split(typed_counts_sent), ) seeds_received[ntype] = typed_seeds_received - subgraph._seeds_received = seeds_received + counts_sent[ntype] = typed_counts_sent + counts_received[ntype] = typed_counts_received + minibatch._seed_nodes = seeds_received subgraph._counts_sent = revert_to_homo(counts_sent) subgraph._counts_received = revert_to_homo(counts_received) return minibatch @staticmethod def _seeds_cooperative_exchange_3(minibatch): - subgraph = minibatch.sampled_subgraphs[0] nodes = { ntype: [typed_seeds] - for ntype, typed_seeds in subgraph._seeds_received.items() + for ntype, typed_seeds in minibatch._seed_nodes.items() } minibatch._unique_future = unique_and_compact( nodes, 0, 1, async_op=True @@ -627,6 +628,11 @@ def _seeds_cooperative_exchange_4(minibatch): } minibatch._seed_nodes = revert_to_homo(unique_seeds) subgraph = minibatch.sampled_subgraphs[0] + sizes = { + ntype: typed_seeds.size(0) + for ntype, typed_seeds in unique_seeds.items() + } + subgraph._seed_sizes = revert_to_homo(sizes) subgraph._seed_inverse_ids = revert_to_homo(inverse_seeds) return minibatch @@ -831,6 +837,16 @@ class NeighborSampler(NeighborSamplerImpl): gpu_cache_threshold : int, optional Determines how many times a vertex needs to be accessed before its neighborhood ends up being cached on the GPU. + cooperative: bool, optional + Boolean indicating whether Cooperative Minibatching, which was initially + proposed in + `Deep Graph Library PR#4337`__ + and was later first fully described in + `Cooperative Minibatching in Graph Neural Networks + `__. Cooperation between the GPUs + eliminates duplicate work performed across the GPUs due to the + overlapping sampled k-hop neighborhoods of seed nodes when performing + GNN minibatching. asynchronous: bool Boolean indicating whether sampling and compaction stages should run in background threads to hide the latency of CPU GPU synchronization. @@ -986,6 +1002,16 @@ class LayerNeighborSampler(NeighborSamplerImpl): gpu_cache_threshold : int, optional Determines how many times a vertex needs to be accessed before its neighborhood ends up being cached on the GPU. + cooperative: bool, optional + Boolean indicating whether Cooperative Minibatching, which was initially + proposed in + `Deep Graph Library PR#4337`__ + and was later first fully described in + `Cooperative Minibatching in Graph Neural Networks + `__. Cooperation between the GPUs + eliminates duplicate work performed across the GPUs due to the + overlapping sampled k-hop neighborhoods of seed nodes when performing + GNN minibatching. asynchronous: bool Boolean indicating whether sampling and compaction stages should run in background threads to hide the latency of CPU GPU synchronization. 
diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index 556950982fb7..dd5093ae5f69 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -16,6 +16,7 @@ __all__ = [ "SubgraphSampler", "all_to_all", + "convert_to_hetero", "revert_to_homo", ] @@ -89,6 +90,13 @@ def revert_to_homo(d: dict): return list(d.values())[0] if is_homogenous else d +def convert_to_hetero(item): + """Utility function to convert homogenous data to heterogenous with a single + node type.""" + is_heterogenous = isinstance(item, dict) + return item if is_heterogenous else {"_N": item} + + @functional_datapipe("sample_subgraph") class SubgraphSampler(MiniBatchTransformer): """A subgraph sampler used to sample a subgraph from a given set of nodes @@ -251,6 +259,8 @@ def _seeds_cooperative_exchange_2(minibatch, group=None): group, ) seeds_received[ntype] = typed_seeds_received + counts_sent[ntype] = typed_counts_sent + counts_received[ntype] = typed_counts_received minibatch._seed_nodes = seeds_received minibatch._counts_sent = revert_to_homo(counts_sent) minibatch._counts_received = revert_to_homo(counts_received) @@ -275,6 +285,11 @@ def _seeds_cooperative_exchange_4(minibatch): ntype: typed_inv[0] for ntype, typed_inv in inverse_seeds.items() } minibatch._seed_nodes = revert_to_homo(unique_seeds) + sizes = { + ntype: typed_seeds.size(0) + for ntype, typed_seeds in unique_seeds.items() + } + minibatch._seed_sizes = revert_to_homo(sizes) minibatch._seed_inverse_ids = revert_to_homo(inverse_seeds) return minibatch diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index b02c820dd60d..5d5d44fd1eb7 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -6,6 +6,7 @@ import dgl import dgl.graphbolt +import dgl.graphbolt as gb import pytest import torch import torch.distributed as thd @@ -194,5 +195,23 @@ def test_gpu_sampling_DataLoader( if sampler_name == "LayerNeighborSampler": assert torch.equal(edge_feature, edge_feature_ref) assert len(list(dataloader)) == N // B + + if asynchronous and cooperative: + for minibatch in minibatches: + x = torch.ones((minibatch.node_ids().size(0), 1), device=F.ctx()) + for subgraph in minibatch.sampled_subgraphs: + x = gb.CooperativeConvFunction.apply(subgraph, x) + x, edge_index, size = subgraph.to_pyg(x) + x = x[0] + one = torch.ones( + edge_index.shape[1], dtype=x.dtype, device=x.device + ) + coo = torch.sparse_coo_tensor( + edge_index.flipud(), one, size=(size[1], size[0]) + ) + x = torch.sparse.mm(coo, x) + assert x.shape[0] == minibatch.seeds.shape[0] + assert x.shape[1] == 1 + if thd.is_initialized(): thd.destroy_process_group() From 55c224a304a0f1683c320b3219f6d0308163528b Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sat, 14 Sep 2024 20:57:44 -0400 Subject: [PATCH 05/10] [GraphBolt][CUDA] Cooperative Minibatching - Feature Loading (#7798) --- python/dgl/graphbolt/feature_fetcher.py | 41 +++++++++++++++++-- python/dgl/graphbolt/impl/cooperative_conv.py | 2 +- .../pytorch/graphbolt/test_dataloader.py | 3 ++ 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/python/dgl/graphbolt/feature_fetcher.py b/python/dgl/graphbolt/feature_fetcher.py index 75dab4d54cd2..cf9d5f4104c2 100644 --- a/python/dgl/graphbolt/feature_fetcher.py +++ b/python/dgl/graphbolt/feature_fetcher.py @@ -8,6 +8,7 @@ from torch.utils.data import functional_datapipe from 
.base import etype_tuple_to_str +from .impl.cooperative_conv import CooperativeConvFunction from .minibatch_transformer import MiniBatchTransformer @@ -73,6 +74,16 @@ class FeatureFetcher(MiniBatchTransformer): If True, the feature fetcher will overlap the UVA feature fetcher operations with the rest of operations by using an alternative CUDA stream or utilizing asynchronous operations. Default is True. + cooperative: bool, optional + Boolean indicating whether Cooperative Minibatching, which was initially + proposed in + `Deep Graph Library PR#4337`__ + and was later first fully described in + `Cooperative Minibatching in Graph Neural Networks + `__. Cooperation between the GPUs + eliminates duplicate work performed across the GPUs due to the + overlapping sampled k-hop neighborhoods of seed nodes when performing + GNN minibatching. """ def __init__( @@ -82,6 +93,7 @@ def __init__( node_feature_keys=None, edge_feature_keys=None, overlap_fetch=True, + cooperative=False, ): datapipe = datapipe.mark_feature_fetcher_start() self.feature_store = feature_store @@ -113,9 +125,12 @@ def __init__( datapipe = datapipe.transform( partial(self._execute_stage, i) ).buffer(1) - super().__init__( - datapipe, self._identity if max_val == 0 else self._final_stage - ) + if max_val > 0: + datapipe = datapipe.transform(self._final_stage) + if cooperative: + datapipe = datapipe.transform(self._cooperative_exchange) + datapipe = datapipe.buffer() + super().__init__(datapipe) # A positive value indicates that the overlap optimization is enabled. self.max_num_stages = max_val @@ -145,6 +160,26 @@ def _final_stage(data): features[key] = value.wait() return data + def _cooperative_exchange(self, data): + subgraph = data.sampled_subgraphs[0] + is_heterogeneous = isinstance( + self.node_feature_keys, Dict + ) or isinstance(self.edge_feature_keys, Dict) + if is_heterogeneous: + node_features = {key: {} for key, _ in data.node_features.keys()} + for (key, ntype), feature in data.node_features.items(): + node_features[key][ntype] = feature + for key, feature in node_features.items(): + new_feature = CooperativeConvFunction.apply(subgraph, feature) + for ntype, tensor in new_feature.items(): + data.node_features[(key, ntype)] = tensor + else: + for key in data.node_features: + feature = data.node_features[key] + new_feature = CooperativeConvFunction.apply(subgraph, feature) + data.node_features[key] = new_feature + return data + def _read(self, data): """ Fill in the node/edge features field in data. 
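To make the feature exchange above easier to follow, here is a minimal sketch of the consuming side: a forward pass over PyG-style bipartite layers (such as SAGEConv) in which every layer after the first redistributes its input with CooperativeConvFunction, mirroring the dataloader test from the previous patch and the multi-GPU example added later in this series. The `layers` list, the `minibatch` produced with cooperative=True, and the input features `x` are assumed to come from the surrounding training code.

import dgl.graphbolt as gb
import torch.nn.functional as F


def cooperative_forward(layers, minibatch, x):
    # `x` holds node features that the feature fetcher above has already
    # redistributed, so the first layer needs no extra exchange.
    h = x
    subgraphs = minibatch.sampled_subgraphs
    for i, (layer, subgraph) in enumerate(zip(layers, subgraphs)):
        if i != 0:
            # Route the previous layer's output to the ranks that hold the
            # seeds of this layer's sampled subgraph.
            h = gb.CooperativeConvFunction.apply(subgraph, h)
        h, edge_index, size = subgraph.to_pyg(h)
        h = layer(h, edge_index, size=size)
        if i != len(subgraphs) - 1:
            h = F.relu(h)
    # Send the outputs back to the ranks that requested the seeds and restore
    # the minibatch's original seed ordering.
    h = gb.CooperativeConvFunction.apply(minibatch, h)
    return h[minibatch.compacted_seeds]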
diff --git a/python/dgl/graphbolt/impl/cooperative_conv.py b/python/dgl/graphbolt/impl/cooperative_conv.py index 28f11bc8b317..cb3d39d4d980 100644 --- a/python/dgl/graphbolt/impl/cooperative_conv.py +++ b/python/dgl/graphbolt/impl/cooperative_conv.py @@ -57,7 +57,7 @@ def forward( def backward( ctx, grad_output: Union[torch.Tensor, Dict[str, torch.Tensor]] ): - """Implements the forward pass.""" + """Implements the backward pass.""" ( counts_sent, counts_received, diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index 5d5d44fd1eb7..ee8f2b0cb9f5 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -146,6 +146,7 @@ def test_gpu_sampling_DataLoader( ["a", "b", "c"], ["d"], overlap_fetch=overlap_feature_fetch and i == 0, + cooperative=asynchronous and cooperative and i == 0, ) dataloaders.append(dgl.graphbolt.DataLoader(datapipe)) dataloader, dataloader2 = dataloaders @@ -159,6 +160,8 @@ def test_gpu_sampling_DataLoader( bufferer_cnt += 2 * num_layers + 1 # _preprocess stage has 1. if cooperative: bufferer_cnt += 3 * num_layers + if enable_feature_fetch: + bufferer_cnt += 1 # feature fetch has 1. if cooperative: # _preprocess stage and each sampling layer. bufferer_cnt += 3 From 75e410c354f050f1458d1d9b9e36d5804008f7e4 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 17 Sep 2024 16:15:19 -0400 Subject: [PATCH 06/10] [GraphBolt][CUDA] Fix Cooperative bug and additional tests. (#7800) --- .../src/cuda/extension/unique_and_compact_map.cu | 8 ++++++-- python/dgl/graphbolt/impl/cooperative_conv.py | 16 ++++++++++++---- .../impl/test_cooperative_minibatching_utils.py | 15 +++++++++++++-- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index a36c63925d7f..3db918ee7fdb 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -284,14 +284,18 @@ UniqueAndCompactBatchedHashMapBased( unique_ids_offsets_dev.data_ptr(); } at::cuda::CUDAEvent unique_ids_offsets_event; + unique_ids_offsets_event.record(); torch::optional index; if (part_ids) { + unique_ids_offsets_event.synchronize(); + const auto num_unique = + unique_ids_offsets.data_ptr()[num_batches]; + unique_ids = unique_ids.slice(0, 0, num_unique); + part_ids = part_ids->slice(0, 0, num_unique); std::tie( unique_ids, index, unique_ids_offsets, unique_ids_offsets_event) = cuda::RankSortImpl( unique_ids, *part_ids, unique_ids_offsets_dev, world_size); - } else { - unique_ids_offsets_event.record(); } auto mapped_ids = torch::empty(offsets_ptr[3 * num_batches], unique_ids.options()); diff --git a/python/dgl/graphbolt/impl/cooperative_conv.py b/python/dgl/graphbolt/impl/cooperative_conv.py index cb3d39d4d980..8040dabd286c 100644 --- a/python/dgl/graphbolt/impl/cooperative_conv.py +++ b/python/dgl/graphbolt/impl/cooperative_conv.py @@ -35,8 +35,11 @@ def forward( counts_received = convert_to_hetero(subgraph._counts_received) seed_inverse_ids = convert_to_hetero(subgraph._seed_inverse_ids) seed_sizes = convert_to_hetero(subgraph._seed_sizes) - ctx.save_for_backward( - counts_sent, counts_received, seed_inverse_ids, seed_sizes + ctx.communication_variables = ( + counts_sent, + counts_received, + seed_inverse_ids, + seed_sizes, ) outs = {} for ntype, typed_tensor in convert_to_hetero(tensor).items(): @@ -63,7 +66,8 
@@ def backward( counts_received, seed_inverse_ids, seed_sizes, - ) = ctx.saved_tensors + ) = ctx.communication_variables + delattr(ctx, "communication_variables") outs = {} for ntype, typed_grad_output in convert_to_hetero(grad_output).items(): out = typed_grad_output.new_empty( @@ -79,7 +83,11 @@ def backward( ) # src i[1] = seed_inverse_ids[ntype] # dst coo = torch.sparse_coo_tensor( - i, 1, size=(seed_sizes[ntype], i.shape[1]) + i, + torch.ones( + i.shape[1], dtype=grad_output.dtype, device=i.device + ), + size=(seed_sizes[ntype], i.shape[1]), ) outs[ntype] = torch.sparse.mm(coo, out) return None, revert_to_homo(outs) diff --git a/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py index f85676578bd5..1de8669b0e08 100644 --- a/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py +++ b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py @@ -19,9 +19,10 @@ @pytest.mark.parametrize("dtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("rank", list(range(WORLD_SIZE))) def test_gpu_cached_feature_read_async(dtype, rank): + torch.manual_seed(7) nodes_list1 = [ - torch.randint(0, 11111111, [777], dtype=dtype, device=F.ctx()) - for i in range(10) + torch.randint(0, 2111111111, [777], dtype=dtype, device=F.ctx()) + for _ in range(10) ] nodes_list2 = [nodes.sort()[0] for nodes in nodes_list1] @@ -57,3 +58,13 @@ def test_gpu_cached_feature_read_async(dtype, rank): assert_equal( idx1[off1[j] : off1[j + 1]], idx4[off4[i] : off4[i + 1]] ) + + unique, compacted, offsets = gb.unique_and_compact( + nodes_list1[:1], rank, WORLD_SIZE + ) + + nodes1, idx1, offsets1 = res1[0] + + assert_equal(unique, nodes1) + assert_equal(compacted[0], idx1) + assert_equal(offsets, offsets1) From d3453c3f14132db06443ff067649ff8f73339c17 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 18 Sep 2024 22:04:12 -0400 Subject: [PATCH 07/10] [GraphBolt][CUDA] Add Cooperative Minibatching example. (#7801) --- .../pyg/multigpu/node_classification.py | 485 ++++++++++++++++++ 1 file changed, 485 insertions(+) create mode 100644 examples/graphbolt/pyg/multigpu/node_classification.py diff --git a/examples/graphbolt/pyg/multigpu/node_classification.py b/examples/graphbolt/pyg/multigpu/node_classification.py new file mode 100644 index 000000000000..d2884b6a87f9 --- /dev/null +++ b/examples/graphbolt/pyg/multigpu/node_classification.py @@ -0,0 +1,485 @@ +""" +This script demonstrates node classification with GraphSAGE on large graphs, +merging GraphBolt (GB) and PyTorch Geometric (PyG). GraphBolt efficiently manages +data loading for large datasets, crucial for mini-batch processing. Post data +loading, PyG's user-friendly framework takes over for training, showcasing seamless +integration with GraphBolt. This combination offers an efficient alternative to +traditional Deep Graph Library (DGL) methods, highlighting adaptability and +scalability in handling large-scale graph data for diverse real-world applications. + + + +Key Features: +- Implements the GraphSAGE model, a scalable GNN, for node classification on large graphs. +- Utilizes GraphBolt, an efficient framework for large-scale graph data processing. +- Integrates with PyTorch Geometric for building and training the GraphSAGE model. +- The script is well-documented, providing clear explanations at each step. + +This flowchart describes the main functional sequence of the provided example. 
+main: + +main +│ +├───> Load and preprocess dataset (GraphBolt) +│ │ +│ └───> Utilize GraphBolt's BuiltinDataset for dataset handling +│ +├───> Instantiate the SAGE model (PyTorch Geometric) +│ │ +│ └───> Define the GraphSAGE model architecture +│ +├───> Train the model +│ │ +│ ├───> Mini-Batch Processing with GraphBolt +│ │ │ +│ │ └───> Efficient handling of mini-batches using GraphBolt's utilities +│ │ +│ └───> Training Loop +│ │ +│ ├───> Forward and backward passes +│ │ +│ └───> Parameters optimization +│ +└───> Evaluate the model + │ + └───> Performance assessment on validation and test datasets + │ + └───> Accuracy and other relevant metrics calculation + + +""" + +import argparse +import os +import time + +import dgl.graphbolt as gb +import torch + +# For torch.compile until https://github.com/pytorch/pytorch/issues/121197 is +# resolved. +import torch._inductor.codecache + +torch._dynamo.config.cache_size_limit = 32 + +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn.functional as F +from torch_geometric.nn import SAGEConv +from tqdm import tqdm + + +def accuracy(out, labels): + assert out.ndim == 2 + assert out.size(0) == labels.size(0) + assert labels.ndim == 1 or (labels.ndim == 2 and labels.size(1) == 1) + labels = labels.flatten() + predictions = torch.argmax(out, 1) + return (labels == predictions).sum(dtype=torch.float64) / labels.size(0) + + +class GraphSAGE(torch.nn.Module): + ##################################################################### + # (HIGHLIGHT) Define the GraphSAGE model architecture. + # + # - This class inherits from `torch.nn.Module`. + # - Two convolutional layers are created using the SAGEConv class from PyG. + # - 'in_size', 'hidden_size', 'out_size' are the sizes of + # the input, hidden, and output features, respectively. + # - The forward method defines the computation performed at every call. + ##################################################################### + def __init__(self, in_size, hidden_size, out_size, n_layers, cooperative): + super(GraphSAGE, self).__init__() + self.layers = torch.nn.ModuleList() + sizes = [in_size] + [hidden_size] * (n_layers - 1) + [out_size] + for i in range(n_layers): + self.layers.append(SAGEConv(sizes[i], sizes[i + 1])) + self.hidden_size = hidden_size + self.out_size = out_size + self.cooperative = cooperative + + def forward(self, minibatch, x): + subgraphs = minibatch.sampled_subgraphs + h = x + for i, (layer, subgraph) in enumerate(zip(self.layers, subgraphs)): + ##################################################################### + # (HIGHLIGHT) Convert given features to be consumed by a PyG layer. + # + # PyG layers have two modes, bipartite and normal. We slice the + # given features to get src and dst features to use the PyG layers + # in the more efficient bipartite mode. + ##################################################################### + if i != 0 and self.cooperative: + h = gb.CooperativeConvFunction.apply(subgraph, h) + h, edge_index, size = subgraph.to_pyg(h) + h = layer(h, edge_index, size=size) + if i != len(subgraphs) - 1: + h = F.relu(h) + if self.cooperative: + h = gb.CooperativeConvFunction.apply(minibatch, h) + h = h[minibatch.compacted_seeds] + return h + + +def create_dataloader( + args, graph, features, itemset, batch_size, fanout, device, job +): + ##################################################################### + # (HIGHLIGHT) Create a data loader for efficiently loading graph data. 
+ # + # - 'ItemSampler' samples mini-batches of node IDs from the dataset. + # - 'CopyTo' copies the fetched data to the specified device. + # - 'sample_neighbor' performs neighbor sampling on the graph. + # - 'FeatureFetcher' fetches node features based on the sampled subgraph. + + ##################################################################### + # Create a datapipe for mini-batch sampling with a specific neighbor fanout. + # Here, [10, 10, 10] specifies the number of neighbors sampled for each node at each layer. + # We're using `sample_neighbor` for consistency with DGL's sampling API. + # Note: GraphBolt offers additional sampling methods, such as `sample_layer_neighbor`, + # which could provide further optimization and efficiency for GNN training. + # Users are encouraged to explore these advanced features for potentially improved performance. + + # Initialize an ItemSampler to sample mini-batches from the dataset. + datapipe = gb.DistributedItemSampler( + itemset, + batch_size=batch_size, + shuffle=(job == "train"), + drop_last=(job == "train"), + drop_uneven_inputs=True, + ) + need_copy = True + # Copy the data to the specified device. + if args.graph_device != "cpu" and need_copy: + datapipe = datapipe.copy_to(device=device) + need_copy = False + # Sample neighbors for each node in the mini-batch. + datapipe = getattr(datapipe, args.sample_mode)( + graph, + fanout if job != "infer" else [-1], + overlap_fetch=args.overlap_graph_fetch, + num_gpu_cached_edges=args.num_gpu_cached_edges, + gpu_cache_threshold=args.gpu_graph_caching_threshold, + cooperative=args.cooperative, + asynchronous=args.graph_device != "cpu", + ) + # Copy the data to the specified device. + if args.feature_device != "cpu" and need_copy: + datapipe = datapipe.copy_to(device=device) + need_copy = False + # Fetch node features for the sampled subgraph. + datapipe = datapipe.fetch_feature( + features, + node_feature_keys=["feat"], + overlap_fetch=args.overlap_feature_fetch, + cooperative=args.cooperative, + ) + # Copy the data to the specified device. + if need_copy: + datapipe = datapipe.copy_to(device=device) + # Create and return a DataLoader to handle data loading. + return gb.DataLoader(datapipe, num_workers=args.num_workers) + + +def weighted_reduce(tensor, weight, dst=0): + ######################################################################## + # (HIGHLIGHT) Collect accuracy and loss values from sub-processes and + # obtain overall average values. + # + # `torch.distributed.reduce` is used to reduce tensors from all the + # sub-processes to a specified process, ReduceOp.SUM is used by default. + # + # Because the GPUs may have differing numbers of processed items, we + # perform a weighted mean to calculate the exact loss and accuracy. 
+ ######################################################################## + dist.reduce(tensor=tensor, dst=dst) + weight = torch.tensor(weight, device=tensor.device) + dist.reduce(tensor=weight, dst=dst) + return tensor / weight + + +@torch.compile +def train_step(minibatch, optimizer, model, loss_fn, cooperative): + node_features = minibatch.node_features["feat"] + labels = minibatch.labels + optimizer.zero_grad() + out = model(minibatch, node_features) + loss = loss_fn(out, labels) + num_correct = accuracy(out, labels) * labels.size(0) + loss.backward() + optimizer.step() + return loss.detach(), num_correct, labels.size(0) + + +def train_helper( + rank, dataloader, model, optimizer, loss_fn, device, cooperative +): + model.train() # Set the model to training mode + total_loss = torch.zeros(1, device=device) # Accumulator for the total loss + # Accumulator for the total number of correct predictions + total_correct = torch.zeros(1, dtype=torch.float64, device=device) + total_samples = 0 # Accumulator for the total number of samples processed + num_batches = 0 # Counter for the number of mini-batches processed + start = time.time() + for minibatch in tqdm(dataloader, "Training") if rank == 0 else dataloader: + loss, num_correct, num_samples = train_step( + minibatch, optimizer, model, loss_fn, cooperative + ) + total_loss += loss + total_correct += num_correct + total_samples += num_samples + num_batches += 1 + train_loss = weighted_reduce(total_loss, num_batches) + train_acc = weighted_reduce(total_correct, total_samples) + end = time.time() + return train_loss, train_acc, end - start + + +def train(args, rank, train_dataloader, valid_dataloader, model, device): + ##################################################################### + # (HIGHLIGHT) Train the model for one epoch. + # + # - Iterates over the data loader, fetching mini-batches of graph data. + # - For each mini-batch, it performs a forward pass, computes loss, and + # updates the model parameters. + # - The function returns the average loss and accuracy for the epoch. + # + # Parameters: + # model: The GraphSAGE model. + # dataloader: DataLoader that provides mini-batches of graph data. + # optimizer: Optimizer used for updating model parameters. + # loss_fn: Loss function used for training. + # device: The device (CPU/GPU) to run the training on. + ##################################################################### + + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + loss_fn = torch.nn.CrossEntropyLoss() + + for epoch in range(args.epochs): + train_loss, train_acc, duration = train_helper( + rank, + train_dataloader, + model, + optimizer, + loss_fn, + device, + args.cooperative, + ) + val_acc = evaluate(rank, model, valid_dataloader, device) + if rank == 0: + print( + f"Epoch {epoch:02d}, Loss: {train_loss.item():.4f}, " + f"Approx. Train: {train_acc.item():.4f}, " + f"Approx. 
Val: {val_acc.item():.4f}, " + f"Time: {duration}s" + ) + + +@torch.compile +def evaluate_step(minibatch, model): + node_features = minibatch.node_features["feat"] + labels = minibatch.labels + out = model(minibatch, node_features) + num_correct = accuracy(out, labels) * labels.size(0) + return num_correct, labels.size(0) + + +@torch.no_grad() +def evaluate(rank, model, dataloader, device): + model.eval() + total_correct = torch.zeros(1, dtype=torch.float64, device=device) + total_samples = 0 + for minibatch in ( + tqdm(dataloader, "Evaluating") if rank == 0 else dataloader + ): + num_correct, num_samples = evaluate_step(minibatch, model) + total_correct += num_correct + total_samples += num_samples + + return weighted_reduce(total_correct, total_samples) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Which dataset are you going to use?" + ) + parser.add_argument( + "--epochs", type=int, default=10, help="Number of training epochs." + ) + parser.add_argument( + "--lr", + type=float, + default=0.003, + help="Learning rate for optimization.", + ) + parser.add_argument( + "--batch-size", type=int, default=1024, help="Batch size for training." + ) + parser.add_argument( + "--num-workers", + type=int, + default=0, + help="Number of workers for data loading.", + ) + parser.add_argument( + "--dataset", + type=str, + default="ogbn-products", + choices=[ + "ogbn-arxiv", + "ogbn-products", + "ogbn-papers100M", + "igb-hom-tiny", + "igb-hom-small", + "igb-hom-medium", + "igb-hom-large", + "igb-hom", + ], + help="The dataset we can use for node classification example. Currently" + " ogbn-products, ogbn-arxiv, ogbn-papers100M and" + " igb-hom-[tiny|small|medium|large] and igb-hom datasets are supported.", + ) + parser.add_argument( + "--fanout", + type=str, + default="10,10,10", + help="Fan-out of neighbor sampling. It is IMPORTANT to keep len(fanout)" + " identical with the number of layers in your model. Default: 10,10,10", + ) + parser.add_argument( + "--mode", + default="pinned-pinned-cuda", + choices=[ + "pinned-pinned-cuda", + "cuda-pinned-cuda", + "cuda-cuda-cuda", + ], + help="Graph storage - feature storage - Train device: 'cpu' for CPU and RAM," + " 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", + ) + parser.add_argument( + "--gpu-cache-size", + type=int, + default=0, + help="The capacity of the GPU cache in bytes.", + ) + parser.add_argument( + "--sample-mode", + default="sample_neighbor", + choices=["sample_neighbor", "sample_layer_neighbor"], + help="The sampling function when doing layerwise sampling.", + ) + parser.add_argument( + "--num-gpu-cached-edges", + type=int, + default=0, + help="The number of edges to be cached from the graph on the GPU.", + ) + parser.add_argument( + "--gpu-graph-caching-threshold", + type=int, + default=1, + help="The number of accesses after which a vertex neighborhood will be cached.", + ) + parser.add_argument("--precision", type=str, default="high") + parser.add_argument( + "--cooperative", + action="store_true", + help="Enables Cooperative Minibatching from arXiv:2310.12403.", + ) + return parser.parse_args() + + +def run(rank, world_size, args, dataset): + # Set up multiprocessing environment. 
+ torch.cuda.set_device(rank) + dist.init_process_group( + init_method="tcp://127.0.0.1:12345", + rank=rank, + world_size=world_size, + ) + + print(f"Training in {args.mode} mode.") + args.graph_device, args.feature_device, args.device = args.mode.split("-") + args.overlap_feature_fetch = args.feature_device == "pinned" + args.overlap_graph_fetch = args.graph_device == "pinned" + + # Move the dataset to the selected storage. + graph = ( + dataset.graph.pin_memory_() + if args.graph_device == "pinned" + else dataset.graph.to(args.graph_device) + ) + features = ( + dataset.feature.pin_memory_() + if args.feature_device == "pinned" + else dataset.feature.to(args.feature_device) + ) + + train_set = dataset.tasks[0].train_set + valid_set = dataset.tasks[0].validation_set + args.fanout = list(map(int, args.fanout.split(","))) + + num_classes = dataset.tasks[0].metadata["num_classes"] + + if args.gpu_cache_size > 0 and args.feature_device != "cuda": + features._features[("node", None, "feat")] = gb.gpu_cached_feature( + features._features[("node", None, "feat")], + args.gpu_cache_size, + ) + + train_dataloader, valid_dataloader = ( + create_dataloader( + args, + graph=graph, + features=features, + itemset=itemset, + batch_size=args.batch_size, + fanout=args.fanout, + device=args.device, + job=job, + ) + for itemset, job in zip([train_set, valid_set], ["train", "evaluate"]) + ) + + in_channels = features.size("node", None, "feat")[0] + hidden_channels = 256 + model = GraphSAGE( + in_channels, + hidden_channels, + num_classes, + len(args.fanout), + args.cooperative, + ).to(args.device) + assert len(args.fanout) == len(model.layers) + model = torch.nn.parallel.DistributedDataParallel(model) + + train(args, rank, train_dataloader, valid_dataloader, model, args.device) + + dist.destroy_process_group() + + +if __name__ == "__main__": + args = parse_args() + if not torch.cuda.is_available(): + print("Multi-GPU training requires GPUs.") + exit(0) + + torch.set_float32_matmul_precision(args.precision) + + # Load and preprocess dataset. + print("Loading data...") + dataset = gb.BuiltinDataset(args.dataset).load() + + world_size = torch.cuda.device_count() + + # Thread limiting to avoid resource competition. 
+ os.environ["OMP_NUM_THREADS"] = str(mp.cpu_count() // 2 // world_size) + + mp.set_sharing_strategy("file_system") + mp.spawn( + run, + args=(world_size, args, dataset), + nprocs=world_size, + join=True, + ) From 3bc8e228fc87bb371d42cf97e8e8eb0159c5f8ae Mon Sep 17 00:00:00 2001 From: Wenxuan Cao <90617523+CfromBU@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:05:11 +0800 Subject: [PATCH 08/10] [DistGB] enable dist partition pipeline to save FusedCSCSamplingGraph partition directly (#7728) Co-authored-by: Ubuntu Co-authored-by: Ubuntu Co-authored-by: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> --- python/dgl/distributed/partition.py | 52 +- tests/tools/test_dist_partition_graphbolt.py | 1023 ++++++++++++++++++ tools/dispatch_data.py | 28 + tools/distpartitioning/convert_partition.py | 409 ++++++- tools/distpartitioning/data_proc_pipeline.py | 25 +- tools/distpartitioning/data_shuffle.py | 27 +- tools/distpartitioning/utils.py | 44 +- 7 files changed, 1520 insertions(+), 88 deletions(-) create mode 100644 tests/tools/test_dist_partition_graphbolt.py diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index 079ed8806a96..48005ffb4d27 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -1600,8 +1600,6 @@ def _save_graph_gb(part_config, part_id, csc_graph): def cast_various_to_minimum_dtype_gb( - graph, - part_meta, num_parts, indptr, indices, @@ -1610,25 +1608,43 @@ def cast_various_to_minimum_dtype_gb( ntypes, node_attributes, edge_attributes, + part_meta=None, + graph=None, + edge_count=None, + node_count=None, + tot_edge_count=None, + tot_node_count=None, ): """Cast various data to minimum dtype.""" + if graph is not None: + assert part_meta is not None + tot_edge_count = graph.num_edges() + tot_node_count = graph.num_nodes() + node_count = part_meta["num_nodes"] + edge_count = part_meta["num_edges"] + else: + assert tot_edge_count is not None + assert tot_node_count is not None + assert edge_count is not None + assert node_count is not None + # Cast 1: indptr. - indptr = _cast_to_minimum_dtype(graph.num_edges(), indptr) + indptr = _cast_to_minimum_dtype(tot_edge_count, indptr) # Cast 2: indices. - indices = _cast_to_minimum_dtype(graph.num_nodes(), indices) + indices = _cast_to_minimum_dtype(tot_node_count, indices) # Cast 3: type_per_edge. type_per_edge = _cast_to_minimum_dtype( len(etypes), type_per_edge, field=ETYPE ) # Cast 4: node/edge_attributes. 
predicates = { - NID: part_meta["num_nodes"], + NID: node_count, "part_id": num_parts, NTYPE: len(ntypes), - EID: part_meta["num_edges"], + EID: edge_count, ETYPE: len(etypes), - DGL2GB_EID: part_meta["num_edges"], - GB_DST_ID: part_meta["num_nodes"], + DGL2GB_EID: edge_count, + GB_DST_ID: node_count, } for attributes in [node_attributes, edge_attributes]: for key in attributes: @@ -1779,16 +1795,16 @@ def gb_convert_single_dgl_partition( ) indptr, indices, type_per_edge = cast_various_to_minimum_dtype_gb( - graph, - part_meta, - num_parts, - indptr, - indices, - type_per_edge, - etypes, - ntypes, - node_attributes, - edge_attributes, + graph=graph, + part_meta=part_meta, + num_parts=num_parts, + indptr=indptr, + indices=indices, + type_per_edge=type_per_edge, + etypes=etypes, + ntypes=ntypes, + node_attributes=node_attributes, + edge_attributes=edge_attributes, ) csc_graph = gb.fused_csc_sampling_graph( diff --git a/tests/tools/test_dist_partition_graphbolt.py b/tests/tools/test_dist_partition_graphbolt.py new file mode 100644 index 000000000000..81c16f8809c3 --- /dev/null +++ b/tests/tools/test_dist_partition_graphbolt.py @@ -0,0 +1,1023 @@ +import json +import os +import tempfile + +import dgl +import dgl.backend as F +import dgl.graphbolt as gb + +import numpy as np +import pyarrow.parquet as pq +import pytest +import torch +from dgl.data.utils import load_graphs, load_tensors +from dgl.distributed.partition import ( + _etype_str_to_tuple, + _etype_tuple_to_str, + _get_inner_edge_mask, + _get_inner_node_mask, + load_partition, + RESERVED_FIELD_DTYPE, +) + +from distpartitioning import array_readwriter +from distpartitioning.utils import generate_read_list +from pytest_utils import create_chunked_dataset + + +def _verify_metadata_gb(gpb, g, num_parts, part_id, part_sizes): + """ + check list: + make sure the number of nodes and edges is correct. + make sure the number of parts is correct. + make sure the number of nodes and edges in each part is corrcet. + """ + assert gpb._num_nodes() == g.num_nodes() + assert gpb._num_edges() == g.num_edges() + + assert gpb.num_partitions() == num_parts + gpb_meta = gpb.metadata() + assert len(gpb_meta) == num_parts + assert len(gpb.partid2nids(part_id)) == gpb_meta[part_id]["num_nodes"] + assert len(gpb.partid2eids(part_id)) == gpb_meta[part_id]["num_edges"] + part_sizes.append( + (gpb_meta[part_id]["num_nodes"], gpb_meta[part_id]["num_edges"]) + ) + + +def _verify_local_id_gb(part_g, part_id, gpb): + """ + check list: + make sure the type of local id is correct. + make sure local id have a right order. + """ + nid = F.boolean_mask( + part_g.node_attributes[dgl.NID], + part_g.node_attributes["inner_node"], + ) + local_nid = gpb.nid2localnid(nid, part_id) + assert F.dtype(local_nid) in (F.int64, F.int32) + assert np.all(F.asnumpy(local_nid) == np.arange(0, len(local_nid))) + eid = F.boolean_mask( + part_g.edge_attributes[dgl.EID], + part_g.edge_attributes["inner_edge"], + ) + local_eid = gpb.eid2localeid(eid, part_id) + assert F.dtype(local_eid) in (F.int64, F.int32) + assert np.all(np.sort(F.asnumpy(local_eid)) == np.arange(0, len(local_eid))) + return local_nid, local_eid + + +def _verify_map_gb( + part_g, + part_id, + gpb, +): + """ + check list: + make sure the map node and its data type is correct. + """ + # Check the node map. 
+ local_nodes = F.boolean_mask( + part_g.node_attributes[dgl.NID], + part_g.node_attributes["inner_node"], + ) + inner_node_index = F.nonzero_1d(part_g.node_attributes["inner_node"]) + mapping_nodes = gpb.partid2nids(part_id) + assert F.dtype(mapping_nodes) in (F.int32, F.int64) + assert np.all( + np.sort(F.asnumpy(local_nodes)) == np.sort(F.asnumpy(mapping_nodes)) + ) + assert np.all( + F.asnumpy(inner_node_index) == np.arange(len(inner_node_index)) + ) + + # Check the edge map. + + local_edges = F.boolean_mask( + part_g.edge_attributes[dgl.EID], + part_g.edge_attributes["inner_edge"], + ) + inner_edge_index = F.nonzero_1d(part_g.edge_attributes["inner_edge"]) + mapping_edges = gpb.partid2eids(part_id) + assert F.dtype(mapping_edges) in (F.int32, F.int64) + assert np.all( + np.sort(F.asnumpy(local_edges)) == np.sort(F.asnumpy(mapping_edges)) + ) + assert np.all( + F.asnumpy(inner_edge_index) == np.arange(len(inner_edge_index)) + ) + return local_nodes, local_edges + + +def _verify_local_and_map_id_gb( + part_g, + part_id, + gpb, + store_inner_node, + store_inner_edge, + store_eids, +): + """ + check list: + make sure local id are correct. + make sure mapping id are correct. + """ + if store_inner_node and store_inner_edge and store_eids: + _verify_local_id_gb(part_g, part_id, gpb) + _verify_map_gb(part_g, part_id, gpb) + + +def _get_part_IDs(part_g): + # These are partition-local IDs. + num_columns = part_g.csc_indptr.diff() + part_src_ids = part_g.indices + part_dst_ids = torch.arange(part_g.total_num_nodes).repeat_interleave( + num_columns + ) + # These are reshuffled global homogeneous IDs. + part_src_ids = F.gather_row(part_g.node_attributes[dgl.NID], part_src_ids) + part_dst_ids = F.gather_row(part_g.node_attributes[dgl.NID], part_dst_ids) + return part_src_ids, part_dst_ids + + +def _verify_node_type_ID_gb(part_g, gpb): + """ + check list: + make sure ntype id have correct data type + """ + part_src_ids, part_dst_ids = _get_part_IDs(part_g) + # These are reshuffled per-type IDs. + src_ntype_ids, part_src_ids = gpb.map_to_per_ntype(part_src_ids) + dst_ntype_ids, part_dst_ids = gpb.map_to_per_ntype(part_dst_ids) + # `IdMap` is in int64 by default. 
+ assert src_ntype_ids.dtype == F.int64 + assert dst_ntype_ids.dtype == F.int64 + + with pytest.raises(dgl.utils.internal.InconsistentDtypeException): + gpb.map_to_per_ntype(F.tensor([0], F.int32)) + with pytest.raises(dgl.utils.internal.InconsistentDtypeException): + gpb.map_to_per_etype(F.tensor([0], F.int32)) + return ( + part_src_ids, + part_dst_ids, + src_ntype_ids, + part_src_ids, + dst_ntype_ids, + ) + + +def _verify_orig_edge_IDs_gb( + g, + orig_nids, + orig_eids, + part_eids, + part_src_ids, + part_dst_ids, + src_ntype=None, + dst_ntype=None, + etype=None, +): + """ + check list: + make sure orig edge id are correct after + """ + if src_ntype is not None and dst_ntype is not None: + orig_src_nid = orig_nids[src_ntype] + orig_dst_nid = orig_nids[dst_ntype] + else: + orig_src_nid = orig_nids + orig_dst_nid = orig_nids + orig_src_ids = F.gather_row(orig_src_nid, part_src_ids) + orig_dst_ids = F.gather_row(orig_dst_nid, part_dst_ids) + if etype is not None: + orig_eids = orig_eids[etype] + orig_eids1 = F.gather_row(orig_eids, part_eids) + orig_eids2 = g.edge_ids(orig_src_ids, orig_dst_ids, etype=etype) + assert len(orig_eids1) == len(orig_eids2) + assert np.all(F.asnumpy(orig_eids1) == F.asnumpy(orig_eids2)) + + +def _verify_orig_IDs_gb( + part_g, + gpb, + g, + is_homo=False, + part_src_ids=None, + part_dst_ids=None, + src_ntype_ids=None, + dst_ntype_ids=None, + orig_nids=None, + orig_eids=None, +): + """ + check list: + make sure orig edge id are correct. + make sure hetero ntype id are correct. + """ + part_eids = part_g.edge_attributes[dgl.EID] + if is_homo: + _verify_orig_edge_IDs_gb( + g, orig_nids, orig_eids, part_eids, part_src_ids, part_dst_ids + ) + local_orig_nids = orig_nids[part_g.node_attributes[dgl.NID]] + local_orig_eids = orig_eids[part_g.edge_attributes[dgl.EID]] + part_g.node_attributes["feats"] = F.gather_row( + g.ndata["feats"], local_orig_nids + ) + part_g.edge_attributes["feats"] = F.gather_row( + g.edata["feats"], local_orig_eids + ) + else: + etype_ids, part_eids = gpb.map_to_per_etype(part_eids) + # `IdMap` is in int64 by default. + assert etype_ids.dtype == F.int64 + + # These are original per-type IDs. + for etype_id, etype in enumerate(g.canonical_etypes): + part_src_ids1 = F.boolean_mask(part_src_ids, etype_ids == etype_id) + src_ntype_ids1 = F.boolean_mask( + src_ntype_ids, etype_ids == etype_id + ) + part_dst_ids1 = F.boolean_mask(part_dst_ids, etype_ids == etype_id) + dst_ntype_ids1 = F.boolean_mask( + dst_ntype_ids, etype_ids == etype_id + ) + part_eids1 = F.boolean_mask(part_eids, etype_ids == etype_id) + assert np.all(F.asnumpy(src_ntype_ids1 == src_ntype_ids1[0])) + assert np.all(F.asnumpy(dst_ntype_ids1 == dst_ntype_ids1[0])) + src_ntype = g.ntypes[F.as_scalar(src_ntype_ids1[0])] + dst_ntype = g.ntypes[F.as_scalar(dst_ntype_ids1[0])] + + _verify_orig_edge_IDs_gb( + g, + orig_nids, + orig_eids, + part_eids1, + part_src_ids1, + part_dst_ids1, + src_ntype, + dst_ntype, + etype, + ) + + +def _verify_constructed_id_gb(part_sizes, gpb): + """ + verify the part id of each node by constructed nids. 
+ check list: + make sure each node' part id and its type are corect + """ + node_map = [] + edge_map = [] + for part_i, (num_nodes, num_edges) in enumerate(part_sizes): + node_map.append(np.ones(num_nodes) * part_i) + edge_map.append(np.ones(num_edges) * part_i) + node_map = np.concatenate(node_map) + edge_map = np.concatenate(edge_map) + nid2pid = gpb.nid2partid(F.arange(0, len(node_map))) + assert F.dtype(nid2pid) in (F.int32, F.int64) + assert np.all(F.asnumpy(nid2pid) == node_map) + eid2pid = gpb.eid2partid(F.arange(0, len(edge_map))) + assert F.dtype(eid2pid) in (F.int32, F.int64) + assert np.all(F.asnumpy(eid2pid) == edge_map) + + +def _verify_IDs_gb( + g, + part_g, + part_id, + gpb, + part_sizes, + orig_nids, + orig_eids, + store_inner_node, + store_inner_edge, + store_eids, + is_homo, +): + # verify local id and mapping id + _verify_local_and_map_id_gb( + part_g, + part_id, + gpb, + store_inner_node, + store_inner_edge, + store_eids, + ) + + # Verify the mapping between the reshuffled IDs and the original IDs. + ( + part_src_ids, + part_dst_ids, + src_ntype_ids, + part_src_ids, + dst_ntype_ids, + ) = _verify_node_type_ID_gb(part_g, gpb) + + if store_eids: + _verify_orig_IDs_gb( + part_g, + gpb, + g, + part_src_ids=part_src_ids, + part_dst_ids=part_dst_ids, + src_ntype_ids=src_ntype_ids, + dst_ntype_ids=dst_ntype_ids, + orig_nids=orig_nids, + orig_eids=orig_eids, + is_homo=is_homo, + ) + _verify_constructed_id_gb(part_sizes, gpb) + + +def _collect_data_gb( + parts, + part_g, + gpbs, + gpb, + tot_node_feats, + node_feats, + tot_edge_feats, + edge_feats, + shuffled_labels, + shuffled_edata, + test_ntype, + test_etype, +): + if test_ntype != None: + shuffled_labels.append(node_feats[test_ntype + "/label"]) + shuffled_edata.append( + edge_feats[_etype_tuple_to_str(test_etype) + "/count"] + ) + else: + shuffled_labels.append(node_feats["_N/labels"]) + shuffled_edata.append(edge_feats["_N:_E:_N/feats"]) + parts.append(part_g) + gpbs.append(gpb) + tot_node_feats.append(node_feats) + tot_edge_feats.append(edge_feats) + + +def _verify_node_feats(g, part, gpb, orig_nids, node_feats, is_homo=False): + for ntype in g.ntypes: + ndata = ( + part.node_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.ndata + ) + ntype_id = g.get_ntype_id(ntype) + inner_node_mask = _get_inner_node_mask( + part, + ntype_id, + (gpb if isinstance(part, gb.FusedCSCSamplingGraph) else None), + ) + inner_nids = F.boolean_mask(ndata[dgl.NID], inner_node_mask) + ntype_ids, inner_type_nids = gpb.map_to_per_ntype(inner_nids) + partid = gpb.nid2partid(inner_type_nids, ntype) + if is_homo: + assert np.all(F.asnumpy(ntype_ids) == ntype_id) + assert np.all(F.asnumpy(partid) == gpb.partid) + + if is_homo: + orig_id = orig_nids[inner_type_nids] + else: + orig_id = orig_nids[ntype][inner_type_nids] + local_nids = gpb.nid2localnid(inner_type_nids, gpb.partid, ntype) + + for name in g.nodes[ntype].data: + if name in [dgl.NID, "inner_node"]: + continue + true_feats = F.gather_row(g.nodes[ntype].data[name], orig_id) + ndata = F.gather_row(node_feats[ntype + "/" + name], local_nids) + assert np.all(F.asnumpy(ndata == true_feats)) + + +def _verify_edge_feats(g, part, gpb, orig_eids, edge_feats, is_homo=False): + for etype in g.canonical_etypes: + edata = ( + part.edge_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.edata + ) + etype_id = g.get_etype_id(etype) + inner_edge_mask = _get_inner_edge_mask(part, etype_id) + inner_eids = F.boolean_mask(edata[dgl.EID], inner_edge_mask) + etype_ids, 
inner_type_eids = gpb.map_to_per_etype(inner_eids) + partid = gpb.eid2partid(inner_type_eids, etype) + assert np.all(F.asnumpy(etype_ids) == etype_id) + assert np.all(F.asnumpy(partid) == gpb.partid) + + if is_homo: + orig_id = orig_eids[inner_type_eids] + else: + orig_id = orig_eids[etype][inner_type_eids] + local_eids = gpb.eid2localeid(inner_type_eids, gpb.partid, etype) + + for name in g.edges[etype].data: + if name in [dgl.EID, "inner_edge"]: + continue + true_feats = F.gather_row(g.edges[etype].data[name], orig_id) + edata = F.gather_row( + edge_feats[_etype_tuple_to_str(etype) + "/" + name], + local_eids, + ) + assert np.all(F.asnumpy(edata == true_feats)) + + +def _verify_shuffled_labels_gb( + g, + shuffled_labels, + shuffled_edata, + orig_nids, + orig_eids, + test_ntype=None, + test_etype=None, +): + """ + check list: + make sure node data are correct. + make sure edge data are correct. + """ + shuffled_labels = F.asnumpy(F.cat(shuffled_labels, 0)) + shuffled_edata = F.asnumpy(F.cat(shuffled_edata, 0)) + orig_labels = np.zeros(shuffled_labels.shape, dtype=shuffled_labels.dtype) + orig_edata = np.zeros(shuffled_edata.shape, dtype=shuffled_edata.dtype) + + orig_nid = orig_nids if test_ntype is None else orig_nids[test_ntype] + orig_eid = orig_eids if test_etype is None else orig_eids[test_etype] + nlabel = ( + g.ndata["labels"] + if test_ntype is None + else g.nodes[test_ntype].data["label"] + ) + edata = ( + g.edata["feats"] + if test_etype is None + else g.edges[test_etype].data["count"] + ) + + orig_labels[F.asnumpy(orig_nid)] = shuffled_labels + orig_edata[F.asnumpy(orig_eid)] = shuffled_edata + assert np.all(orig_labels == F.asnumpy(nlabel)) + assert np.all(orig_edata == F.asnumpy(edata)) + + +def verify_graph_feats_gb( + g, + gpbs, + parts, + tot_node_feats, + tot_edge_feats, + orig_nids, + orig_eids, + shuffled_labels, + shuffled_edata, + test_ntype, + test_etype, + store_inner_node=False, + store_inner_edge=False, + store_eids=False, + is_homo=False, +): + """ + check list: + make sure the feats of nodes and edges are correct + """ + for part_id in range(len(parts)): + part = parts[part_id] + gpb = gpbs[part_id] + node_feats = tot_node_feats[part_id] + edge_feats = tot_edge_feats[part_id] + if store_inner_node: + _verify_node_feats( + g, + part, + gpb, + orig_nids, + node_feats, + is_homo=is_homo, + ) + if store_inner_edge and store_eids: + _verify_edge_feats( + g, + part, + gpb, + orig_eids, + edge_feats, + is_homo=is_homo, + ) + + _verify_shuffled_labels_gb( + g, + shuffled_labels, + shuffled_edata, + orig_nids, + orig_eids, + test_ntype, + test_etype, + ) + + +def _verify_graphbolt_attributes( + parts, store_inner_node, store_inner_edge, store_eids +): + """ + check list: + make sure arguments work. 
+ """ + for part in parts: + assert store_inner_edge == ("inner_edge" in part.edge_attributes) + assert store_inner_node == ("inner_node" in part.node_attributes) + assert store_eids == (dgl.EID in part.edge_attributes) + + +def _verify_graphbolt_part( + g, + test_dir, + orig_nids, + orig_eids, + graph_name, + num_parts, + store_inner_node, + store_inner_edge, + store_eids, + part_config=None, + test_ntype=None, + test_etype=None, + is_homo=False, +): + """ + check list: + _verify_metadata_gb: + data type, ID's order and ID's number of edges and nodes + _verify_IDs_gb: + local id, mapping id,node type id, orig edge, hetero ntype id + verify_graph_feats_gb: + nodes and edges' feats + _verify_graphbolt_attributes: + arguments + """ + parts = [] + tot_node_feats = [] + tot_edge_feats = [] + shuffled_labels = [] + shuffled_edata = [] + part_sizes = [] + gpbs = [] + if part_config is None: + part_config = os.path.join(test_dir, f"{graph_name}.json") + # test each part + for part_id in range(num_parts): + part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( + part_config, part_id, load_feats=True, use_graphbolt=True + ) + # verify metadata + _verify_metadata_gb( + gpb, + g, + num_parts, + part_id, + part_sizes, + ) + + # verify eid and nid + _verify_IDs_gb( + g, + part_g, + part_id, + gpb, + part_sizes, + orig_nids, + orig_eids, + store_inner_node, + store_inner_edge, + store_eids, + is_homo, + ) + + # collect shuffled data and parts + _collect_data_gb( + parts, + part_g, + gpbs, + gpb, + tot_node_feats, + node_feats, + tot_edge_feats, + edge_feats, + shuffled_labels, + shuffled_edata, + test_ntype, + test_etype, + ) + + # verify graph feats + verify_graph_feats_gb( + g, + gpbs, + parts, + tot_node_feats, + tot_edge_feats, + orig_nids, + orig_eids, + shuffled_labels=shuffled_labels, + shuffled_edata=shuffled_edata, + test_ntype=test_ntype, + test_etype=test_etype, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, + store_eids=store_eids, + is_homo=is_homo, + ) + + _verify_graphbolt_attributes( + parts, store_inner_node, store_inner_edge, store_eids + ) + + return parts + + +def _verify_hetero_graph_node_edge_num( + g, + parts, + store_inner_edge, + debug_mode, +): + """ + check list: + make sure edge type are correct. + make sure the number of nodes in each node type are correct. + make sure the number of nodes in each node type are correct. + """ + num_nodes = {ntype: 0 for ntype in g.ntypes} + num_edges = {etype: 0 for etype in g.canonical_etypes} + for part in parts: + edata = ( + part.edge_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.edata + ) + if dgl.ETYPE in edata: + assert len(g.canonical_etypes) == len(F.unique(edata[dgl.ETYPE])) + if debug_mode or isinstance(part, dgl.DGLGraph): + for ntype in g.ntypes: + ntype_id = g.get_ntype_id(ntype) + inner_node_mask = _get_inner_node_mask(part, ntype_id) + num_inner_nodes = F.sum(F.astype(inner_node_mask, F.int64), 0) + num_nodes[ntype] += num_inner_nodes + if store_inner_edge or isinstance(part, dgl.DGLGraph): + for etype in g.canonical_etypes: + etype_id = g.get_etype_id(etype) + inner_edge_mask = _get_inner_edge_mask(part, etype_id) + num_inner_edges = F.sum(F.astype(inner_edge_mask, F.int64), 0) + num_edges[etype] += num_inner_edges + + # Verify the number of nodes are correct. 
+ if debug_mode or isinstance(part, dgl.DGLGraph): + for ntype in g.ntypes: + print( + "node {}: {}, {}".format( + ntype, g.num_nodes(ntype), num_nodes[ntype] + ) + ) + assert g.num_nodes(ntype) == num_nodes[ntype] + # Verify the number of edges are correct. + if store_inner_edge or isinstance(part, dgl.DGLGraph): + for etype in g.canonical_etypes: + print( + "edge {}: {}, {}".format( + etype, g.num_edges(etype), num_edges[etype] + ) + ) + assert g.num_edges(etype) == num_edges[etype] + + +def _verify_edge_id_range_hetero( + g, + part, + eids, +): + """ + check list: + make sure inner_eids fall into a range. + make sure all edges are included. + """ + edata = ( + part.edge_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.edata + ) + etype = ( + part.type_per_edge + if isinstance(part, gb.FusedCSCSamplingGraph) + else edata[dgl.ETYPE] + ) + eid = torch.arange(len(edata[dgl.EID])) + etype_arr = F.gather_row(etype, eid) + eid_arr = F.gather_row(edata[dgl.EID], eid) + for etype in g.canonical_etypes: + etype_id = g.get_etype_id(etype) + eids[etype].append(F.boolean_mask(eid_arr, etype_arr == etype_id)) + # Make sure edge Ids fall into a range. + inner_edge_mask = _get_inner_edge_mask(part, etype_id) + inner_eids = np.sort( + F.asnumpy(F.boolean_mask(edata[dgl.EID], inner_edge_mask)) + ) + assert np.all( + inner_eids == np.arange(inner_eids[0], inner_eids[-1] + 1) + ) + return eids + + +def _verify_node_id_range_hetero(g, part, nids): + """ + check list: + make sure inner nodes have Ids fall into a range. + """ + for ntype in g.ntypes: + ntype_id = g.get_ntype_id(ntype) + # Make sure inner nodes have Ids fall into a range. + inner_node_mask = _get_inner_node_mask(part, ntype_id) + inner_nids = F.boolean_mask( + part.node_attributes[dgl.NID], inner_node_mask + ) + assert np.all( + F.asnumpy( + inner_nids + == F.arange( + F.as_scalar(inner_nids[0]), + F.as_scalar(inner_nids[-1]) + 1, + ) + ) + ) + nids[ntype].append(inner_nids) + return nids + + +def _verify_graph_attributes_hetero( + g, + parts, + store_inner_edge, + store_inner_node, +): + """ + check list: + make sure edge ids fall into a range. + make sure inner nodes have Ids fall into a range. + make sure all nodes is included. + make sure all edges is included. + """ + nids = {ntype: [] for ntype in g.ntypes} + eids = {etype: [] for etype in g.canonical_etypes} + # check edge id. + if store_inner_edge or isinstance(parts[0], dgl.DGLGraph): + for part in parts: + # collect eids + eids = _verify_edge_id_range_hetero(g, part, eids) + for etype in eids: + eids_type = F.cat(eids[etype], 0) + uniq_ids = F.unique(eids_type) + # We should get all nodes. + assert len(uniq_ids) == g.num_edges(etype) + + # check node id. + if store_inner_node or isinstance(parts[0], dgl.DGLGraph): + for part in parts: + nids = _verify_node_id_range_hetero(g, part, nids) + for ntype in nids: + nids_type = F.cat(nids[ntype], 0) + uniq_ids = F.unique(nids_type) + # We should get all nodes. 
+ assert len(uniq_ids) == g.num_nodes(ntype) + + +def _verify_hetero_graph( + g, + parts, + store_eids=False, + store_inner_edge=False, + store_inner_node=False, + debug_mode=False, +): + _verify_hetero_graph_node_edge_num( + g, + parts, + store_inner_edge=store_inner_edge, + debug_mode=debug_mode, + ) + if store_eids: + _verify_graph_attributes_hetero( + g, + parts, + store_inner_edge=store_inner_edge, + store_inner_node=store_inner_node, + ) + + +def _test_pipeline_graphbolt( + num_chunks, + num_parts, + world_size, + graph_formats=None, + data_fmt="numpy", + num_chunks_nodes=None, + num_chunks_edges=None, + num_chunks_node_data=None, + num_chunks_edge_data=None, + use_verify_partitions=False, + store_eids=True, + store_inner_edge=True, + store_inner_node=True, +): + if num_parts % world_size != 0: + # num_parts should be a multiple of world_size + return + + with tempfile.TemporaryDirectory() as root_dir: + g = create_chunked_dataset( + root_dir, + num_chunks, + data_fmt=data_fmt, + num_chunks_nodes=num_chunks_nodes, + num_chunks_edges=num_chunks_edges, + num_chunks_node_data=num_chunks_node_data, + num_chunks_edge_data=num_chunks_edge_data, + ) + graph_name = "test" + test_ntype = "paper" + test_etype = ("paper", "cites", "paper") + + # Step1: graph partition + in_dir = os.path.join(root_dir, "chunked-data") + output_dir = os.path.join(root_dir, "parted_data") + os.system( + "python3 tools/partition_algo/random_partition.py " + "--in_dir {} --out_dir {} --num_partitions {}".format( + in_dir, output_dir, num_parts + ) + ) + for ntype in ["author", "institution", "paper"]: + fname = os.path.join(output_dir, "{}.txt".format(ntype)) + with open(fname, "r") as f: + header = f.readline().rstrip() + assert isinstance(int(header), int) + + # Step2: data dispatch + partition_dir = os.path.join(root_dir, "parted_data") + out_dir = os.path.join(root_dir, "partitioned") + ip_config = os.path.join(root_dir, "ip_config.txt") + with open(ip_config, "w") as f: + for i in range(world_size): + f.write(f"127.0.0.{i + 1}\n") + + cmd = "python3 tools/dispatch_data.py " + cmd += f" --in-dir {in_dir} " + cmd += f" --partitions-dir {partition_dir} " + cmd += f" --out-dir {out_dir} " + cmd += f" --ip-config {ip_config} " + cmd += " --ssh-port 22 " + cmd += " --process-group-timeout 60 " + cmd += " --save-orig-nids " + cmd += " --save-orig-eids " + cmd += " --use-graphbolt " + cmd += f" --graph-formats {graph_formats} " if graph_formats else "" + + if store_eids: + cmd += " --store-eids " + if store_inner_edge: + cmd += " --store-inner-edge " + if store_inner_node: + cmd += " --store-inner-node " + os.system(cmd) + + # check if verify_partitions.py is used for validation. + if use_verify_partitions: + cmd = "python3 tools/verify_partitions.py " + cmd += f" --orig-dataset-dir {in_dir}" + cmd += f" --part-graph {out_dir}" + cmd += f" --partitions-dir {output_dir}" + os.system(cmd) + return + + # read original node/edge IDs + def read_orig_ids(fname): + orig_ids = {} + for i in range(num_parts): + ids_path = os.path.join(out_dir, f"part{i}", fname) + part_ids = load_tensors(ids_path) + for type, data in part_ids.items(): + if type not in orig_ids: + orig_ids[type] = data + else: + orig_ids[type] = torch.cat((orig_ids[type], data)) + return orig_ids + + orig_nids, orig_eids = None, None + orig_nids = read_orig_ids("orig_nids.dgl") + + orig_eids_str = read_orig_ids("orig_eids.dgl") + + orig_eids = {} + # transmit etype from string to tuple. 
+ for etype, eids in orig_eids_str.items(): + orig_eids[_etype_str_to_tuple(etype)] = eids + + # load partitions and verify + part_config = os.path.join(out_dir, "metadata.json") + parts = _verify_graphbolt_part( + g, + root_dir, + orig_nids, + orig_eids, + graph_name, + num_parts, + store_inner_node, + store_inner_edge, + store_eids, + test_ntype=test_ntype, + test_etype=test_etype, + part_config=part_config, + is_homo=False, + ) + _verify_hetero_graph( + g, + parts, + store_eids=store_eids, + store_inner_edge=store_inner_edge, + ) + + +@pytest.mark.parametrize( + "num_chunks, num_parts, world_size", + [[4, 4, 4], [8, 4, 2], [8, 4, 4], [9, 6, 3], [11, 11, 1], [11, 4, 1]], +) +def test_pipeline_basics(num_chunks, num_parts, world_size): + _test_pipeline_graphbolt( + num_chunks, + num_parts, + world_size, + ) + _test_pipeline_graphbolt( + num_chunks, num_parts, world_size, use_verify_partitions=False + ) + + +@pytest.mark.parametrize("store_inner_node", [True, False]) +@pytest.mark.parametrize("store_inner_edge", [True, False]) +@pytest.mark.parametrize("store_eids", [True, False]) +def test_pipeline_attributes(store_inner_node, store_inner_edge, store_eids): + _test_pipeline_graphbolt( + 4, + 4, + 4, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, + store_eids=store_eids, + ) + + +@pytest.mark.parametrize( + "num_chunks, " + "num_parts, " + "world_size, " + "num_chunks_node_data, " + "num_chunks_edge_data", + [ + # Test cases where no. of chunks more than + # no. of partitions + [8, 4, 4, 8, 8], + [8, 4, 2, 8, 8], + [9, 7, 5, 9, 9], + [8, 8, 4, 8, 8], + # Test cases where no. of chunks smaller + # than no. of partitions + [7, 8, 4, 7, 7], + [1, 8, 4, 1, 1], + [1, 4, 4, 1, 1], + [3, 4, 4, 3, 3], + [1, 4, 2, 1, 1], + [3, 4, 2, 3, 3], + [1, 5, 3, 1, 1], + ], +) +def test_pipeline_arbitrary_chunks( + num_chunks, + num_parts, + world_size, + num_chunks_node_data, + num_chunks_edge_data, +): + + _test_pipeline_graphbolt( + num_chunks, + num_parts, + world_size, + num_chunks_node_data=num_chunks_node_data, + num_chunks_edge_data=num_chunks_edge_data, + ) + + +@pytest.mark.parametrize("data_fmt", ["numpy", "parquet"]) +def test_pipeline_feature_format(data_fmt): + _test_pipeline_graphbolt(4, 4, 4, data_fmt=data_fmt) diff --git a/tools/dispatch_data.py b/tools/dispatch_data.py index 3cf1d0fbf224..b2b54e51a6ec 100644 --- a/tools/dispatch_data.py +++ b/tools/dispatch_data.py @@ -75,6 +75,10 @@ def submit_jobs(args) -> str: argslist += "--log-level {} ".format(args.log_level) argslist += "--save-orig-nids " if args.save_orig_nids else "" argslist += "--save-orig-eids " if args.save_orig_eids else "" + argslist += "--use-graphbolt " if args.use_graphbolt else "" + argslist += "--store-eids " if args.store_eids else "" + argslist += "--store-inner-node " if args.store_inner_node else "" + argslist += "--store-inner-edge " if args.store_inner_edge else "" argslist += ( f"--graph-formats {args.graph_formats} " if args.graph_formats else "" ) @@ -159,6 +163,30 @@ def main(): action="store_true", help="Save original edge IDs into files", ) + parser.add_argument( + "--use-graphbolt", + action="store_true", + help="Use GraphBolt for distributed partition.", + ) + parser.add_argument( + "--store-inner-node", + action="store_true", + default=False, + help="Store inner nodes.", + ) + + parser.add_argument( + "--store-inner-edge", + action="store_true", + default=False, + help="Store inner edges.", + ) + parser.add_argument( + "--store-eids", + action="store_true", + default=False, + 
help="Store edge IDs.", + ) parser.add_argument( "--graph-formats", type=str, diff --git a/tools/distpartitioning/convert_partition.py b/tools/distpartitioning/convert_partition.py index a169589a3f97..5013b6d40f20 100644 --- a/tools/distpartitioning/convert_partition.py +++ b/tools/distpartitioning/convert_partition.py @@ -1,24 +1,25 @@ -import argparse +import copy import gc -import json import logging import os -import time import constants - import dgl +import dgl.backend as F +import dgl.graphbolt as gb import numpy as np -import pandas as pd -import pyarrow import torch as th +from dgl import EID, ETYPE, NID, NTYPE + +from dgl.distributed.constants import DGL2GB_EID, GB_DST_ID from dgl.distributed.partition import ( + _cast_to_minimum_dtype, _etype_str_to_tuple, _etype_tuple_to_str, + cast_various_to_minimum_dtype_gb, RESERVED_FIELD_DTYPE, ) -from pyarrow import csv -from utils import get_idranges, memory_snapshot, read_json +from utils import get_idranges, memory_snapshot def _get_unique_invidx(srcids, dstids, nids, low_mem=True): @@ -164,7 +165,202 @@ def _get_unique_invidx(srcids, dstids, nids, low_mem=True): return uniques, idxes, srcids, dstids -def create_dgl_object( +# Utility functions. +def _is_homogeneous(ntypes, etypes): + """Checks if the provided ntypes and etypes form a homogeneous graph.""" + return len(ntypes) == 1 and len(etypes) == 1 + + +def _coo2csc(src_ids, dst_ids): + src_ids, dst_ids = th.tensor(src_ids, dtype=th.int64), th.tensor( + dst_ids, dtype=th.int64 + ) + num_nodes = th.max(th.stack([src_ids, dst_ids], dim=0)).item() + 1 + dst, idx = dst_ids.sort() + indptr = th.searchsorted(dst, th.arange(num_nodes + 1)) + indices = src_ids[idx] + return indptr, indices, idx + + +def _create_edge_data(edgeid_offset, etype_ids, num_edges): + eid = th.arange( + edgeid_offset, + edgeid_offset + num_edges, + dtype=RESERVED_FIELD_DTYPE[dgl.EID], + ) + etype = th.as_tensor(etype_ids, dtype=RESERVED_FIELD_DTYPE[dgl.ETYPE]) + inner_edge = th.ones(num_edges, dtype=RESERVED_FIELD_DTYPE["inner_edge"]) + return eid, etype, inner_edge + + +def _create_node_data(ntype, uniq_ids, reshuffle_nodes, inner_nodes): + node_type = th.as_tensor(ntype, dtype=RESERVED_FIELD_DTYPE[dgl.NTYPE]) + node_id = th.as_tensor(uniq_ids[reshuffle_nodes]) + inner_node = th.as_tensor( + inner_nodes[reshuffle_nodes], + dtype=RESERVED_FIELD_DTYPE["inner_node"], + ) + return node_type, node_id, inner_node + + +def _compute_node_ntype( + global_src_id, global_dst_id, global_homo_nid, idx, reshuffle_nodes, id_map +): + global_ids = np.concatenate([global_src_id, global_dst_id, global_homo_nid]) + part_global_ids = global_ids[idx] + part_global_ids = part_global_ids[reshuffle_nodes] + ntype, per_type_ids = id_map(part_global_ids) + return ntype, per_type_ids + + +def _graph_orig_ids( + return_orig_nids, + return_orig_eids, + ntypes_map, + etypes_map, + node_attr, + edge_attr, + per_type_ids, + type_per_edge, + global_edge_id, +): + orig_nids = None + orig_eids = None + if return_orig_nids: + orig_nids = {} + for ntype, ntype_id in ntypes_map.items(): + mask = th.logical_and( + node_attr[dgl.NTYPE] == ntype_id, + node_attr["inner_node"], + ) + orig_nids[ntype] = th.as_tensor(per_type_ids[mask]) + if return_orig_eids: + orig_eids = {} + for etype, etype_id in etypes_map.items(): + mask = th.logical_and( + type_per_edge == etype_id, + edge_attr["inner_edge"], + ) + orig_eids[_etype_tuple_to_str(etype)] = th.as_tensor( + global_edge_id[mask] + ) + return orig_nids, orig_eids + + +def _create_edge_attr_gb( + 
part_local_dst_id, edgeid_offset, etype_ids, ntypes, etypes, etypes_map +): + edge_attr = {} + # create edge data in graph. + num_edges = len(part_local_dst_id) + ( + edge_attr[dgl.EID], + type_per_edge, + edge_attr["inner_edge"], + ) = _create_edge_data(edgeid_offset, etype_ids, num_edges) + assert "inner_edge" in edge_attr + + is_homo = _is_homogeneous(ntypes, etypes) + + edge_type_to_id = ( + {gb.etype_tuple_to_str(("_N", "_E", "_N")): 0} + if is_homo + else { + gb.etype_tuple_to_str(etype): etid + for etype, etid in etypes_map.items() + } + ) + return edge_attr, type_per_edge, edge_type_to_id + + +def _create_node_attr( + idx, + global_src_id, + global_dst_id, + global_homo_nid, + uniq_ids, + reshuffle_nodes, + id_map, + inner_nodes, +): + # compute per_type_ids and ntype for all the nodes in the graph. + ntype, per_type_ids = _compute_node_ntype( + global_src_id, + global_dst_id, + global_homo_nid, + idx, + reshuffle_nodes, + id_map, + ) + + # create node data in graph. + node_attr = {} + ( + node_attr[dgl.NTYPE], + node_attr[dgl.NID], + node_attr["inner_node"], + ) = _create_node_data(ntype, uniq_ids, reshuffle_nodes, inner_nodes) + return node_attr, per_type_ids + + +def remove_attr_gb( + edge_attr, node_attr, store_inner_node, store_inner_edge, store_eids +): + edata, ndata = copy.deepcopy(edge_attr), copy.deepcopy(node_attr) + if not store_inner_edge: + assert "inner_edge" in edata + edata.pop("inner_edge") + + if not store_eids: + assert dgl.EID in edata + edata.pop(dgl.EID) + + if not store_inner_node: + assert "inner_node" in ndata + ndata.pop("inner_node") + return edata, ndata + + +def _process_partition_gb( + node_attr, + edge_attr, + type_per_edge, + src_ids, + dst_ids, + sort_etypes, +): + """Preprocess partitions before saving: + 1. format data types. + 2. sort csc/csr by tag. + """ + for k, dtype in RESERVED_FIELD_DTYPE.items(): + if k in node_attr: + node_attr[k] = F.astype(node_attr[k], dtype) + if k in edge_attr: + edge_attr[k] = F.astype(edge_attr[k], dtype) + + indptr, indices, edge_ids = _coo2csc(src_ids, dst_ids) + if sort_etypes: + split_size = th.diff(indptr) + split_indices = th.split(type_per_edge, tuple(split_size), dim=0) + sorted_idxs = [] + for split_indice in split_indices: + sorted_idxs.append(split_indice.sort()[1]) + + sorted_idx = th.cat(sorted_idxs, dim=0) + sorted_idx = ( + th.repeat_interleave(indptr[:-1], split_size, dim=0) + sorted_idx + ) + + return indptr, indices[sorted_idx], edge_ids[sorted_idx] + + +def create_graph_object( + tot_node_count, + tot_edge_count, + node_count, + edge_count, + num_parts, schema, part_id, node_data, @@ -174,6 +370,8 @@ def create_dgl_object( edge_typecounts, return_orig_nids=False, return_orig_eids=False, + use_graphbolt=False, + **kwargs, ): """ This function creates dgl objects for a given graph partition, as in function @@ -223,6 +421,18 @@ def create_dgl_object( Parameters: ----------- + tot_node_count : int + the number of all nodes + tot_edge_count : int + the number of all edges + node_count : int + the number of nodes in partition + edge_count : int + the number of edges in partition + graph_formats : str + the format of graph + num_parts : int + the number of parts schame : json object json object created by reading the graph metadata json file part_id : int @@ -449,58 +659,134 @@ def create_dgl_object( nid_map[part_local_dst_id], ) - # create the graph here now. 
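For readers unfamiliar with the layout produced above, here is a small, self-contained sketch (toy tensors, not part of the patch) of what `_coo2csc` and the `sort_etypes` branch of `_process_partition_gb` compute: edges are grouped by destination with a sort, `indptr` comes from `searchsorted`, and the edges of each destination are then reordered by edge type.

```python
import torch as th

# Toy COO graph: 4 nodes, 5 edges (illustrative values only).
src_ids = th.tensor([1, 0, 3, 2, 0])
dst_ids = th.tensor([0, 2, 2, 1, 1])
etype_ids = th.tensor([1, 0, 1, 0, 0])  # edge type of each COO edge

# COO -> CSC, mirroring _coo2csc: sort by destination, then build indptr.
num_nodes = int(th.max(th.stack([src_ids, dst_ids])).item()) + 1
dst_sorted, perm = dst_ids.sort()
indptr = th.searchsorted(dst_sorted, th.arange(num_nodes + 1))
indices = src_ids[perm]              # neighbors grouped by destination
type_per_edge = etype_ids[perm]

# Per-destination sort by edge type, mirroring the sort_etypes branch.
split_size = th.diff(indptr)
sorted_idx = th.cat(
    [chunk.sort()[1] for chunk in th.split(type_per_edge, split_size.tolist())]
)
sorted_idx = th.repeat_interleave(indptr[:-1], split_size) + sorted_idx

print(indptr.tolist())                     # [0, 1, 3, 5, 5]
print(indices[sorted_idx].tolist())        # per-node neighbors, etype-sorted
print(type_per_edge[sorted_idx].tolist())  # non-decreasing within each node
```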
- part_graph = dgl.graph( - data=(part_local_src_id, part_local_dst_id), num_nodes=len(uniq_ids) - ) - part_graph.edata[dgl.EID] = th.arange( - edgeid_offset, - edgeid_offset + part_graph.num_edges(), - dtype=th.int64, - ) - part_graph.edata[dgl.ETYPE] = th.as_tensor( - etype_ids, dtype=RESERVED_FIELD_DTYPE[dgl.ETYPE] - ) - part_graph.edata["inner_edge"] = th.ones( - part_graph.num_edges(), dtype=RESERVED_FIELD_DTYPE["inner_edge"] - ) + """ + Creating attributes for graphbolt and DGLGraph is as follows. - # compute per_type_ids and ntype for all the nodes in the graph. - global_ids = np.concatenate([global_src_id, global_dst_id, global_homo_nid]) - part_global_ids = global_ids[idx] - part_global_ids = part_global_ids[reshuffle_nodes] - ntype, per_type_ids = id_map(part_global_ids) + node attributes: + this part is implemented in _create_node_attr. + compute the ntype and per type ids for each node with global node type id. + create ntype, nid and inner node with orig ntype and inner nodes + this part is shared by graphbolt and DGLGraph. - # continue with the graph creation - part_graph.ndata[dgl.NTYPE] = th.as_tensor( - ntype, dtype=RESERVED_FIELD_DTYPE[dgl.NTYPE] - ) - part_graph.ndata[dgl.NID] = th.as_tensor(uniq_ids[reshuffle_nodes]) - part_graph.ndata["inner_node"] = th.as_tensor( - inner_nodes[reshuffle_nodes], dtype=RESERVED_FIELD_DTYPE["inner_node"] + the attributes created for graphbolt are as follows: + + edge attributes: + this part is implemented in _create_edge_attr_gb. + create eid, type per edge and inner edge with edgeid_offset. + create edge_type_to_id with etypes_map. + + The process to remove extra attribute is implemented in remove_attr_gb. + the unused attributes like inner_node, inner_edge, eids will be removed following the arguments in kwargs. + edge_attr, node_attr are the variable that have removed extra attributes to construct csc_graph. + edata, ndata are the variable that reserve extra attributes to be used to generate orig_nid and orig_eid. + + the src_ids and dst_ids will be transformed into indptr and indices in _coo2csc. + + all variable mentioned above will be casted to minimum data type in cast_various_to_minimum_dtype_gb. + + orig_nids and orig_eids will be generated in _graph_orig_ids with ndata and edata. + """ + # create the graph here now. 
+ ndata, per_type_ids = _create_node_attr( + idx, + global_src_id, + global_dst_id, + global_homo_nid, + uniq_ids, + reshuffle_nodes, + id_map, + inner_nodes, ) + if use_graphbolt: + edata, type_per_edge, edge_type_to_id = _create_edge_attr_gb( + part_local_dst_id, + edgeid_offset, + etype_ids, + ntypes, + etypes, + etypes_map, + ) - orig_nids = None - orig_eids = None - if return_orig_nids: - orig_nids = {} - for ntype, ntype_id in ntypes_map.items(): - mask = th.logical_and( - part_graph.ndata[dgl.NTYPE] == ntype_id, - part_graph.ndata["inner_node"], - ) - orig_nids[ntype] = th.as_tensor(per_type_ids[mask]) - if return_orig_eids: - orig_eids = {} - for etype, etype_id in etypes_map.items(): - mask = th.logical_and( - part_graph.edata[dgl.ETYPE] == etype_id, - part_graph.edata["inner_edge"], - ) - orig_eids[_etype_tuple_to_str(etype)] = th.as_tensor( - global_edge_id[mask] - ) + assert edata is not None + assert ndata is not None + + sort_etypes = len(etypes_map) > 1 + indptr, indices, csc_edge_ids = _process_partition_gb( + ndata, + edata, + type_per_edge, + part_local_src_id, + part_local_dst_id, + sort_etypes, + ) + edge_attr, node_attr = remove_attr_gb( + edge_attr=edata, node_attr=ndata, **kwargs + ) + edge_attr = { + attr: edge_attr[attr][csc_edge_ids] for attr in edge_attr.keys() + } + cast_various_to_minimum_dtype_gb( + node_count=node_count, + edge_count=edge_count, + tot_node_count=tot_node_count, + tot_edge_count=tot_edge_count, + num_parts=num_parts, + indptr=indptr, + indices=indices, + type_per_edge=type_per_edge, + etypes=etypes, + ntypes=ntypes, + node_attributes=node_attr, + edge_attributes=edge_attr, + ) + part_graph = gb.fused_csc_sampling_graph( + csc_indptr=indptr, + indices=indices, + node_type_offset=None, + type_per_edge=type_per_edge[csc_edge_ids], + node_attributes=node_attr, + edge_attributes=edge_attr, + node_type_to_id=ntypes_map, + edge_type_to_id=edge_type_to_id, + ) + else: + num_edges = len(part_local_dst_id) + part_graph = dgl.graph( + data=(part_local_src_id, part_local_dst_id), num_nodes=len(uniq_ids) + ) + # create edge data in graph. + ( + part_graph.edata[dgl.EID], + part_graph.edata[dgl.ETYPE], + part_graph.edata["inner_edge"], + ) = _create_edge_data(edgeid_offset, etype_ids, num_edges) + ndata, per_type_ids = _create_node_attr( + idx, + global_src_id, + global_dst_id, + global_homo_nid, + uniq_ids, + reshuffle_nodes, + id_map, + inner_nodes, + ) + for attr_name, node_attributes in ndata.items(): + part_graph.ndata[attr_name] = node_attributes + type_per_edge = part_graph.edata[dgl.ETYPE] + ndata, edata = part_graph.ndata, part_graph.edata + # get the original node ids and edge ids from original graph. 
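The `use_graphbolt` branch above boils down to handing a CSC structure plus the surviving node/edge attributes to `gb.fused_csc_sampling_graph` and later serializing the result with `torch.save`. A minimal sketch of that call on made-up homogeneous data (attribute names mirror the pipeline; the numbers are illustrative, not derived from any partition) could look like this:

```python
import torch
import dgl
import dgl.graphbolt as gb

# Toy homogeneous partition in CSC form; values are illustrative only.
indptr = torch.tensor([0, 2, 4, 5])        # 3 nodes
indices = torch.tensor([1, 2, 0, 2, 0])    # 5 edges grouped by destination
node_attributes = {
    dgl.NID: torch.tensor([10, 11, 12]),   # global node IDs in this partition
    "inner_node": torch.ones(3, dtype=torch.uint8),
}
edge_attributes = {
    dgl.EID: torch.arange(5),
    "inner_edge": torch.ones(5, dtype=torch.uint8),
}
part_graph = gb.fused_csc_sampling_graph(
    csc_indptr=indptr,
    indices=indices,
    node_attributes=node_attributes,
    edge_attributes=edge_attributes,
)
torch.save(part_graph, "fused_csc_sampling_graph.pt")
```

Attributes that the caller did not ask to keep (`inner_node`, `inner_edge`, `dgl.EID`) are stripped by `remove_attr_gb` before this point, which keeps the serialized partition smaller.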
+ orig_nids, orig_eids = _graph_orig_ids( + return_orig_nids, + return_orig_eids, + ntypes_map, + etypes_map, + ndata, + edata, + per_type_ids, + type_per_edge, + global_edge_id, + ) return ( part_graph, node_map_val, @@ -523,6 +809,7 @@ def create_metadata_json( ntypes_map, etypes_map, output_dir, + use_graphbolt, ): """ Auxiliary function to create json file for the graph partition metadata @@ -549,6 +836,8 @@ def create_metadata_json( map between edge type(string) and edge_type_id(int) output_dir : string directory where the output files are to be stored + use_graphbolt : bool + whether to use graphbolt or not Returns: -------- @@ -572,10 +861,14 @@ def create_metadata_json( part_dir = "part" + str(part_id) node_feat_file = os.path.join(part_dir, "node_feat.dgl") edge_feat_file = os.path.join(part_dir, "edge_feat.dgl") - part_graph_file = os.path.join(part_dir, "graph.dgl") + if use_graphbolt: + part_graph_file = os.path.join(part_dir, "fused_csc_sampling_graph.pt") + else: + part_graph_file = os.path.join(part_dir, "graph.dgl") + part_graph_type = "part_graph_graphbolt" if use_graphbolt else "part_graph" part_metadata["part-{}".format(part_id)] = { "node_feats": node_feat_file, "edge_feats": edge_feat_file, - "part_graph": part_graph_file, + part_graph_type: part_graph_file, } return part_metadata diff --git a/tools/distpartitioning/data_proc_pipeline.py b/tools/distpartitioning/data_proc_pipeline.py index 4c249a34b6b2..e0159f55b9a1 100644 --- a/tools/distpartitioning/data_proc_pipeline.py +++ b/tools/distpartitioning/data_proc_pipeline.py @@ -94,6 +94,30 @@ def log_params(params): action="store_true", help="Save original edge IDs into files", ) + parser.add_argument( + "--use-graphbolt", + action="store_true", + help="Use GraphBolt for distributed partition.", + ) + parser.add_argument( + "--store-inner-node", + action="store_true", + default=False, + help="Store inner nodes.", + ) + + parser.add_argument( + "--store-inner-edge", + action="store_true", + default=False, + help="Store inner edges.", + ) + parser.add_argument( + "--store-eids", + action="store_true", + default=False, + help="Store edge IDs.", + ) parser.add_argument( "--graph-formats", default=None, @@ -101,7 +125,6 @@ def log_params(params): help="Save partitions in specified formats.", ) params = parser.parse_args() - # invoke the pipeline function numeric_level = getattr(logging, params.log_level.upper(), None) logging.basicConfig( diff --git a/tools/distpartitioning/data_shuffle.py b/tools/distpartitioning/data_shuffle.py index 7cba2cbeecda..6800064a2b0b 100644 --- a/tools/distpartitioning/data_shuffle.py +++ b/tools/distpartitioning/data_shuffle.py @@ -13,7 +13,7 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp -from convert_partition import create_dgl_object, create_metadata_json +from convert_partition import create_graph_object, create_metadata_json from dataset_utils import get_dataset from dist_lookup import DistLookupService from globalids import ( @@ -1121,7 +1121,6 @@ def gen_dist_partitions(rank, world_size, params): ) id_map = dgl.distributed.id_map.IdMap(global_nid_ranges) id_lookup.set_idMap(id_map) - # read input graph files and augment these datastructures with # appropriate information (global_nid and owner process) for node and edge data ( @@ -1315,6 +1314,8 @@ def prepare_local_data(src_data, local_part_id): ) local_node_data = prepare_local_data(node_data, local_part_id) local_edge_data = prepare_local_data(edge_data, local_part_id) + tot_node_count = 
sum(schema_map["num_nodes_per_type"]) + tot_edge_count = sum(schema_map["num_edges_per_type"]) ( graph_obj, ntypes_map_val, @@ -1323,7 +1324,12 @@ def prepare_local_data(src_data, local_part_id): etypes_map, orig_nids, orig_eids, - ) = create_dgl_object( + ) = create_graph_object( + tot_node_count, + tot_edge_count, + node_count, + edge_count, + params.num_parts, schema_map, rank + local_part_id * world_size, local_node_data, @@ -1334,8 +1340,12 @@ def prepare_local_data(src_data, local_part_id): schema_map[constants.STR_NUM_NODES_PER_TYPE], ), edge_typecounts, - params.save_orig_nids, - params.save_orig_eids, + return_orig_nids=params.save_orig_nids, + return_orig_eids=params.save_orig_eids, + use_graphbolt=params.use_graphbolt, + store_inner_node=params.store_inner_node, + store_inner_edge=params.store_inner_edge, + store_eids=params.store_eids, ) sort_etypes = len(etypes_map) > 1 local_node_features = prepare_local_data( @@ -1354,8 +1364,12 @@ def prepare_local_data(src_data, local_part_id): orig_eids, graph_formats, sort_etypes, + params.use_graphbolt, ) - memory_snapshot("DiskWriteDGLObjectsComplete: ", rank) + if params.use_graphbolt: + memory_snapshot("DiskWriteGrapgboltObjectsComplete: ", rank) + else: + memory_snapshot("DiskWriteDGLObjectsComplete: ", rank) # get the meta-data json_metadata = create_metadata_json( @@ -1369,6 +1383,7 @@ def prepare_local_data(src_data, local_part_id): ntypes_map, etypes_map, params.output, + params.use_graphbolt, ) output_meta_json[ "local-part-id-" + str(local_part_id * world_size + rank) diff --git a/tools/distpartitioning/utils.py b/tools/distpartitioning/utils.py index cdb984be3796..32292a843bc5 100644 --- a/tools/distpartitioning/utils.py +++ b/tools/distpartitioning/utils.py @@ -504,6 +504,20 @@ def write_edge_features(edge_features, edge_file): dgl.data.utils.save_tensors(edge_file, edge_features) +def write_graph_graghbolt(graph_file, graph_obj): + """ + Utility function to serialize FusedCSCSamplingGraph + + Parameters: + ----------- + graph_obj : FusedCSCSamplingGraph + FusedCSCSamplingGraph, as created in convert_partition.py, which is to be serialized + graph_file : string + File name in which graph object is serialized + """ + torch.save(graph_obj, graph_file) + + def write_graph_dgl(graph_file, graph_obj, formats, sort_etypes): """ Utility function to serialize graph dgl objects @@ -519,9 +533,23 @@ def write_graph_dgl(graph_file, graph_obj, formats, sort_etypes): sort_etypes : bool Whether to sort etypes in csc/csr. """ - dgl.distributed.partition._save_graphs( - graph_file, [graph_obj], formats, sort_etypes + dgl.distributed.partition.process_partitions( + graph_obj, formats, sort_etypes ) + dgl.save_graphs(graph_file, [graph_obj], formats=formats) + + +def _write_graph( + part_dir, graph_obj, formats=None, sort_etypes=None, use_graphbolt=False +): + if use_graphbolt: + write_graph_graghbolt( + os.path.join(part_dir, "fused_csc_sampling_graph.pt"), graph_obj + ) + else: + write_graph_dgl( + os.path.join(part_dir, "graph.dgl"), graph_obj, formats, sort_etypes + ) def write_dgl_objects( @@ -534,6 +562,7 @@ def write_dgl_objects( orig_eids, formats, sort_etypes, + use_graphbolt, ): """ Wrapper function to write graph, node/edge feature, original node/edge IDs. @@ -558,13 +587,18 @@ def write_dgl_objects( Save graph in formats. sort_etypes : bool Whether to sort etypes in csc/csr. + use_graphbolt : bool + Whether to use graphbolt or not. 
""" part_dir = output_dir + "/part" + str(part_id) os.makedirs(part_dir, exist_ok=True) - write_graph_dgl( - os.path.join(part_dir, "graph.dgl"), graph_obj, formats, sort_etypes + _write_graph( + part_dir, + graph_obj, + formats=formats, + sort_etypes=sort_etypes, + use_graphbolt=use_graphbolt, ) - if node_features != None: write_node_features( node_features, os.path.join(part_dir, "node_feat.dgl") From 31ad9b5a8d88f97742d2bb4f43809483fb57a71c Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 19 Sep 2024 14:57:42 -0400 Subject: [PATCH 09/10] [GraphBolt][CUDA] Fix Cooperative Minibatching bugs. (#7804) --- .../cuda/cooperative_minibatching_utils.cu | 12 ++++++-- .../src/cuda/cooperative_minibatching_utils.h | 29 ++++++++++--------- .../cuda/extension/unique_and_compact_map.cu | 8 +++-- python/dgl/graphbolt/impl/cooperative_conv.py | 6 ++-- .../test_cooperative_minibatching_utils.py | 10 +++---- 5 files changed, 37 insertions(+), 28 deletions(-) diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu index fb9858f6d559..e192de458650 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu @@ -19,6 +19,7 @@ * implementations in CUDA. */ #include +#include #include #include @@ -62,8 +63,7 @@ RankSortImpl( auto part_ids2 = part_ids.clone(); auto part_ids2_sorted = torch::empty_like(part_ids2); auto nodes_sorted = torch::empty_like(nodes); - auto index = ops::IndptrEdgeIdsImpl( - offsets_dev, nodes.scalar_type(), torch::nullopt, nodes.numel()); + auto index = torch::arange(nodes.numel(), nodes.options()); auto index_sorted = torch::empty_like(index); return AT_DISPATCH_INDEX_TYPES( nodes.scalar_type(), "RankSortImpl", ([&] { @@ -100,8 +100,14 @@ RankSortImpl( index.data_ptr(), index_sorted.data_ptr(), nodes.numel(), num_batches, offsets_dev_ptr, offsets_dev_ptr + 1, 0, num_bits); + auto values = ops::IndptrEdgeIdsImpl( + offsets_dev, nodes.scalar_type(), torch::nullopt, nodes.numel()); + THRUST_CALL( + scatter, values.data_ptr(), + values.data_ptr() + values.numel(), + index_sorted.data_ptr(), index.data_ptr()); return std::make_tuple( - nodes_sorted, index_sorted, offsets, std::move(offsets_event)); + nodes_sorted, index, offsets, std::move(offsets_event)); })); } diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.h b/graphbolt/src/cuda/cooperative_minibatching_utils.h index 45bd203f1f71..efe2b5b28bf1 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.h +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.h @@ -42,21 +42,21 @@ torch::Tensor RankAssignment( /** * @brief Given node ids, the ranks they belong, the offsets to separate - * different node types and num_bits indicating the world size is <= 2^num_bits, - * returns node ids sorted w.r.t. the ranks that the given ids belong along with - * the original positions. + * different node types and world size, returns node ids sorted w.r.t. the ranks + * that the given ids belong along with their new positions. * * @param nodes Node id tensor to be mapped to a rank in [0, world_size). * @param part_ids Rank tensor the nodes belong to. * @param offsets_dev Offsets to separate different node types. * @param world_size World size, the total number of cooperating GPUs. 
* - * @return (sorted_nodes, original_positions, rank_offsets, rank_offsets_event), - * where the first one includes sorted nodes, the second contains original - * positions of the sorted nodes and the third contains the offsets of the - * sorted_nodes indicating sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] - * contains nodes that belongs to the `i`th rank. Before accessing rank_offsets - * on the CPU, `rank_offsets_event.synchronize()` is required. + * @return (sorted_nodes, new_positions, rank_offsets, rank_offsets_event), + * where the first one includes sorted nodes, the second contains new positions + * of the given nodes, so that sorted_nodes[new_positions] == nodes, and the + * third contains the offsets of the sorted_nodes indicating + * sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] contains nodes that + * belongs to the `i`th rank. Before accessing rank_offsets on the CPU, + * `rank_offsets_event.synchronize()` is required. */ std::tuple RankSortImpl( @@ -72,11 +72,12 @@ RankSortImpl( * @param rank Rank of the current GPU. * @param world_size World size, the total number of cooperating GPUs. * - * @return vector of (sorted_nodes, original_positions, rank_offsets), where the - * first one includes sorted nodes, the second contains original positions of - * the sorted nodes and the third contains the offsets of the sorted_nodes - * indicating sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] contains nodes - * that belongs to the `i`th rank. + * @return vector of (sorted_nodes, new_positions, rank_offsets), where the + * first one includes sorted nodes, the second contains new positions of the + * given nodes, so that sorted_nodes[new_positions] == nodes, and the third + * contains the offsets of the sorted_nodes indicating + * sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] contains nodes that + * belongs to the `i`th rank. 
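The contract documented above can be mimicked in plain PyTorch. The sketch below is CPU-only and single-node-type, with hypothetical `part_ids` standing in for the output of `RankAssignment`; it is not the CUDA implementation, but it shows the relationship that `sorted_nodes`, `new_positions` and `rank_offsets` satisfy.

```python
import torch

# Hypothetical ranks per node; the real code derives these from RankAssignment.
nodes = torch.tensor([14, 3, 27, 8, 51, 6])
part_ids = torch.tensor([2, 0, 1, 0, 2, 0])
world_size = 3

_, perm = torch.sort(part_ids, stable=True)   # group nodes by owning rank
sorted_nodes = nodes[perm]
rank_offsets = torch.zeros(world_size + 1, dtype=torch.int64)
rank_offsets[1:] = torch.cumsum(
    torch.bincount(part_ids, minlength=world_size), dim=0
)

# new_positions is the inverse permutation, so gathering undoes the sort.
new_positions = torch.empty_like(perm)
new_positions[perm] = torch.arange(perm.numel())
assert torch.equal(sorted_nodes[new_positions], nodes)
# Nodes owned by rank r: sorted_nodes[rank_offsets[r] : rank_offsets[r + 1]].
```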
*/ std::vector> RankSort( const std::vector& nodes_list, int64_t rank, diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index 3db918ee7fdb..56c4693bf5b0 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -99,8 +99,12 @@ __global__ void _MapIdsBatched( auto slot = map.find(key); auto new_id = slot->second; - if (index) new_id = index[new_id]; - mapped_ids[i] = new_id - unique_ids_offsets[batch_index]; + if (index) { + new_id = index[new_id]; + } else { + new_id -= unique_ids_offsets[batch_index]; + } + mapped_ids[i] = new_id; } i += stride; diff --git a/python/dgl/graphbolt/impl/cooperative_conv.py b/python/dgl/graphbolt/impl/cooperative_conv.py index 8040dabd286c..22c5ae316c71 100644 --- a/python/dgl/graphbolt/impl/cooperative_conv.py +++ b/python/dgl/graphbolt/impl/cooperative_conv.py @@ -78,10 +78,10 @@ def backward( torch.split(typed_grad_output, counts_sent[ntype]), ) i = out.new_empty(2, out.shape[0], dtype=torch.int64) - i[0] = torch.arange( + i[0] = seed_inverse_ids[ntype] # src + i[1] = torch.arange( out.shape[0], device=typed_grad_output.device - ) # src - i[1] = seed_inverse_ids[ntype] # dst + ) # dst coo = torch.sparse_coo_tensor( i, torch.ones( diff --git a/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py index 1de8669b0e08..f88e011f4385 100644 --- a/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py +++ b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py @@ -18,7 +18,7 @@ ) @pytest.mark.parametrize("dtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("rank", list(range(WORLD_SIZE))) -def test_gpu_cached_feature_read_async(dtype, rank): +def test_rank_sort_and_unique_and_compact(dtype, rank): torch.manual_seed(7) nodes_list1 = [ torch.randint(0, 2111111111, [777], dtype=dtype, device=F.ctx()) @@ -32,8 +32,8 @@ def test_gpu_cached_feature_read_async(dtype, rank): for i, ((nodes1, idx1, offsets1), (nodes2, idx2, offsets2)) in enumerate( zip(res1, res2) ): - assert_equal(nodes_list1[i], nodes1[idx1.sort()[1]]) - assert_equal(nodes_list2[i], nodes2[idx2.sort()[1]]) + assert_equal(nodes_list1[i], nodes1[idx1]) + assert_equal(nodes_list2[i], nodes2[idx2]) assert_equal(offsets1, offsets2) assert offsets1.is_pinned() and offsets2.is_pinned() @@ -50,14 +50,12 @@ def test_gpu_cached_feature_read_async(dtype, rank): for (nodes1, idx1, offsets1), (nodes4, idx4, offsets4) in zip(res1, res4): off1 = offsets1.tolist() off4 = offsets4.tolist() + assert_equal(nodes1[idx1], nodes4[idx4]) for i in range(WORLD_SIZE): j = (i - rank + WORLD_SIZE) % WORLD_SIZE assert_equal( nodes1[off1[j] : off1[j + 1]], nodes4[off4[i] : off4[i + 1]] ) - assert_equal( - idx1[off1[j] : off1[j + 1]], idx4[off4[i] : off4[i + 1]] - ) unique, compacted, offsets = gb.unique_and_compact( nodes_list1[:1], rank, WORLD_SIZE From 5ae64001f778d6da6d056f3da3e6b5349282ea00 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 19 Sep 2024 17:00:27 -0400 Subject: [PATCH 10/10] [GraphBolt][CUDA] `rank_sort_async` for Cooperative Minibatching. 
(#7805) --- .../cuda/cooperative_minibatching_utils.cu | 11 ++++++ .../src/cuda/cooperative_minibatching_utils.h | 7 ++++ graphbolt/src/python_binding.cc | 8 +++++ python/dgl/graphbolt/subgraph_sampler.py | 34 +++++++++++++------ .../pytorch/graphbolt/test_dataloader.py | 4 +-- 5 files changed, 52 insertions(+), 12 deletions(-) diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu index e192de458650..583e58629449 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu @@ -25,6 +25,7 @@ #include #include +#include "../utils.h" #include "./common.h" #include "./cooperative_minibatching_utils.cuh" #include "./cooperative_minibatching_utils.h" @@ -144,5 +145,15 @@ std::vector> RankSort( return results; } +c10::intrusive_ptr>>> +RankSortAsync( + const std::vector& nodes_list, const int64_t rank, + const int64_t world_size) { + return async( + [=] { return RankSort(nodes_list, rank, world_size); }, + utils::is_on_gpu(nodes_list.at(0))); +} + } // namespace cuda } // namespace graphbolt diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.h b/graphbolt/src/cuda/cooperative_minibatching_utils.h index efe2b5b28bf1..c506c18d21c3 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.h +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.h @@ -22,6 +22,7 @@ #define GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_H_ #include +#include #include namespace graphbolt { @@ -83,6 +84,12 @@ std::vector> RankSort( const std::vector& nodes_list, int64_t rank, int64_t world_size); +c10::intrusive_ptr>>> +RankSortAsync( + const std::vector& nodes_list, const int64_t rank, + const int64_t world_size); + } // namespace cuda } // namespace graphbolt diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index 35ab345c56f9..ed5e6273e7ac 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -59,6 +59,13 @@ TORCH_LIBRARY(graphbolt, m) { &Future>>:: Wait); + m.class_>>>( + "RankSortFuture") + .def( + "wait", + &Future>>::Wait); m.class_>>( "GpuGraphCacheQueryFuture") .def( @@ -198,6 +205,7 @@ TORCH_LIBRARY(graphbolt, m) { #ifdef GRAPHBOLT_USE_CUDA m.def("set_max_uva_threads", &cuda::set_max_uva_threads); m.def("rank_sort", &cuda::RankSort); + m.def("rank_sort_async", &cuda::RankSortAsync); #endif #ifdef HAS_IMPL_ABSTRACT_PYSTUB m.impl_abstract_pystub("dgl.graphbolt.base", "//dgl.graphbolt.base"); diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index dd5093ae5f69..88fc9c124de5 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -140,6 +140,9 @@ def __init__( if cooperative: datapipe = datapipe.transform(self._seeds_cooperative_exchange_1) datapipe = datapipe.buffer() + datapipe = datapipe.transform( + self._seeds_cooperative_exchange_1_wait_future + ).buffer() datapipe = datapipe.transform(self._seeds_cooperative_exchange_2) datapipe = datapipe.buffer() datapipe = datapipe.transform(self._seeds_cooperative_exchange_3) @@ -193,19 +196,32 @@ def _wait_preprocess_future(minibatch, cooperative: bool): return minibatch @staticmethod - def _seeds_cooperative_exchange_1(minibatch, group=None): - rank = thd.get_rank(group) - world_size = thd.get_world_size(group) + def _seeds_cooperative_exchange_1(minibatch): + rank = thd.get_rank() + world_size = thd.get_world_size() seeds = minibatch._seed_nodes is_homogeneous = not 
isinstance(seeds, dict) if is_homogeneous: seeds = {"_N": seeds} if minibatch._seeds_offsets is None: - seeds_list = list(seeds.values()) - result = torch.ops.graphbolt.rank_sort(seeds_list, rank, world_size) assert minibatch.compacted_seeds is None + minibatch._rank_sort_future = torch.ops.graphbolt.rank_sort_async( + list(seeds.values()), rank, world_size + ) + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_1_wait_future(minibatch): + world_size = thd.get_world_size() + seeds = minibatch._seed_nodes + is_homogeneous = not isinstance(seeds, dict) + if is_homogeneous: + seeds = {"_N": seeds} + num_ntypes = len(seeds.keys()) + if minibatch._seeds_offsets is None: + result = minibatch._rank_sort_future.wait() + delattr(minibatch, "_rank_sort_future") sorted_seeds, sorted_compacted, sorted_offsets = {}, {}, {} - num_ntypes = len(seeds.keys()) for i, ( seed_type, (typed_sorted_seeds, typed_index, typed_offsets), @@ -229,7 +245,6 @@ def _seeds_cooperative_exchange_1(minibatch, group=None): minibatch._counts_future = all_to_all( counts_received.split(num_ntypes), counts_sent.split(num_ntypes), - group=group, async_op=True, ) minibatch._counts_sent = counts_sent @@ -237,8 +252,8 @@ def _seeds_cooperative_exchange_1(minibatch, group=None): return minibatch @staticmethod - def _seeds_cooperative_exchange_2(minibatch, group=None): - world_size = thd.get_world_size(group) + def _seeds_cooperative_exchange_2(minibatch): + world_size = thd.get_world_size() seeds = minibatch._seed_nodes minibatch._counts_future.wait() delattr(minibatch, "_counts_future") @@ -256,7 +271,6 @@ def _seeds_cooperative_exchange_2(minibatch, group=None): all_to_all( typed_seeds_received.split(typed_counts_received), typed_seeds.split(typed_counts_sent), - group, ) seeds_received[ntype] = typed_seeds_received counts_sent[ntype] = typed_counts_sent diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index ee8f2b0cb9f5..5843264516fc 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -163,8 +163,8 @@ def test_gpu_sampling_DataLoader( if enable_feature_fetch: bufferer_cnt += 1 # feature fetch has 1. if cooperative: - # _preprocess stage and each sampling layer. - bufferer_cnt += 3 + # _preprocess stage. + bufferer_cnt += 4 datapipe_graph = traverse_dps(dataloader) bufferers = find_dps( datapipe_graph,
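Splitting `_seeds_cooperative_exchange_1` into a launch stage and a separate wait stage, with a buffer between them, is what lets the rank sort of one minibatch overlap with the preprocessing of the next. The toy stand-in below (a hypothetical `fake_rank_sort` replaces `torch.ops.graphbolt.rank_sort_async`, and a thread pool plays the role of the asynchronous CUDA work) sketches that launch/buffer/wait pattern outside of the datapipe machinery.

```python
from concurrent.futures import ThreadPoolExecutor

import torch

_pool = ThreadPoolExecutor(max_workers=1)


def fake_rank_sort(seeds, world_size):
    # Hypothetical placeholder for torch.ops.graphbolt.rank_sort_async.
    part_ids = seeds % world_size
    _, perm = torch.sort(part_ids, stable=True)
    return seeds[perm]


def stage_launch(minibatch, world_size=2):
    # Kick off the asynchronous work and attach the future to the minibatch.
    minibatch["_future"] = _pool.submit(
        fake_rank_sort, minibatch["seeds"], world_size
    )
    return minibatch


def stage_wait(minibatch):
    # Consume the future only when the result is actually needed.
    minibatch["sorted_seeds"] = minibatch["_future"].result()
    del minibatch["_future"]
    return minibatch


batches = [{"seeds": torch.randint(0, 100, (8,))} for _ in range(3)]
launched = [stage_launch(mb) for mb in batches]  # the buffer: launch everything
for mb in launched:
    stage_wait(mb)
    print(mb["sorted_seeds"])
```

This is also why the dataloader test above now counts one more bufferer when `cooperative=True`: the extra wait stage in the subgraph sampler comes with its own buffer.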