From 53e70c57fbbf17c2549feb4572ebf8cd7126c987 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 11 Sep 2024 17:46:50 -0400 Subject: [PATCH 01/10] [GraphBolt][CUDA] Cooperative Minibatching initial exchange. (#7795) --- python/dgl/graphbolt/internal/sample_utils.py | 1 + python/dgl/graphbolt/subgraph_sampler.py | 193 ++++++++++++++++-- .../graphbolt/impl/test_neighbor_sampler.py | 4 +- 3 files changed, 184 insertions(+), 14 deletions(-) diff --git a/python/dgl/graphbolt/internal/sample_utils.py b/python/dgl/graphbolt/internal/sample_utils.py index e88ea0193a55..f499694d6d72 100644 --- a/python/dgl/graphbolt/internal/sample_utils.py +++ b/python/dgl/graphbolt/internal/sample_utils.py @@ -349,6 +349,7 @@ def wait(self): if is_homogeneous: compacted_csc_formats = list(compacted_csc_formats.values())[0] unique_nodes = list(unique_nodes.values())[0] + offsets = list(offsets.values())[0] return unique_nodes, compacted_csc_formats, offsets diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index e26ec3336be6..88fdd3808714 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -5,10 +5,12 @@ from typing import Dict import torch +import torch.distributed as thd from torch.utils.data import functional_datapipe from .base import seed_type_str_to_ntypes from .internal import compact_temporal_nodes, unique_and_compact +from .minibatch import MiniBatch from .minibatch_transformer import MiniBatchTransformer __all__ = [ @@ -28,6 +30,25 @@ def wait(self): return result +def _shift(inputs: list, group=None): + cutoff = len(inputs) - thd.get_rank(group) + return inputs[cutoff:] + inputs[:cutoff] + + +def all_to_all(outputs, inputs, group=None, async_op=False): + """Wrapper for thd.all_to_all that permuted outputs and inputs before + calling it. The arguments have the permutation + `rank, ..., world_size - 1, 0, ..., rank - 1` and we make it + `0, world_size - 1` before calling `thd.all_to_all`.""" + shift_fn = partial(_shift, group=group) + return thd.all_to_all(shift_fn(outputs), shift_fn(inputs), group, async_op) + + +def _revert_to_homo(d: dict): + is_homogenous = len(d) == 1 and "_N" in d + return list(d.values())[0] if is_homogenous else d + + @functional_datapipe("sample_subgraph") class SubgraphSampler(MiniBatchTransformer): """A subgraph sampler used to sample a subgraph from a given set of nodes @@ -49,8 +70,8 @@ class SubgraphSampler(MiniBatchTransformer): Arguments to be passed into sampling_stages. kwargs : Keyword Arguments Arguments to be passed into sampling_stages. Preprocessing stage makes - use of the `asynchronous` parameter before it is passed to - the sampling stages. + use of the `asynchronous` and `cooperative` parameters before they are + passed to the sampling stages. 
""" def __init__( @@ -60,10 +81,22 @@ def __init__( **kwargs, ): async_op = kwargs.get("asynchronous", False) - preprocess_fn = partial(self._preprocess, async_op=async_op) + cooperative = kwargs.get("cooperative", False) + preprocess_fn = partial( + self._preprocess, cooperative=cooperative, async_op=async_op + ) datapipe = datapipe.transform(preprocess_fn) if async_op: - datapipe = datapipe.buffer().transform(self._wait_preprocess_future) + fn = partial(self._wait_preprocess_future, cooperative=cooperative) + datapipe = datapipe.buffer().transform(fn) + if cooperative: + datapipe = datapipe.transform(self._seeds_cooperative_exchange_1) + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._seeds_cooperative_exchange_2) + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._seeds_cooperative_exchange_3) + datapipe = datapipe.buffer() + datapipe = datapipe.transform(self._seeds_cooperative_exchange_4) datapipe = self.sampling_stages(datapipe, *args, **kwargs) datapipe = datapipe.transform(self._postprocess) super().__init__(datapipe) @@ -75,12 +108,16 @@ def _postprocess(minibatch): return minibatch @staticmethod - def _preprocess(minibatch, async_op: bool): + def _preprocess(minibatch, cooperative: bool, async_op: bool): if minibatch.seeds is None: raise ValueError( f"Invalid minibatch {minibatch}: `seeds` should have a value." ) - results = SubgraphSampler._seeds_preprocess(minibatch, async_op) + rank = thd.get_rank() if cooperative else 0 + world_size = thd.get_world_size() if cooperative else 1 + results = SubgraphSampler._seeds_preprocess( + minibatch, rank, world_size, async_op + ) if async_op: minibatch._preprocess_future = results else: @@ -88,17 +125,125 @@ def _preprocess(minibatch, async_op: bool): minibatch._seed_nodes, minibatch._seeds_timestamp, minibatch.compacted_seeds, + offsets, ) = results + if cooperative: + minibatch._seeds_offsets = offsets return minibatch @staticmethod - def _wait_preprocess_future(minibatch): + def _wait_preprocess_future(minibatch, cooperative: bool): ( minibatch._seed_nodes, minibatch._seeds_timestamp, minibatch.compacted_seeds, + offsets, ) = minibatch._preprocess_future.wait() delattr(minibatch, "_preprocess_future") + if cooperative: + minibatch._seeds_offsets = offsets + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_1(minibatch, group=None): + rank = thd.get_rank(group) + world_size = thd.get_world_size(group) + assert world_size > 1 + seeds = minibatch._seed_nodes + is_homogeneous = not isinstance(seeds, dict) + if is_homogeneous: + seeds = {"_N": seeds} + if minibatch._seeds_offsets is None: + seeds_list = list(seeds.values()) + ( + sorted_seeds_list, + index_list, + offsets_list, + ) = torch.ops.graphbolt.rank_sort(seeds_list, rank, world_size) + assert minibatch.compacted_seeds is None + sorted_seeds, sorted_compacted, sorted_offsets = {}, {}, {} + num_ntypes = len(seeds.keys()) + for i, ( + seed_type, + typed_sorted_seeds, + typed_index, + typed_offsets, + ) in enumerate( + zip( + seeds.keys(), + sorted_seeds_list, + index_list, + offsets_list, + ) + ): + sorted_seeds[seed_type] = typed_sorted_seeds + sorted_compacted[seed_type] = typed_index + sorted_offsets[seed_type] = typed_offsets.tolist() + + minibatch._seed_nodes = sorted_seeds + minibatch.compacted_seeds = sorted_compacted + minibatch._seeds_offsets = sorted_offsets + else: + minibatch._seeds_offsets = {"_N": minibatch._seeds_offsets} + counts_sent = torch.empty(world_size * num_ntypes, dtype=torch.int64) + for i, offsets in 
enumerate(minibatch._seeds_offsets[0].values()): + counts_sent[ + torch.arange(i, world_size * num_ntypes, num_ntypes) + ] = offsets.diff() + delattr(minibatch, "_seeds_offsets") + counts_received = torch.empty_like(counts_sent) + minibatch._counts_future = all_to_all( + counts_received.split(num_ntypes), + counts_sent.split(num_ntypes), + group=group, + async_op=True, + ) + minibatch._counts_sent = counts_sent + minibatch._counts_received = counts_received + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_2(minibatch, group=None): + world_size = thd.get_world_size(group) + seeds = minibatch._seed_nodes + minibatch._counts_future.wait() + delattr(minibatch, "_counts_future") + counts_received = minibatch._counts_received + num_ntypes = len(seeds.keys()) + seeds_received = {} + counts_sent = {} + counts_received = {} + for i, (ntype, typed_seeds) in enumerate(seeds.items()): + idx = torch.arange(i, world_size * num_ntypes, num_ntypes) + typed_counts_sent = minibatch._counts_sent[idx].tolist() + typed_counts_received = minibatch._counts_received[idx].tolist() + typed_seeds_received = typed_seeds.new_empty( + sum(typed_counts_received) + ) + all_to_all( + typed_seeds_received.split(typed_counts_received), + typed_seeds.split(typed_counts_sent), + group, + ) + seeds_received[ntype] = typed_seeds_received + minibatch._seed_nodes = _revert_to_homo(seeds_received) + minibatch._counts_sent = _revert_to_homo(counts_sent) + minibatch._counts_received = _revert_to_homo(counts_received) + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_3(minibatch): + minibatch._unique_future = unique_and_compact( + minibatch._seed_nodes, 0, 1, async_op=True + ) + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_4(minibatch): + unique_seeds, inverse_seeds, _ = minibatch._unique_future.wait() + delattr(minibatch, "_unique_future") + minibatch._seed_nodes = _revert_to_homo(unique_seeds) + minibatch._seed_inverse_ids = _revert_to_homo(inverse_seeds) return minibatch def _sample(self, minibatch): @@ -119,7 +264,12 @@ def sampling_stages(self, datapipe): return datapipe.transform(self._sample) @staticmethod - def _seeds_preprocess(minibatch, async_op): + def _seeds_preprocess( + minibatch: MiniBatch, + rank: int = 0, + world_size: int = 1, + async_op: bool = False, + ): """Preprocess `seeds` in a minibatch to construct `unique_seeds`, `node_timestamp` and `compacted_seeds` for further sampling. It optionally incorporates timestamps for temporal graphs, organizing and @@ -130,6 +280,11 @@ def _seeds_preprocess(minibatch, async_op): ---------- minibatch: MiniBatch The minibatch. + rank : int + The rank of the current process among cooperating processes. + world_size : int + The number of cooperating + (`arXiv:2210.13339`__) processes. async_op: bool Boolean indicating whether the call is asynchronous. If so, the result can be obtained by calling wait on the returned future. @@ -145,8 +300,16 @@ def _seeds_preprocess(minibatch, async_op): compacted_seeds: torch.tensor or a Dict[str, torch.Tensor] Representation of compacted seeds corresponding to 'seeds', where all node ids inside are compacted. + offsets: None or torch.Tensor or Dict[src, torch.Tensor] + The unique nodes offsets tensor partitions the unique_nodes tensor. + Has size `world_size + 1` and + `unique_nodes[offsets[i]: offsets[i + 1]]` belongs to the rank + `(rank + i) % world_size`. 
""" use_timestamp = hasattr(minibatch, "timestamp") + assert ( + not use_timestamp or world_size == 1 + ), "Temporal code path does not currently support Cooperative Minibatching" seeds = minibatch.seeds is_heterogeneous = isinstance(seeds, Dict) if is_heterogeneous: @@ -164,7 +327,7 @@ def _seeds_preprocess(minibatch, async_op): if hasattr(minibatch, "timestamp") else None ) - result = _NoOpWaiter((seeds, nodes_timestamp, None)) + result = _NoOpWaiter((seeds, nodes_timestamp, None, None)) break result = None assert typed_seeds.ndim == 2, ( @@ -200,7 +363,7 @@ def __init__(self, nodes, nodes_timestamp, seeds): ) else: self.future = unique_and_compact( - nodes, async_op=async_op + nodes, rank, world_size, async_op ) self.seeds = seeds @@ -208,8 +371,9 @@ def wait(self): """Returns the stored value when invoked.""" if use_timestamp: unique_seeds, nodes_timestamp, compacted = self.future + offsets = None else: - unique_seeds, compacted, _ = ( + unique_seeds, compacted, offsets = ( self.future.wait() if async_op else self.future ) nodes_timestamp = None @@ -234,6 +398,7 @@ def wait(self): unique_seeds, nodes_timestamp, compacted_seeds, + offsets, ) # When typed_seeds is not a one-dimensional tensor @@ -248,7 +413,7 @@ def wait(self): if hasattr(minibatch, "timestamp") else None ) - result = _NoOpWaiter((seeds, nodes_timestamp, None)) + result = _NoOpWaiter((seeds, nodes_timestamp, None, None)) else: # Collect nodes from all types of input. nodes = [seeds.view(-1)] @@ -289,8 +454,9 @@ def wait(self): nodes_timestamp, compacted, ) = self.future + offsets = None else: - unique_seeds, compacted, _ = ( + unique_seeds, compacted, offsets = ( self.future.wait() if async_op else self.future ) nodes_timestamp = None @@ -305,6 +471,7 @@ def wait(self): unique_seeds, nodes_timestamp, compacted_seeds, + offsets, ) result = _Waiter(nodes, nodes_timestamp, seeds) diff --git a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py index 4a095ac1b10d..5326e620b0c3 100644 --- a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py +++ b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py @@ -65,7 +65,9 @@ def test_NeighborSampler_GraphFetch( graph.type_per_edge = None item_sampler = gb.ItemSampler(itemset, batch_size=2).copy_to(F.ctx()) fanout = torch.LongTensor([2]) - preprocess_fn = partial(gb.SubgraphSampler._preprocess, async_op=False) + preprocess_fn = partial( + gb.SubgraphSampler._preprocess, cooperative=False, async_op=False + ) datapipe = item_sampler.map(preprocess_fn) datapipe = datapipe.map( partial(gb.NeighborSampler._prepare, graph.node_type_to_id) From 165e2507e78bb50760d6a9e16cf2dc31b986c27a Mon Sep 17 00:00:00 2001 From: Wenxuan Cao <90617523+CfromBU@users.noreply.github.com> Date: Thu, 12 Sep 2024 16:55:33 +0800 Subject: [PATCH 02/10] [DistGB] modify partition.py to save FusedCSCSamplingGraph directly (#7727) Co-authored-by: Ubuntu Co-authored-by: Ubuntu Co-authored-by: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> --- python/dgl/distributed/partition.py | 535 +++++++++++++++++++--------- 1 file changed, 371 insertions(+), 164 deletions(-) diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index 08343df5505a..079ed8806a96 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -88,24 +88,26 @@ def _dump_part_config(part_config, part_metadata): json.dump(part_metadata, outfile, sort_keys=False, indent=4) -def 
_save_graphs(filename, g_list, formats=None, sort_etypes=False): +def process_partitions(g, formats=None, sort_etypes=False): """Preprocess partitions before saving: 1. format data types. 2. sort csc/csr by tag. """ - for g in g_list: - for k, dtype in RESERVED_FIELD_DTYPE.items(): - if k in g.ndata: - g.ndata[k] = F.astype(g.ndata[k], dtype) - if k in g.edata: - g.edata[k] = F.astype(g.edata[k], dtype) - for g in g_list: - if (not sort_etypes) or (formats is None): - continue + for k, dtype in RESERVED_FIELD_DTYPE.items(): + if k in g.ndata: + g.ndata[k] = F.astype(g.ndata[k], dtype) + if k in g.edata: + g.edata[k] = F.astype(g.edata[k], dtype) + + if (sort_etypes) and (formats is not None): if "csr" in formats: g = sort_csr_by_tag(g, tag=g.edata[ETYPE], tag_type="edge") if "csc" in formats: g = sort_csc_by_tag(g, tag=g.edata[ETYPE], tag_type="edge") + return g + + +def _save_dgl_graphs(filename, g_list, formats=None): save_graphs(filename, g_list, formats=formats) @@ -332,9 +334,10 @@ def load_partition(part_config, part_id, load_feats=True, use_graphbolt=False): "part-{}".format(part_id) in part_metadata ), "part-{} does not exist".format(part_id) part_files = part_metadata["part-{}".format(part_id)] - part_graph_field = "part_graph" if use_graphbolt: part_graph_field = "part_graph_graphbolt" + else: + part_graph_field = "part_graph" assert ( part_graph_field in part_files ), f"the partition does not contain graph structure: {part_graph_field}" @@ -461,7 +464,7 @@ def load_partition_feats( return node_feats, edge_feats -def load_partition_book(part_config, part_id): +def load_partition_book(part_config, part_id, part_metadata=None): """Load a graph partition book from the partition config file. Parameters @@ -470,6 +473,8 @@ def load_partition_book(part_config, part_id): The path of the partition config file. part_id : int The partition ID. + part_metadata : dict + The meta data of partition. Returns ------- @@ -482,7 +487,8 @@ def load_partition_book(part_config, part_id): dict The edge types """ - part_metadata = _load_part_config(part_config) + if part_metadata is None: + part_metadata = _load_part_config(part_config) assert "num_parts" in part_metadata, "num_parts does not exist." 
assert ( part_metadata["num_parts"] > part_id @@ -666,6 +672,38 @@ def _set_trainer_ids(g, sim_g, node_parts): g.edges[c_etype].data["trainer_id"] = trainer_id +def _partition_to_graphbolt( + parts, + part_i, + part_config, + part_metadata, + *, + store_eids=True, + store_inner_node=False, + store_inner_edge=False, + graph_formats=None, +): + gpb, _, ntypes, etypes = load_partition_book( + part_config=part_config, part_id=part_i, part_metadata=part_metadata + ) + graph = parts[part_i] + csc_graph = gb_convert_single_dgl_partition( + ntypes=ntypes, + etypes=etypes, + gpb=gpb, + part_meta=part_metadata, + graph=graph, + store_eids=store_eids, + store_inner_edge=store_inner_edge, + store_inner_node=store_inner_node, + graph_formats=graph_formats, + ) + rel_path_result = _save_graph_gb( + part_config=part_config, part_id=part_i, csc_graph=csc_graph + ) + part_metadata[f"part-{part_i}"]["part_graph_graphbolt"] = rel_path_result + + def _update_node_edge_map(node_map_val, edge_map_val, g, num_parts): """ If the original graph contains few nodes or edges for specific node/edge @@ -1303,6 +1341,7 @@ def get_homogeneous(g, balance_ntypes): "ntypes": ntypes, "etypes": etypes, } + part_config = os.path.join(out_path, graph_name + ".json") for part_id in range(num_parts): part = parts[part_id] @@ -1425,30 +1464,54 @@ def get_homogeneous(g, balance_ntypes): part_dir = os.path.join(out_path, "part" + str(part_id)) node_feat_file = os.path.join(part_dir, "node_feat.dgl") edge_feat_file = os.path.join(part_dir, "edge_feat.dgl") - part_graph_file = os.path.join(part_dir, "graph.dgl") - part_metadata["part-{}".format(part_id)] = { - "node_feats": os.path.relpath(node_feat_file, out_path), - "edge_feats": os.path.relpath(edge_feat_file, out_path), - "part_graph": os.path.relpath(part_graph_file, out_path), - } + os.makedirs(part_dir, mode=0o775, exist_ok=True) save_tensors(node_feat_file, node_feats) save_tensors(edge_feat_file, edge_feats) + part_metadata["part-{}".format(part_id)] = { + "node_feats": os.path.relpath(node_feat_file, out_path), + "edge_feats": os.path.relpath(edge_feat_file, out_path), + } sort_etypes = len(g.etypes) > 1 - _save_graphs( - part_graph_file, - [part], - formats=graph_formats, - sort_etypes=sort_etypes, - ) - print( - "Save partitions: {:.3f} seconds, peak memory: {:.3f} GB".format( - time.time() - start, get_peak_mem() - ) - ) + part = process_partitions(part, graph_formats, sort_etypes) + + # transmit to graphbolt and save graph + if use_graphbolt: + # save FusedCSCSamplingGraph + kwargs["graph_formats"] = graph_formats + n_jobs = kwargs.pop("n_jobs", 1) + mp_ctx = mp.get_context("spawn") + with concurrent.futures.ProcessPoolExecutor( # pylint: disable=unexpected-keyword-arg + max_workers=min(num_parts, n_jobs), + mp_context=mp_ctx, + ) as executor: + for part_id in range(num_parts): + executor.submit( + _partition_to_graphbolt( + part_i=part_id, + part_config=part_config, + part_metadata=part_metadata, + parts=parts, + **kwargs, + ) + ) + part_metadata["node_map_dtype"] = "int64" + part_metadata["edge_map_dtype"] = "int64" + else: + for part_id, part in parts.items(): + part_dir = os.path.join(out_path, "part" + str(part_id)) + part_graph_file = os.path.join(part_dir, "graph.dgl") + part_metadata["part-{}".format(part_id)][ + "part_graph" + ] = os.path.relpath(part_graph_file, out_path) + # save DGLGraph + _save_dgl_graphs( + part_graph_file, + [part], + formats=graph_formats, + ) - part_config = os.path.join(out_path, graph_name + ".json") _dump_part_config(part_config, 
part_metadata) num_cuts = sim_g.num_edges() - tot_num_inner_edges @@ -1460,12 +1523,11 @@ def get_homogeneous(g, balance_ntypes): ) ) - if use_graphbolt: - kwargs["graph_formats"] = graph_formats - dgl_partition_to_graphbolt( - part_config, - **kwargs, + print( + "Save partitions: {:.3f} seconds, peak memory: {:.3f} GB".format( + time.time() - start, get_peak_mem() ) + ) if return_mapping: return orig_nids, orig_eids @@ -1513,20 +1575,142 @@ def init_type_per_edge(graph, gpb): return etype_ids -def gb_convert_single_dgl_partition( - part_id, - graph_formats, - part_config, - store_eids, +def _load_part(part_config, part_id, parts=None): + """load parts from variable or dist.""" + if parts is None: + graph, _, _, _, _, _, _ = load_partition( + part_config, part_id, load_feats=False + ) + else: + graph = parts[part_id] + return graph + + +def _save_graph_gb(part_config, part_id, csc_graph): + csc_graph_save_dir = os.path.join( + os.path.dirname(part_config), + f"part{part_id}", + ) + csc_graph_path = os.path.join( + csc_graph_save_dir, "fused_csc_sampling_graph.pt" + ) + torch.save(csc_graph, csc_graph_path) + + return os.path.relpath(csc_graph_path, os.path.dirname(part_config)) + + +def cast_various_to_minimum_dtype_gb( + graph, + part_meta, + num_parts, + indptr, + indices, + type_per_edge, + etypes, + ntypes, + node_attributes, + edge_attributes, +): + """Cast various data to minimum dtype.""" + # Cast 1: indptr. + indptr = _cast_to_minimum_dtype(graph.num_edges(), indptr) + # Cast 2: indices. + indices = _cast_to_minimum_dtype(graph.num_nodes(), indices) + # Cast 3: type_per_edge. + type_per_edge = _cast_to_minimum_dtype( + len(etypes), type_per_edge, field=ETYPE + ) + # Cast 4: node/edge_attributes. + predicates = { + NID: part_meta["num_nodes"], + "part_id": num_parts, + NTYPE: len(ntypes), + EID: part_meta["num_edges"], + ETYPE: len(etypes), + DGL2GB_EID: part_meta["num_edges"], + GB_DST_ID: part_meta["num_nodes"], + } + for attributes in [node_attributes, edge_attributes]: + for key in attributes: + if key not in predicates: + continue + attributes[key] = _cast_to_minimum_dtype( + predicates[key], attributes[key], field=key + ) + return indptr, indices, type_per_edge + + +def _create_attributes_gb( + graph, + gpb, + edge_ids, + is_homo, store_inner_node, store_inner_edge, + store_eids, + debug_mode, +): + # Save node attributes. Detailed attributes are shown below. + # DGL_GB\Attributes dgl.NID("_ID") dgl.NTYPE("_TYPE") "inner_node" "part_id" + # DGL_Homograph ✅ 🚫 ✅ ✅ + # GB_Homograph ✅ 🚫 optional 🚫 + # DGL_Heterograph ✅ ✅ ✅ ✅ + # GB_Heterograph ✅ 🚫 optional 🚫 + required_node_attrs = [NID] + if store_inner_node: + required_node_attrs.append("inner_node") + if debug_mode: + required_node_attrs = list(graph.ndata.keys()) + node_attributes = {attr: graph.ndata[attr] for attr in required_node_attrs} + + # Save edge attributes. Detailed attributes are shown below. 
+ # DGL_GB\Attributes dgl.EID("_ID") dgl.ETYPE("_TYPE") "inner_edge" + # DGL_Homograph ✅ 🚫 ✅ + # GB_Homograph optional 🚫 optional + # DGL_Heterograph ✅ ✅ ✅ + # GB_Heterograph optional ✅ optional + type_per_edge = None + if not is_homo: + type_per_edge = init_type_per_edge(graph, gpb)[edge_ids] + type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE]) + required_edge_attrs = [] + if store_eids: + required_edge_attrs.append(EID) + if store_inner_edge: + required_edge_attrs.append("inner_edge") + if debug_mode: + required_edge_attrs = list(graph.edata.keys()) + edge_attributes = { + attr: graph.edata[attr][edge_ids] for attr in required_edge_attrs + } + return node_attributes, edge_attributes, type_per_edge + + +def gb_convert_single_dgl_partition( + ntypes, + etypes, + gpb, + part_meta, + graph, + graph_formats=None, + store_eids=False, + store_inner_node=False, + store_inner_edge=False, ): """Converts a single DGL partition to GraphBolt. Parameters ---------- - part_id : int - The numerical ID of the partition to convert. + node types : dict + The node types + edge types : dict + The edge types + gpb : GraphPartitionBook + The global partition information. + part_meta : dict + Contain the meta data of the partition. + graph : DGLGraph + The graph to be converted to graphbolt graph. graph_formats : str or list[str], optional Save partitions in specified formats. It could be any combination of `coo`, `csc`. As `csc` format is mandatory for `FusedCSCSamplingGraph`, @@ -1547,14 +1731,8 @@ def gb_convert_single_dgl_partition( "Running in debug mode which means all attributes of DGL partitions" " will be saved to the new format." ) - - part_meta = _load_part_config(part_config) num_parts = part_meta["num_parts"] - graph, _, _, gpb, _, _, _ = load_partition( - part_config, part_id, load_feats=False - ) - _, _, ntypes, etypes = load_partition_book(part_config, part_id) is_homo = is_homogeneous(ntypes, etypes) node_type_to_id = ( None if is_homo else {ntype: ntid for ntid, ntype in enumerate(ntypes)} @@ -1569,39 +1747,16 @@ def gb_convert_single_dgl_partition( # Obtain CSC indtpr and indices. indptr, indices, edge_ids = graph.adj_tensors("csc") - # Save node attributes. Detailed attributes are shown below. - # DGL_GB\Attributes dgl.NID("_ID") dgl.NTYPE("_TYPE") "inner_node" "part_id" - # DGL_Homograph ✅ 🚫 ✅ ✅ - # GB_Homograph ✅ 🚫 optional 🚫 - # DGL_Heterograph ✅ ✅ ✅ ✅ - # GB_Heterograph ✅ 🚫 optional 🚫 - required_node_attrs = [NID] - if store_inner_node: - required_node_attrs.append("inner_node") - if debug_mode: - required_node_attrs = list(graph.ndata.keys()) - node_attributes = {attr: graph.ndata[attr] for attr in required_node_attrs} - - # Save edge attributes. Detailed attributes are shown below. 
- # DGL_GB\Attributes dgl.EID("_ID") dgl.ETYPE("_TYPE") "inner_edge" - # DGL_Homograph ✅ 🚫 ✅ - # GB_Homograph optional 🚫 optional - # DGL_Heterograph ✅ ✅ ✅ - # GB_Heterograph optional ✅ optional - type_per_edge = None - if not is_homo: - type_per_edge = init_type_per_edge(graph, gpb)[edge_ids] - type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE]) - required_edge_attrs = [] - if store_eids: - required_edge_attrs.append(EID) - if store_inner_edge: - required_edge_attrs.append("inner_edge") - if debug_mode: - required_edge_attrs = list(graph.edata.keys()) - edge_attributes = { - attr: graph.edata[attr][edge_ids] for attr in required_edge_attrs - } + node_attributes, edge_attributes, type_per_edge = _create_attributes_gb( + graph, + gpb, + edge_ids, + is_homo, + store_inner_node, + store_inner_edge, + store_eids, + debug_mode, + ) # When converting DGLGraph to FusedCSCSamplingGraph, edge IDs are # re-ordered(actually FusedCSCSamplingGraph does not have edge IDs # in nature). So we need to save such re-order info for any @@ -1623,32 +1778,18 @@ def gb_convert_single_dgl_partition( indptr, dtype=indices.dtype ) - # Cast various data to minimum dtype. - # Cast 1: indptr. - indptr = _cast_to_minimum_dtype(graph.num_edges(), indptr) - # Cast 2: indices. - indices = _cast_to_minimum_dtype(graph.num_nodes(), indices) - # Cast 3: type_per_edge. - type_per_edge = _cast_to_minimum_dtype( - len(etypes), type_per_edge, field=ETYPE + indptr, indices, type_per_edge = cast_various_to_minimum_dtype_gb( + graph, + part_meta, + num_parts, + indptr, + indices, + type_per_edge, + etypes, + ntypes, + node_attributes, + edge_attributes, ) - # Cast 4: node/edge_attributes. - predicates = { - NID: part_meta["num_nodes"], - "part_id": num_parts, - NTYPE: len(ntypes), - EID: part_meta["num_edges"], - ETYPE: len(etypes), - DGL2GB_EID: part_meta["num_edges"], - GB_DST_ID: part_meta["num_nodes"], - } - for attributes in [node_attributes, edge_attributes]: - for key in attributes: - if key not in predicates: - continue - attributes[key] = _cast_to_minimum_dtype( - predicates[key], attributes[key], field=key - ) csc_graph = gb.fused_csc_sampling_graph( indptr, @@ -1660,17 +1801,128 @@ def gb_convert_single_dgl_partition( node_type_to_id=node_type_to_id, edge_type_to_id=edge_type_to_id, ) - orig_graph_path = os.path.join( - os.path.dirname(part_config), - part_meta[f"part-{part_id}"]["part_graph"], + return csc_graph + + +def _convert_partition_to_graphbolt( + part_config, + part_id, + graph_formats=None, + store_eids=False, + store_inner_node=False, + store_inner_edge=False, +): + """ + The pipeline converting signle partition to graphbolt. + + Parameters + ---------- + part_config : str + The path of the partition config file. + part_id : int + The partition ID. + graph_formats : str or list[str], optional + Save partitions in specified formats. It could be any combination of + `coo`, `csc`. As `csc` format is mandatory for `FusedCSCSamplingGraph`, + it is not necessary to specify this argument. It's mainly for + specifying `coo` format to save edge ID mapping and destination node + IDs. If not specified, whether to save `coo` format is determined by + the availability of the format in DGL partitions. Default: None. + store_eids : bool, optional + Whether to store edge IDs in the new graph. Default: True. + store_inner_node : bool, optional + Whether to store inner node mask in the new graph. Default: False. + store_inner_edge : bool, optional + Whether to store inner edge mask in the new graph. Default: False. 
+ + Returns + ------- + str + The path csc_graph to save. + """ + gpb, _, ntypes, etypes = load_partition_book( + part_config=part_config, part_id=part_id ) - csc_graph_path = os.path.join( - os.path.dirname(orig_graph_path), "fused_csc_sampling_graph.pt" + part = _load_part(part_config, part_id) + part_meta = copy.deepcopy(_load_part_config(part_config)) + csc_graph = gb_convert_single_dgl_partition( + graph=part, + ntypes=ntypes, + etypes=etypes, + gpb=gpb, + part_meta=part_meta, + graph_formats=graph_formats, + store_eids=store_eids, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, ) - torch.save(csc_graph, csc_graph_path) + rel_path = _save_graph_gb(part_config, part_id, csc_graph) + return rel_path - return os.path.relpath(csc_graph_path, os.path.dirname(part_config)) - # Update graph path. + +def _convert_partition_to_graphbolt_wrapper( + graph_formats, + part_config, + store_eids, + store_inner_node, + store_inner_edge, + n_jobs, + num_parts, +): + # [Rui] DGL partitions are always saved as homogeneous graphs even though + # the original graph is heterogeneous. But heterogeneous information like + # node/edge types are saved as node/edge data alongside with partitions. + # What needs more attention is that due to the existence of HALO nodes in + # each partition, the local node IDs are not sorted according to the node + # types. So we fail to assign ``node_type_offset`` as required by GraphBolt. + # But this is not a problem since such information is not used in sampling. + # We can simply pass None to it. + + # Iterate over partitions. + convert_with_format = partial( + _convert_partition_to_graphbolt, + part_config=part_config, + graph_formats=graph_formats, + store_eids=store_eids, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, + ) + # Need to create entirely new interpreters, because we call C++ downstream + # See https://docs.python.org/3.12/library/multiprocessing.html#contexts-and-start-methods + # and https://pybind11.readthedocs.io/en/stable/advanced/misc.html#global-interpreter-lock-gil + rel_path_results = [] + if n_jobs > 1 and num_parts > 1: + mp_ctx = mp.get_context("spawn") + with concurrent.futures.ProcessPoolExecutor( # pylint: disable=unexpected-keyword-arg + max_workers=min(num_parts, n_jobs), + mp_context=mp_ctx, + ) as executor: + for part_id in range(num_parts): + rel_path_results.append( + executor.submit( + convert_with_format, part_id=part_id + ).result() + ) + + else: + # If running single-threaded, avoid spawning new interpreter, which is slow + for part_id in range(num_parts): + rel_path = convert_with_format(part_id=part_id) + rel_path_results.append(rel_path) + part_meta = _load_part_config(part_config) + for part_id in range(num_parts): + # Update graph path. + part_meta[f"part-{part_id}"]["part_graph_graphbolt"] = rel_path_results[ + part_id + ] + + # Save dtype info into partition config. + # [TODO][Rui] Always use int64_t for node/edge IDs in GraphBolt. See more + # details in #7175. + part_meta["node_map_dtype"] = "int64" + part_meta["edge_map_dtype"] = "int64" + + return part_meta def dgl_partition_to_graphbolt( @@ -1719,59 +1971,14 @@ def dgl_partition_to_graphbolt( " will be saved to the new format." ) part_meta = _load_part_config(part_config) - new_part_meta = copy.deepcopy(part_meta) num_parts = part_meta["num_parts"] - - # [Rui] DGL partitions are always saved as homogeneous graphs even though - # the original graph is heterogeneous. 
But heterogeneous information like - # node/edge types are saved as node/edge data alongside with partitions. - # What needs more attention is that due to the existence of HALO nodes in - # each partition, the local node IDs are not sorted according to the node - # types. So we fail to assign ``node_type_offset`` as required by GraphBolt. - # But this is not a problem since such information is not used in sampling. - # We can simply pass None to it. - - # Iterate over partitions. - convert_with_format = partial( - gb_convert_single_dgl_partition, + part_meta = _convert_partition_to_graphbolt_wrapper( graph_formats=graph_formats, part_config=part_config, store_eids=store_eids, store_inner_node=store_inner_node, store_inner_edge=store_inner_edge, + n_jobs=n_jobs, + num_parts=num_parts, ) - # Need to create entirely new interpreters, because we call C++ downstream - # See https://docs.python.org/3.12/library/multiprocessing.html#contexts-and-start-methods - # and https://pybind11.readthedocs.io/en/stable/advanced/misc.html#global-interpreter-lock-gil - rel_path_results = [] - if n_jobs > 1 and num_parts > 1: - mp_ctx = mp.get_context("spawn") - with concurrent.futures.ProcessPoolExecutor( # pylint: disable=unexpected-keyword-arg - max_workers=min(num_parts, n_jobs), - mp_context=mp_ctx, - ) as executor: - futures = [] - for part_id in range(num_parts): - futures.append(executor.submit(convert_with_format, part_id)) - - for part_id in range(num_parts): - rel_path_results.append(futures[part_id].result()) - else: - # If running single-threaded, avoid spawning new interpreter, which is slow - for part_id in range(num_parts): - rel_path_results.append(convert_with_format(part_id)) - - for part_id in range(num_parts): - # Update graph path. - new_part_meta[f"part-{part_id}"][ - "part_graph_graphbolt" - ] = rel_path_results[part_id] - - # Save dtype info into partition config. - # [TODO][Rui] Always use int64_t for node/edge IDs in GraphBolt. See more - # details in #7175. - new_part_meta["node_map_dtype"] = "int64" - new_part_meta["edge_map_dtype"] = "int64" - - _dump_part_config(part_config, new_part_meta) - print(f"Converted partitions to GraphBolt format into {part_config}") + _dump_part_config(part_config, part_meta) From 189b83c28c702aecce512629d4c1d6d4a538f032 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 12 Sep 2024 17:12:31 -0400 Subject: [PATCH 03/10] [GraphBolt][CUDA] Get `world_size=1` somewhat for cooperative sampling. 
(#7796) --- python/dgl/graphbolt/impl/neighbor_sampler.py | 134 ++++++++++++++++-- python/dgl/graphbolt/subgraph_sampler.py | 94 ++++++++---- .../pytorch/graphbolt/test_dataloader.py | 27 +++- 3 files changed, 214 insertions(+), 41 deletions(-) diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 6ba83941bbde..352dedc067f2 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -3,6 +3,7 @@ from functools import partial import torch +import torch.distributed as thd from torch.utils.data import functional_datapipe from torch.utils.data.datapipes.iter import Mapper @@ -12,10 +13,14 @@ index_select, ORIGINAL_EDGE_ID, ) -from ..internal import compact_csc_format, unique_and_compact_csc_formats +from ..internal import ( + compact_csc_format, + unique_and_compact, + unique_and_compact_csc_formats, +) from ..minibatch_transformer import MiniBatchTransformer -from ..subgraph_sampler import SubgraphSampler +from ..subgraph_sampler import all_to_all, revert_to_homo, SubgraphSampler from .fused_csc_sampling_graph import fused_csc_sampling_graph from .sampled_subgraph_impl import SampledSubgraphImpl @@ -455,12 +460,32 @@ def _subtract_hetero_indices_offset( class CompactPerLayer(MiniBatchTransformer): """Compact the sampled edges for a single layer.""" - def __init__(self, datapipe, deduplicate, asynchronous=False): + def __init__( + self, datapipe, deduplicate, cooperative=False, asynchronous=False + ): self.deduplicate = deduplicate + self.cooperative = cooperative if asynchronous and deduplicate: datapipe = datapipe.transform(self._compact_per_layer_async) datapipe = datapipe.buffer() - super().__init__(datapipe, self._compact_per_layer_wait_future) + datapipe = datapipe.transform(self._compact_per_layer_wait_future) + if cooperative: + datapipe = datapipe.transform( + self._seeds_cooperative_exchange_1 + ) + datapipe = datapipe.buffer() + datapipe = datapipe.transform( + self._seeds_cooperative_exchange_2 + ) + datapipe = datapipe.buffer() + datapipe = datapipe.transform( + self._seeds_cooperative_exchange_3 + ) + datapipe = datapipe.buffer() + datapipe = datapipe.transform( + self._seeds_cooperative_exchange_4 + ) + super().__init__(datapipe) else: super().__init__(datapipe, self._compact_per_layer) @@ -498,19 +523,20 @@ def _compact_per_layer_async(self, minibatch): subgraph = minibatch.sampled_subgraphs[0] seeds = minibatch._seed_nodes assert self.deduplicate + rank = thd.get_rank() if self.cooperative else 0 + world_size = thd.get_world_size() if self.cooperative else 1 minibatch._future = unique_and_compact_csc_formats( - subgraph.sampled_csc, seeds, async_op=True + subgraph.sampled_csc, seeds, rank, world_size, async_op=True ) return minibatch - @staticmethod - def _compact_per_layer_wait_future(minibatch): + def _compact_per_layer_wait_future(self, minibatch): subgraph = minibatch.sampled_subgraphs[0] seeds = minibatch._seed_nodes ( original_row_node_ids, compacted_csc_format, - _, + seeds_offsets, ) = minibatch._future.wait() delattr(minibatch, "_future") subgraph = SampledSubgraphImpl( @@ -521,6 +547,87 @@ def _compact_per_layer_wait_future(minibatch): ) minibatch._seed_nodes = original_row_node_ids minibatch.sampled_subgraphs[0] = subgraph + if self.cooperative: + subgraph._seeds_offsets = seeds_offsets + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_1(minibatch): + world_size = thd.get_world_size() + subgraph = minibatch.sampled_subgraphs[0] + 
seeds_offsets = subgraph._seeds_offsets + is_homogeneous = not isinstance(seeds_offsets, dict) + if is_homogeneous: + seeds_offsets = {"_N": seeds_offsets} + num_ntypes = len(seeds_offsets) + counts_sent = torch.empty(world_size * num_ntypes, dtype=torch.int64) + for i, offsets in enumerate(seeds_offsets.values()): + counts_sent[ + torch.arange(i, world_size * num_ntypes, num_ntypes) + ] = offsets.diff() + counts_received = torch.empty_like(counts_sent) + subgraph._counts_future = all_to_all( + counts_received.split(num_ntypes), + counts_sent.split(num_ntypes), + async_op=True, + ) + subgraph._counts_sent = counts_sent + subgraph._counts_received = counts_received + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_2(minibatch): + world_size = thd.get_world_size() + seeds = minibatch._seed_nodes + is_homogenous = not isinstance(seeds, dict) + if is_homogenous: + seeds = {"_N": seeds} + subgraph = minibatch.sampled_subgraphs[0] + subgraph._counts_future.wait() + delattr(subgraph, "_counts_future") + num_ntypes = len(seeds.keys()) + seeds_received = {} + counts_sent = {} + counts_received = {} + for i, (ntype, typed_seeds) in enumerate(seeds.items()): + idx = torch.arange(i, world_size * num_ntypes, num_ntypes) + typed_counts_sent = subgraph._counts_sent[idx].tolist() + typed_counts_received = subgraph._counts_received[idx].tolist() + typed_seeds_received = typed_seeds.new_empty( + sum(typed_counts_received) + ) + all_to_all( + typed_seeds_received.split(typed_counts_received), + typed_seeds.split(typed_counts_sent), + ) + seeds_received[ntype] = typed_seeds_received + subgraph._seeds_received = seeds_received + subgraph._counts_sent = revert_to_homo(counts_sent) + subgraph._counts_received = revert_to_homo(counts_received) + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_3(minibatch): + subgraph = minibatch.sampled_subgraphs[0] + nodes = { + ntype: [typed_seeds] + for ntype, typed_seeds in subgraph._seeds_received.items() + } + minibatch._unique_future = unique_and_compact( + nodes, 0, 1, async_op=True + ) + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_4(minibatch): + unique_seeds, inverse_seeds, _ = minibatch._unique_future.wait() + delattr(minibatch, "_unique_future") + inverse_seeds = { + ntype: typed_inv[0] for ntype, typed_inv in inverse_seeds.items() + } + minibatch._seed_nodes = revert_to_homo(unique_seeds) + subgraph = minibatch.sampled_subgraphs[0] + subgraph._seed_inverse_ids = revert_to_homo(inverse_seeds) return minibatch @@ -541,6 +648,7 @@ def __init__( overlap_fetch, num_gpu_cached_edges, gpu_cache_threshold, + cooperative, asynchronous, layer_dependency=None, batch_dependency=None, @@ -561,6 +669,7 @@ def __init__( deduplicate, sampler, overlap_fetch, + cooperative=cooperative, asynchronous=asynchronous, layer_dependency=layer_dependency, ) @@ -637,6 +746,7 @@ def sampling_stages( deduplicate, sampler, overlap_fetch, + cooperative, asynchronous, layer_dependency, ): @@ -653,7 +763,9 @@ def sampling_stages( datapipe = datapipe.sample_per_layer( sampler, fanout, replace, prob_name, overlap_fetch, asynchronous ) - datapipe = datapipe.compact_per_layer(deduplicate, asynchronous) + datapipe = datapipe.compact_per_layer( + deduplicate, cooperative, asynchronous + ) if is_labor and not layer_dependency: datapipe = datapipe.transform(self._increment_seed) if is_labor: @@ -775,6 +887,7 @@ def __init__( overlap_fetch=False, num_gpu_cached_edges=0, gpu_cache_threshold=1, + cooperative=False, asynchronous=False, ): 
super().__init__( @@ -788,6 +901,7 @@ def __init__( overlap_fetch, num_gpu_cached_edges, gpu_cache_threshold, + cooperative, asynchronous, ) @@ -937,6 +1051,7 @@ def __init__( overlap_fetch=False, num_gpu_cached_edges=0, gpu_cache_threshold=1, + cooperative=False, asynchronous=False, ): super().__init__( @@ -950,6 +1065,7 @@ def __init__( overlap_fetch, num_gpu_cached_edges, gpu_cache_threshold, + cooperative, asynchronous, layer_dependency, batch_dependency, diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index 88fdd3808714..556950982fb7 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -15,6 +15,8 @@ __all__ = [ "SubgraphSampler", + "all_to_all", + "revert_to_homo", ] @@ -41,10 +43,48 @@ def all_to_all(outputs, inputs, group=None, async_op=False): `rank, ..., world_size - 1, 0, ..., rank - 1` and we make it `0, world_size - 1` before calling `thd.all_to_all`.""" shift_fn = partial(_shift, group=group) - return thd.all_to_all(shift_fn(outputs), shift_fn(inputs), group, async_op) - - -def _revert_to_homo(d: dict): + outputs = shift_fn(list(outputs)) + inputs = shift_fn(list(inputs)) + if outputs[0].is_cuda: + return thd.all_to_all(outputs, inputs, group, async_op) + # gloo backend will be used. + outputs_single = torch.cat(outputs) + output_split_sizes = [o.size(0) for o in outputs] + handle = thd.all_to_all_single( + outputs_single, + torch.cat(inputs), + output_split_sizes, + [i.size(0) for i in inputs], + group, + async_op, + ) + temp_outputs = outputs_single.split(output_split_sizes) + + class _Waiter: + def __init__(self, handle, outputs, temp_outputs): + self.handle = handle + self.outputs = outputs + self.temp_outputs = temp_outputs + + def wait(self): + """Returns the stored value when invoked.""" + handle = self.handle + outputs = self.outputs + temp_outputs = self.temp_outputs + # Ensure that there is no leak + self.handle = self.outputs = self.temp_outputs = None + + if handle is not None: + handle.wait() + for output, temp_output in zip(outputs, temp_outputs): + output.copy_(temp_output) + + post_processor = _Waiter(handle, outputs, temp_outputs) + return post_processor if async_op else post_processor.wait() + + +def revert_to_homo(d: dict): + """Utility function to convert a dictionary that stores homogenous data.""" is_homogenous = len(d) == 1 and "_N" in d return list(d.values())[0] if is_homogenous else d @@ -148,45 +188,31 @@ def _wait_preprocess_future(minibatch, cooperative: bool): def _seeds_cooperative_exchange_1(minibatch, group=None): rank = thd.get_rank(group) world_size = thd.get_world_size(group) - assert world_size > 1 seeds = minibatch._seed_nodes is_homogeneous = not isinstance(seeds, dict) if is_homogeneous: seeds = {"_N": seeds} if minibatch._seeds_offsets is None: seeds_list = list(seeds.values()) - ( - sorted_seeds_list, - index_list, - offsets_list, - ) = torch.ops.graphbolt.rank_sort(seeds_list, rank, world_size) + result = torch.ops.graphbolt.rank_sort(seeds_list, rank, world_size) assert minibatch.compacted_seeds is None sorted_seeds, sorted_compacted, sorted_offsets = {}, {}, {} num_ntypes = len(seeds.keys()) for i, ( seed_type, - typed_sorted_seeds, - typed_index, - typed_offsets, - ) in enumerate( - zip( - seeds.keys(), - sorted_seeds_list, - index_list, - offsets_list, - ) - ): + (typed_sorted_seeds, typed_index, typed_offsets), + ) in enumerate(zip(seeds.keys(), result)): sorted_seeds[seed_type] = typed_sorted_seeds sorted_compacted[seed_type] = 
typed_index - sorted_offsets[seed_type] = typed_offsets.tolist() + sorted_offsets[seed_type] = typed_offsets minibatch._seed_nodes = sorted_seeds - minibatch.compacted_seeds = sorted_compacted + minibatch.compacted_seeds = revert_to_homo(sorted_compacted) minibatch._seeds_offsets = sorted_offsets else: minibatch._seeds_offsets = {"_N": minibatch._seeds_offsets} counts_sent = torch.empty(world_size * num_ntypes, dtype=torch.int64) - for i, offsets in enumerate(minibatch._seeds_offsets[0].values()): + for i, offsets in enumerate(minibatch._seeds_offsets.values()): counts_sent[ torch.arange(i, world_size * num_ntypes, num_ntypes) ] = offsets.diff() @@ -208,7 +234,6 @@ def _seeds_cooperative_exchange_2(minibatch, group=None): seeds = minibatch._seed_nodes minibatch._counts_future.wait() delattr(minibatch, "_counts_future") - counts_received = minibatch._counts_received num_ntypes = len(seeds.keys()) seeds_received = {} counts_sent = {} @@ -226,15 +251,19 @@ def _seeds_cooperative_exchange_2(minibatch, group=None): group, ) seeds_received[ntype] = typed_seeds_received - minibatch._seed_nodes = _revert_to_homo(seeds_received) - minibatch._counts_sent = _revert_to_homo(counts_sent) - minibatch._counts_received = _revert_to_homo(counts_received) + minibatch._seed_nodes = seeds_received + minibatch._counts_sent = revert_to_homo(counts_sent) + minibatch._counts_received = revert_to_homo(counts_received) return minibatch @staticmethod def _seeds_cooperative_exchange_3(minibatch): + nodes = { + ntype: [typed_seeds] + for ntype, typed_seeds in minibatch._seed_nodes.items() + } minibatch._unique_future = unique_and_compact( - minibatch._seed_nodes, 0, 1, async_op=True + nodes, 0, 1, async_op=True ) return minibatch @@ -242,8 +271,11 @@ def _seeds_cooperative_exchange_3(minibatch): def _seeds_cooperative_exchange_4(minibatch): unique_seeds, inverse_seeds, _ = minibatch._unique_future.wait() delattr(minibatch, "_unique_future") - minibatch._seed_nodes = _revert_to_homo(unique_seeds) - minibatch._seed_inverse_ids = _revert_to_homo(inverse_seeds) + inverse_seeds = { + ntype: typed_inv[0] for ntype, typed_inv in inverse_seeds.items() + } + minibatch._seed_nodes = revert_to_homo(unique_seeds) + minibatch._seed_inverse_ids = revert_to_homo(inverse_seeds) return minibatch def _sample(self, minibatch): diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index 666ab352d29c..b02c820dd60d 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -1,4 +1,6 @@ +import os import unittest +from sys import platform import backend as F @@ -6,6 +8,7 @@ import dgl.graphbolt import pytest import torch +import torch.distributed as thd from dgl.graphbolt.datapipes import find_dps, traverse_dps @@ -63,6 +66,7 @@ def test_DataLoader(overlap_feature_fetch): @pytest.mark.parametrize("enable_feature_fetch", [True, False]) @pytest.mark.parametrize("overlap_feature_fetch", [True, False]) @pytest.mark.parametrize("overlap_graph_fetch", [True, False]) +@pytest.mark.parametrize("cooperative", [True, False]) @pytest.mark.parametrize("asynchronous", [True, False]) @pytest.mark.parametrize("num_gpu_cached_edges", [0, 1024]) @pytest.mark.parametrize("gpu_cache_threshold", [1, 3]) @@ -71,10 +75,23 @@ def test_gpu_sampling_DataLoader( enable_feature_fetch, overlap_feature_fetch, overlap_graph_fetch, + cooperative, asynchronous, num_gpu_cached_edges, gpu_cache_threshold, ): + if cooperative and not 
thd.is_initialized(): + # On Windows, the init method can only be file. + init_method = ( + f"file:///{os.path.join(os.getcwd(), 'dis_tempfile')}" + if platform == "win32" + else "tcp://127.0.0.1:12345" + ) + thd.init_process_group( + init_method=init_method, + world_size=1, + rank=0, + ) N = 40 B = 4 num_layers = 2 @@ -110,6 +127,7 @@ def test_gpu_sampling_DataLoader( "overlap_fetch": overlap_graph_fetch, "num_gpu_cached_edges": num_gpu_cached_edges, "gpu_cache_threshold": gpu_cache_threshold, + "cooperative": cooperative, "asynchronous": asynchronous, } if i != 0: @@ -118,7 +136,7 @@ def test_gpu_sampling_DataLoader( datapipe, graph, fanouts=[torch.LongTensor([2]) for _ in range(num_layers)], - **kwargs + **kwargs, ) if enable_feature_fetch: datapipe = dgl.graphbolt.FeatureFetcher( @@ -138,6 +156,11 @@ def test_gpu_sampling_DataLoader( bufferer_cnt += 2 * num_layers if asynchronous: bufferer_cnt += 2 * num_layers + 1 # _preprocess stage has 1. + if cooperative: + bufferer_cnt += 3 * num_layers + if cooperative: + # _preprocess stage and each sampling layer. + bufferer_cnt += 3 datapipe_graph = traverse_dps(dataloader) bufferers = find_dps( datapipe_graph, @@ -171,3 +194,5 @@ def test_gpu_sampling_DataLoader( if sampler_name == "LayerNeighborSampler": assert torch.equal(edge_feature, edge_feature_ref) assert len(list(dataloader)) == N // B + if thd.is_initialized(): + thd.destroy_process_group() From 864b023c47f9f43fb7c9219602ba1072278bc0e0 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Fri, 13 Sep 2024 00:46:41 -0400 Subject: [PATCH 04/10] [GraphBolt][CUDA] Add `CooperativeConv` and minor fixes. (#7797) --- python/dgl/graphbolt/impl/__init__.py | 1 + python/dgl/graphbolt/impl/cooperative_conv.py | 109 ++++++++++++++++++ python/dgl/graphbolt/impl/neighbor_sampler.py | 32 ++++- python/dgl/graphbolt/subgraph_sampler.py | 15 +++ .../pytorch/graphbolt/test_dataloader.py | 19 +++ 5 files changed, 173 insertions(+), 3 deletions(-) create mode 100644 python/dgl/graphbolt/impl/cooperative_conv.py diff --git a/python/dgl/graphbolt/impl/__init__.py b/python/dgl/graphbolt/impl/__init__.py index 19fef44e462c..f4e53327c3ae 100644 --- a/python/dgl/graphbolt/impl/__init__.py +++ b/python/dgl/graphbolt/impl/__init__.py @@ -15,3 +15,4 @@ from .gpu_graph_cache import * from .cpu_feature_cache import * from .cpu_cached_feature import * +from .cooperative_conv import * diff --git a/python/dgl/graphbolt/impl/cooperative_conv.py b/python/dgl/graphbolt/impl/cooperative_conv.py new file mode 100644 index 000000000000..28f11bc8b317 --- /dev/null +++ b/python/dgl/graphbolt/impl/cooperative_conv.py @@ -0,0 +1,109 @@ +"""Graphbolt cooperative convolution.""" +from typing import Dict, Union + +import torch + +from ..sampled_subgraph import SampledSubgraph +from ..subgraph_sampler import all_to_all, convert_to_hetero, revert_to_homo + +__all__ = ["CooperativeConvFunction", "CooperativeConv"] + + +class CooperativeConvFunction(torch.autograd.Function): + """Cooperative convolution operation from Cooperative Minibatching. + + Implements the `all-to-all` message passing algorithm + in Cooperative Minibatching, which was initially proposed in + `Deep Graph Library PR#4337`__ and + was later first fully described in + `Cooperative Minibatching in Graph Neural Networks + `__. + Cooperation between the GPUs eliminates duplicate work performed across the + GPUs due to the overlapping sampled k-hop neighborhoods of seed nodes when + performing GNN minibatching. 
This reduces the redundant computations across + GPUs at the expense of communication. + """ + + @staticmethod + def forward( + ctx, + subgraph: SampledSubgraph, + tensor: Union[torch.Tensor, Dict[str, torch.Tensor]], + ): + """Implements the forward pass.""" + counts_sent = convert_to_hetero(subgraph._counts_sent) + counts_received = convert_to_hetero(subgraph._counts_received) + seed_inverse_ids = convert_to_hetero(subgraph._seed_inverse_ids) + seed_sizes = convert_to_hetero(subgraph._seed_sizes) + ctx.save_for_backward( + counts_sent, counts_received, seed_inverse_ids, seed_sizes + ) + outs = {} + for ntype, typed_tensor in convert_to_hetero(tensor).items(): + out = typed_tensor.new_empty( + (sum(counts_sent[ntype]),) + typed_tensor.shape[1:] + ) + all_to_all( + torch.split(out, counts_sent[ntype]), + torch.split( + typed_tensor[seed_inverse_ids[ntype]], + counts_received[ntype], + ), + ) + outs[ntype] = out + return revert_to_homo(outs) + + @staticmethod + def backward( + ctx, grad_output: Union[torch.Tensor, Dict[str, torch.Tensor]] + ): + """Implements the backward pass.""" + ( + counts_sent, + counts_received, + seed_inverse_ids, + seed_sizes, + ) = ctx.saved_tensors + outs = {} + for ntype, typed_grad_output in convert_to_hetero(grad_output).items(): + out = typed_grad_output.new_empty( + (sum(counts_received[ntype]),) + typed_grad_output.shape[1:] + ) + all_to_all( + torch.split(out, counts_received[ntype]), + torch.split(typed_grad_output, counts_sent[ntype]), + ) + i = out.new_empty(2, out.shape[0], dtype=torch.int64) + i[0] = torch.arange( + out.shape[0], device=typed_grad_output.device + ) # src + i[1] = seed_inverse_ids[ntype] # dst + coo = torch.sparse_coo_tensor( + i, 1, size=(seed_sizes[ntype], i.shape[1]) + ) + outs[ntype] = torch.sparse.mm(coo, out) + return None, revert_to_homo(outs) + + +class CooperativeConv(torch.nn.Module): + """Cooperative convolution operation from Cooperative Minibatching. + + Implements the `all-to-all` message passing algorithm + in Cooperative Minibatching, which was initially proposed in + `Deep Graph Library PR#4337`__ and + was later first fully described in + `Cooperative Minibatching in Graph Neural Networks + `__. + Cooperation between the GPUs eliminates duplicate work performed across the + GPUs due to the overlapping sampled k-hop neighborhoods of seed nodes when + performing GNN minibatching. This reduces the redundant computations across + GPUs at the expense of communication.
+ """ + + def forward( + self, + subgraph: SampledSubgraph, + x: Union[torch.Tensor, Dict[str, torch.Tensor]], + ): + """Implements the forward pass.""" + return CooperativeConvFunction.apply(subgraph, x) diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 352dedc067f2..7ddba6d7ccac 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -601,17 +601,18 @@ def _seeds_cooperative_exchange_2(minibatch): typed_seeds.split(typed_counts_sent), ) seeds_received[ntype] = typed_seeds_received - subgraph._seeds_received = seeds_received + counts_sent[ntype] = typed_counts_sent + counts_received[ntype] = typed_counts_received + minibatch._seed_nodes = seeds_received subgraph._counts_sent = revert_to_homo(counts_sent) subgraph._counts_received = revert_to_homo(counts_received) return minibatch @staticmethod def _seeds_cooperative_exchange_3(minibatch): - subgraph = minibatch.sampled_subgraphs[0] nodes = { ntype: [typed_seeds] - for ntype, typed_seeds in subgraph._seeds_received.items() + for ntype, typed_seeds in minibatch._seed_nodes.items() } minibatch._unique_future = unique_and_compact( nodes, 0, 1, async_op=True @@ -627,6 +628,11 @@ def _seeds_cooperative_exchange_4(minibatch): } minibatch._seed_nodes = revert_to_homo(unique_seeds) subgraph = minibatch.sampled_subgraphs[0] + sizes = { + ntype: typed_seeds.size(0) + for ntype, typed_seeds in unique_seeds.items() + } + subgraph._seed_sizes = revert_to_homo(sizes) subgraph._seed_inverse_ids = revert_to_homo(inverse_seeds) return minibatch @@ -831,6 +837,16 @@ class NeighborSampler(NeighborSamplerImpl): gpu_cache_threshold : int, optional Determines how many times a vertex needs to be accessed before its neighborhood ends up being cached on the GPU. + cooperative: bool, optional + Boolean indicating whether Cooperative Minibatching, which was initially + proposed in + `Deep Graph Library PR#4337`__ + and was later first fully described in + `Cooperative Minibatching in Graph Neural Networks + `__. Cooperation between the GPUs + eliminates duplicate work performed across the GPUs due to the + overlapping sampled k-hop neighborhoods of seed nodes when performing + GNN minibatching. asynchronous: bool Boolean indicating whether sampling and compaction stages should run in background threads to hide the latency of CPU GPU synchronization. @@ -986,6 +1002,16 @@ class LayerNeighborSampler(NeighborSamplerImpl): gpu_cache_threshold : int, optional Determines how many times a vertex needs to be accessed before its neighborhood ends up being cached on the GPU. + cooperative: bool, optional + Boolean indicating whether Cooperative Minibatching, which was initially + proposed in + `Deep Graph Library PR#4337`__ + and was later first fully described in + `Cooperative Minibatching in Graph Neural Networks + `__. Cooperation between the GPUs + eliminates duplicate work performed across the GPUs due to the + overlapping sampled k-hop neighborhoods of seed nodes when performing + GNN minibatching. asynchronous: bool Boolean indicating whether sampling and compaction stages should run in background threads to hide the latency of CPU GPU synchronization. 
diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index 556950982fb7..dd5093ae5f69 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -16,6 +16,7 @@ __all__ = [ "SubgraphSampler", "all_to_all", + "convert_to_hetero", "revert_to_homo", ] @@ -89,6 +90,13 @@ def revert_to_homo(d: dict): return list(d.values())[0] if is_homogenous else d +def convert_to_hetero(item): + """Utility function to convert homogenous data to heterogenous with a single + node type.""" + is_heterogenous = isinstance(item, dict) + return item if is_heterogenous else {"_N": item} + + @functional_datapipe("sample_subgraph") class SubgraphSampler(MiniBatchTransformer): """A subgraph sampler used to sample a subgraph from a given set of nodes @@ -251,6 +259,8 @@ def _seeds_cooperative_exchange_2(minibatch, group=None): group, ) seeds_received[ntype] = typed_seeds_received + counts_sent[ntype] = typed_counts_sent + counts_received[ntype] = typed_counts_received minibatch._seed_nodes = seeds_received minibatch._counts_sent = revert_to_homo(counts_sent) minibatch._counts_received = revert_to_homo(counts_received) @@ -275,6 +285,11 @@ def _seeds_cooperative_exchange_4(minibatch): ntype: typed_inv[0] for ntype, typed_inv in inverse_seeds.items() } minibatch._seed_nodes = revert_to_homo(unique_seeds) + sizes = { + ntype: typed_seeds.size(0) + for ntype, typed_seeds in unique_seeds.items() + } + minibatch._seed_sizes = revert_to_homo(sizes) minibatch._seed_inverse_ids = revert_to_homo(inverse_seeds) return minibatch diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index b02c820dd60d..5d5d44fd1eb7 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -6,6 +6,7 @@ import dgl import dgl.graphbolt +import dgl.graphbolt as gb import pytest import torch import torch.distributed as thd @@ -194,5 +195,23 @@ def test_gpu_sampling_DataLoader( if sampler_name == "LayerNeighborSampler": assert torch.equal(edge_feature, edge_feature_ref) assert len(list(dataloader)) == N // B + + if asynchronous and cooperative: + for minibatch in minibatches: + x = torch.ones((minibatch.node_ids().size(0), 1), device=F.ctx()) + for subgraph in minibatch.sampled_subgraphs: + x = gb.CooperativeConvFunction.apply(subgraph, x) + x, edge_index, size = subgraph.to_pyg(x) + x = x[0] + one = torch.ones( + edge_index.shape[1], dtype=x.dtype, device=x.device + ) + coo = torch.sparse_coo_tensor( + edge_index.flipud(), one, size=(size[1], size[0]) + ) + x = torch.sparse.mm(coo, x) + assert x.shape[0] == minibatch.seeds.shape[0] + assert x.shape[1] == 1 + if thd.is_initialized(): thd.destroy_process_group() From 55c224a304a0f1683c320b3219f6d0308163528b Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Sat, 14 Sep 2024 20:57:44 -0400 Subject: [PATCH 05/10] [GraphBolt][CUDA] Cooperative Minibatching - Feature Loading (#7798) --- python/dgl/graphbolt/feature_fetcher.py | 41 +++++++++++++++++-- python/dgl/graphbolt/impl/cooperative_conv.py | 2 +- .../pytorch/graphbolt/test_dataloader.py | 3 ++ 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/python/dgl/graphbolt/feature_fetcher.py b/python/dgl/graphbolt/feature_fetcher.py index 75dab4d54cd2..cf9d5f4104c2 100644 --- a/python/dgl/graphbolt/feature_fetcher.py +++ b/python/dgl/graphbolt/feature_fetcher.py @@ -8,6 +8,7 @@ from torch.utils.data import functional_datapipe from 
.base import etype_tuple_to_str +from .impl.cooperative_conv import CooperativeConvFunction from .minibatch_transformer import MiniBatchTransformer @@ -73,6 +74,16 @@ class FeatureFetcher(MiniBatchTransformer): If True, the feature fetcher will overlap the UVA feature fetcher operations with the rest of operations by using an alternative CUDA stream or utilizing asynchronous operations. Default is True. + cooperative: bool, optional + Boolean indicating whether Cooperative Minibatching, which was initially + proposed in + `Deep Graph Library PR#4337`__ + and was later first fully described in + `Cooperative Minibatching in Graph Neural Networks + `__. Cooperation between the GPUs + eliminates duplicate work performed across the GPUs due to the + overlapping sampled k-hop neighborhoods of seed nodes when performing + GNN minibatching. """ def __init__( @@ -82,6 +93,7 @@ def __init__( node_feature_keys=None, edge_feature_keys=None, overlap_fetch=True, + cooperative=False, ): datapipe = datapipe.mark_feature_fetcher_start() self.feature_store = feature_store @@ -113,9 +125,12 @@ def __init__( datapipe = datapipe.transform( partial(self._execute_stage, i) ).buffer(1) - super().__init__( - datapipe, self._identity if max_val == 0 else self._final_stage - ) + if max_val > 0: + datapipe = datapipe.transform(self._final_stage) + if cooperative: + datapipe = datapipe.transform(self._cooperative_exchange) + datapipe = datapipe.buffer() + super().__init__(datapipe) # A positive value indicates that the overlap optimization is enabled. self.max_num_stages = max_val @@ -145,6 +160,26 @@ def _final_stage(data): features[key] = value.wait() return data + def _cooperative_exchange(self, data): + subgraph = data.sampled_subgraphs[0] + is_heterogeneous = isinstance( + self.node_feature_keys, Dict + ) or isinstance(self.edge_feature_keys, Dict) + if is_heterogeneous: + node_features = {key: {} for key, _ in data.node_features.keys()} + for (key, ntype), feature in data.node_features.items(): + node_features[key][ntype] = feature + for key, feature in node_features.items(): + new_feature = CooperativeConvFunction.apply(subgraph, feature) + for ntype, tensor in new_feature.items(): + data.node_features[(key, ntype)] = tensor + else: + for key in data.node_features: + feature = data.node_features[key] + new_feature = CooperativeConvFunction.apply(subgraph, feature) + data.node_features[key] = new_feature + return data + def _read(self, data): """ Fill in the node/edge features field in data. 
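To make the feature exchange above easier to follow, here is a minimal sketch of the consuming side: a forward pass over PyG-style bipartite layers (such as SAGEConv) in which every layer after the first redistributes its input with CooperativeConvFunction, mirroring the dataloader test from the previous patch and the multi-GPU example added later in this series. The `layers` list, the `minibatch` produced with cooperative=True, and the input features `x` are assumed to come from the surrounding training code.

import dgl.graphbolt as gb
import torch.nn.functional as F


def cooperative_forward(layers, minibatch, x):
    # `x` holds node features that the feature fetcher above has already
    # redistributed, so the first layer needs no extra exchange.
    h = x
    subgraphs = minibatch.sampled_subgraphs
    for i, (layer, subgraph) in enumerate(zip(layers, subgraphs)):
        if i != 0:
            # Route the previous layer's output to the ranks that hold the
            # seeds of this layer's sampled subgraph.
            h = gb.CooperativeConvFunction.apply(subgraph, h)
        h, edge_index, size = subgraph.to_pyg(h)
        h = layer(h, edge_index, size=size)
        if i != len(subgraphs) - 1:
            h = F.relu(h)
    # Send the outputs back to the ranks that requested the seeds and restore
    # the minibatch's original seed ordering.
    h = gb.CooperativeConvFunction.apply(minibatch, h)
    return h[minibatch.compacted_seeds]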
diff --git a/python/dgl/graphbolt/impl/cooperative_conv.py b/python/dgl/graphbolt/impl/cooperative_conv.py index 28f11bc8b317..cb3d39d4d980 100644 --- a/python/dgl/graphbolt/impl/cooperative_conv.py +++ b/python/dgl/graphbolt/impl/cooperative_conv.py @@ -57,7 +57,7 @@ def forward( def backward( ctx, grad_output: Union[torch.Tensor, Dict[str, torch.Tensor]] ): - """Implements the forward pass.""" + """Implements the backward pass.""" ( counts_sent, counts_received, diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index 5d5d44fd1eb7..ee8f2b0cb9f5 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -146,6 +146,7 @@ def test_gpu_sampling_DataLoader( ["a", "b", "c"], ["d"], overlap_fetch=overlap_feature_fetch and i == 0, + cooperative=asynchronous and cooperative and i == 0, ) dataloaders.append(dgl.graphbolt.DataLoader(datapipe)) dataloader, dataloader2 = dataloaders @@ -159,6 +160,8 @@ def test_gpu_sampling_DataLoader( bufferer_cnt += 2 * num_layers + 1 # _preprocess stage has 1. if cooperative: bufferer_cnt += 3 * num_layers + if enable_feature_fetch: + bufferer_cnt += 1 # feature fetch has 1. if cooperative: # _preprocess stage and each sampling layer. bufferer_cnt += 3 From 75e410c354f050f1458d1d9b9e36d5804008f7e4 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Tue, 17 Sep 2024 16:15:19 -0400 Subject: [PATCH 06/10] [GraphBolt][CUDA] Fix Cooperative bug and additional tests. (#7800) --- .../src/cuda/extension/unique_and_compact_map.cu | 8 ++++++-- python/dgl/graphbolt/impl/cooperative_conv.py | 16 ++++++++++++---- .../impl/test_cooperative_minibatching_utils.py | 15 +++++++++++++-- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index a36c63925d7f..3db918ee7fdb 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -284,14 +284,18 @@ UniqueAndCompactBatchedHashMapBased( unique_ids_offsets_dev.data_ptr(); } at::cuda::CUDAEvent unique_ids_offsets_event; + unique_ids_offsets_event.record(); torch::optional index; if (part_ids) { + unique_ids_offsets_event.synchronize(); + const auto num_unique = + unique_ids_offsets.data_ptr()[num_batches]; + unique_ids = unique_ids.slice(0, 0, num_unique); + part_ids = part_ids->slice(0, 0, num_unique); std::tie( unique_ids, index, unique_ids_offsets, unique_ids_offsets_event) = cuda::RankSortImpl( unique_ids, *part_ids, unique_ids_offsets_dev, world_size); - } else { - unique_ids_offsets_event.record(); } auto mapped_ids = torch::empty(offsets_ptr[3 * num_batches], unique_ids.options()); diff --git a/python/dgl/graphbolt/impl/cooperative_conv.py b/python/dgl/graphbolt/impl/cooperative_conv.py index cb3d39d4d980..8040dabd286c 100644 --- a/python/dgl/graphbolt/impl/cooperative_conv.py +++ b/python/dgl/graphbolt/impl/cooperative_conv.py @@ -35,8 +35,11 @@ def forward( counts_received = convert_to_hetero(subgraph._counts_received) seed_inverse_ids = convert_to_hetero(subgraph._seed_inverse_ids) seed_sizes = convert_to_hetero(subgraph._seed_sizes) - ctx.save_for_backward( - counts_sent, counts_received, seed_inverse_ids, seed_sizes + ctx.communication_variables = ( + counts_sent, + counts_received, + seed_inverse_ids, + seed_sizes, ) outs = {} for ntype, typed_tensor in convert_to_hetero(tensor).items(): @@ -63,7 +66,8 
@@ def backward( counts_received, seed_inverse_ids, seed_sizes, - ) = ctx.saved_tensors + ) = ctx.communication_variables + delattr(ctx, "communication_variables") outs = {} for ntype, typed_grad_output in convert_to_hetero(grad_output).items(): out = typed_grad_output.new_empty( @@ -79,7 +83,11 @@ def backward( ) # src i[1] = seed_inverse_ids[ntype] # dst coo = torch.sparse_coo_tensor( - i, 1, size=(seed_sizes[ntype], i.shape[1]) + i, + torch.ones( + i.shape[1], dtype=grad_output.dtype, device=i.device + ), + size=(seed_sizes[ntype], i.shape[1]), ) outs[ntype] = torch.sparse.mm(coo, out) return None, revert_to_homo(outs) diff --git a/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py index f85676578bd5..1de8669b0e08 100644 --- a/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py +++ b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py @@ -19,9 +19,10 @@ @pytest.mark.parametrize("dtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("rank", list(range(WORLD_SIZE))) def test_gpu_cached_feature_read_async(dtype, rank): + torch.manual_seed(7) nodes_list1 = [ - torch.randint(0, 11111111, [777], dtype=dtype, device=F.ctx()) - for i in range(10) + torch.randint(0, 2111111111, [777], dtype=dtype, device=F.ctx()) + for _ in range(10) ] nodes_list2 = [nodes.sort()[0] for nodes in nodes_list1] @@ -57,3 +58,13 @@ def test_gpu_cached_feature_read_async(dtype, rank): assert_equal( idx1[off1[j] : off1[j + 1]], idx4[off4[i] : off4[i + 1]] ) + + unique, compacted, offsets = gb.unique_and_compact( + nodes_list1[:1], rank, WORLD_SIZE + ) + + nodes1, idx1, offsets1 = res1[0] + + assert_equal(unique, nodes1) + assert_equal(compacted[0], idx1) + assert_equal(offsets, offsets1) From d3453c3f14132db06443ff067649ff8f73339c17 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Wed, 18 Sep 2024 22:04:12 -0400 Subject: [PATCH 07/10] [GraphBolt][CUDA] Add Cooperative Minibatching example. (#7801) --- .../pyg/multigpu/node_classification.py | 485 ++++++++++++++++++ 1 file changed, 485 insertions(+) create mode 100644 examples/graphbolt/pyg/multigpu/node_classification.py diff --git a/examples/graphbolt/pyg/multigpu/node_classification.py b/examples/graphbolt/pyg/multigpu/node_classification.py new file mode 100644 index 000000000000..d2884b6a87f9 --- /dev/null +++ b/examples/graphbolt/pyg/multigpu/node_classification.py @@ -0,0 +1,485 @@ +""" +This script demonstrates node classification with GraphSAGE on large graphs, +merging GraphBolt (GB) and PyTorch Geometric (PyG). GraphBolt efficiently manages +data loading for large datasets, crucial for mini-batch processing. Post data +loading, PyG's user-friendly framework takes over for training, showcasing seamless +integration with GraphBolt. This combination offers an efficient alternative to +traditional Deep Graph Library (DGL) methods, highlighting adaptability and +scalability in handling large-scale graph data for diverse real-world applications. + + + +Key Features: +- Implements the GraphSAGE model, a scalable GNN, for node classification on large graphs. +- Utilizes GraphBolt, an efficient framework for large-scale graph data processing. +- Integrates with PyTorch Geometric for building and training the GraphSAGE model. +- The script is well-documented, providing clear explanations at each step. + +This flowchart describes the main functional sequence of the provided example. 
+main: + +main +│ +├───> Load and preprocess dataset (GraphBolt) +│ │ +│ └───> Utilize GraphBolt's BuiltinDataset for dataset handling +│ +├───> Instantiate the SAGE model (PyTorch Geometric) +│ │ +│ └───> Define the GraphSAGE model architecture +│ +├───> Train the model +│ │ +│ ├───> Mini-Batch Processing with GraphBolt +│ │ │ +│ │ └───> Efficient handling of mini-batches using GraphBolt's utilities +│ │ +│ └───> Training Loop +│ │ +│ ├───> Forward and backward passes +│ │ +│ └───> Parameters optimization +│ +└───> Evaluate the model + │ + └───> Performance assessment on validation and test datasets + │ + └───> Accuracy and other relevant metrics calculation + + +""" + +import argparse +import os +import time + +import dgl.graphbolt as gb +import torch + +# For torch.compile until https://github.com/pytorch/pytorch/issues/121197 is +# resolved. +import torch._inductor.codecache + +torch._dynamo.config.cache_size_limit = 32 + +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn.functional as F +from torch_geometric.nn import SAGEConv +from tqdm import tqdm + + +def accuracy(out, labels): + assert out.ndim == 2 + assert out.size(0) == labels.size(0) + assert labels.ndim == 1 or (labels.ndim == 2 and labels.size(1) == 1) + labels = labels.flatten() + predictions = torch.argmax(out, 1) + return (labels == predictions).sum(dtype=torch.float64) / labels.size(0) + + +class GraphSAGE(torch.nn.Module): + ##################################################################### + # (HIGHLIGHT) Define the GraphSAGE model architecture. + # + # - This class inherits from `torch.nn.Module`. + # - Two convolutional layers are created using the SAGEConv class from PyG. + # - 'in_size', 'hidden_size', 'out_size' are the sizes of + # the input, hidden, and output features, respectively. + # - The forward method defines the computation performed at every call. + ##################################################################### + def __init__(self, in_size, hidden_size, out_size, n_layers, cooperative): + super(GraphSAGE, self).__init__() + self.layers = torch.nn.ModuleList() + sizes = [in_size] + [hidden_size] * (n_layers - 1) + [out_size] + for i in range(n_layers): + self.layers.append(SAGEConv(sizes[i], sizes[i + 1])) + self.hidden_size = hidden_size + self.out_size = out_size + self.cooperative = cooperative + + def forward(self, minibatch, x): + subgraphs = minibatch.sampled_subgraphs + h = x + for i, (layer, subgraph) in enumerate(zip(self.layers, subgraphs)): + ##################################################################### + # (HIGHLIGHT) Convert given features to be consumed by a PyG layer. + # + # PyG layers have two modes, bipartite and normal. We slice the + # given features to get src and dst features to use the PyG layers + # in the more efficient bipartite mode. + ##################################################################### + if i != 0 and self.cooperative: + h = gb.CooperativeConvFunction.apply(subgraph, h) + h, edge_index, size = subgraph.to_pyg(h) + h = layer(h, edge_index, size=size) + if i != len(subgraphs) - 1: + h = F.relu(h) + if self.cooperative: + h = gb.CooperativeConvFunction.apply(minibatch, h) + h = h[minibatch.compacted_seeds] + return h + + +def create_dataloader( + args, graph, features, itemset, batch_size, fanout, device, job +): + ##################################################################### + # (HIGHLIGHT) Create a data loader for efficiently loading graph data. 
+ # + # - 'ItemSampler' samples mini-batches of node IDs from the dataset. + # - 'CopyTo' copies the fetched data to the specified device. + # - 'sample_neighbor' performs neighbor sampling on the graph. + # - 'FeatureFetcher' fetches node features based on the sampled subgraph. + + ##################################################################### + # Create a datapipe for mini-batch sampling with a specific neighbor fanout. + # Here, [10, 10, 10] specifies the number of neighbors sampled for each node at each layer. + # We're using `sample_neighbor` for consistency with DGL's sampling API. + # Note: GraphBolt offers additional sampling methods, such as `sample_layer_neighbor`, + # which could provide further optimization and efficiency for GNN training. + # Users are encouraged to explore these advanced features for potentially improved performance. + + # Initialize an ItemSampler to sample mini-batches from the dataset. + datapipe = gb.DistributedItemSampler( + itemset, + batch_size=batch_size, + shuffle=(job == "train"), + drop_last=(job == "train"), + drop_uneven_inputs=True, + ) + need_copy = True + # Copy the data to the specified device. + if args.graph_device != "cpu" and need_copy: + datapipe = datapipe.copy_to(device=device) + need_copy = False + # Sample neighbors for each node in the mini-batch. + datapipe = getattr(datapipe, args.sample_mode)( + graph, + fanout if job != "infer" else [-1], + overlap_fetch=args.overlap_graph_fetch, + num_gpu_cached_edges=args.num_gpu_cached_edges, + gpu_cache_threshold=args.gpu_graph_caching_threshold, + cooperative=args.cooperative, + asynchronous=args.graph_device != "cpu", + ) + # Copy the data to the specified device. + if args.feature_device != "cpu" and need_copy: + datapipe = datapipe.copy_to(device=device) + need_copy = False + # Fetch node features for the sampled subgraph. + datapipe = datapipe.fetch_feature( + features, + node_feature_keys=["feat"], + overlap_fetch=args.overlap_feature_fetch, + cooperative=args.cooperative, + ) + # Copy the data to the specified device. + if need_copy: + datapipe = datapipe.copy_to(device=device) + # Create and return a DataLoader to handle data loading. + return gb.DataLoader(datapipe, num_workers=args.num_workers) + + +def weighted_reduce(tensor, weight, dst=0): + ######################################################################## + # (HIGHLIGHT) Collect accuracy and loss values from sub-processes and + # obtain overall average values. + # + # `torch.distributed.reduce` is used to reduce tensors from all the + # sub-processes to a specified process, ReduceOp.SUM is used by default. + # + # Because the GPUs may have differing numbers of processed items, we + # perform a weighted mean to calculate the exact loss and accuracy. 
+ ######################################################################## + dist.reduce(tensor=tensor, dst=dst) + weight = torch.tensor(weight, device=tensor.device) + dist.reduce(tensor=weight, dst=dst) + return tensor / weight + + +@torch.compile +def train_step(minibatch, optimizer, model, loss_fn, cooperative): + node_features = minibatch.node_features["feat"] + labels = minibatch.labels + optimizer.zero_grad() + out = model(minibatch, node_features) + loss = loss_fn(out, labels) + num_correct = accuracy(out, labels) * labels.size(0) + loss.backward() + optimizer.step() + return loss.detach(), num_correct, labels.size(0) + + +def train_helper( + rank, dataloader, model, optimizer, loss_fn, device, cooperative +): + model.train() # Set the model to training mode + total_loss = torch.zeros(1, device=device) # Accumulator for the total loss + # Accumulator for the total number of correct predictions + total_correct = torch.zeros(1, dtype=torch.float64, device=device) + total_samples = 0 # Accumulator for the total number of samples processed + num_batches = 0 # Counter for the number of mini-batches processed + start = time.time() + for minibatch in tqdm(dataloader, "Training") if rank == 0 else dataloader: + loss, num_correct, num_samples = train_step( + minibatch, optimizer, model, loss_fn, cooperative + ) + total_loss += loss + total_correct += num_correct + total_samples += num_samples + num_batches += 1 + train_loss = weighted_reduce(total_loss, num_batches) + train_acc = weighted_reduce(total_correct, total_samples) + end = time.time() + return train_loss, train_acc, end - start + + +def train(args, rank, train_dataloader, valid_dataloader, model, device): + ##################################################################### + # (HIGHLIGHT) Train the model for one epoch. + # + # - Iterates over the data loader, fetching mini-batches of graph data. + # - For each mini-batch, it performs a forward pass, computes loss, and + # updates the model parameters. + # - The function returns the average loss and accuracy for the epoch. + # + # Parameters: + # model: The GraphSAGE model. + # dataloader: DataLoader that provides mini-batches of graph data. + # optimizer: Optimizer used for updating model parameters. + # loss_fn: Loss function used for training. + # device: The device (CPU/GPU) to run the training on. + ##################################################################### + + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + loss_fn = torch.nn.CrossEntropyLoss() + + for epoch in range(args.epochs): + train_loss, train_acc, duration = train_helper( + rank, + train_dataloader, + model, + optimizer, + loss_fn, + device, + args.cooperative, + ) + val_acc = evaluate(rank, model, valid_dataloader, device) + if rank == 0: + print( + f"Epoch {epoch:02d}, Loss: {train_loss.item():.4f}, " + f"Approx. Train: {train_acc.item():.4f}, " + f"Approx. 
Val: {val_acc.item():.4f}, " + f"Time: {duration}s" + ) + + +@torch.compile +def evaluate_step(minibatch, model): + node_features = minibatch.node_features["feat"] + labels = minibatch.labels + out = model(minibatch, node_features) + num_correct = accuracy(out, labels) * labels.size(0) + return num_correct, labels.size(0) + + +@torch.no_grad() +def evaluate(rank, model, dataloader, device): + model.eval() + total_correct = torch.zeros(1, dtype=torch.float64, device=device) + total_samples = 0 + for minibatch in ( + tqdm(dataloader, "Evaluating") if rank == 0 else dataloader + ): + num_correct, num_samples = evaluate_step(minibatch, model) + total_correct += num_correct + total_samples += num_samples + + return weighted_reduce(total_correct, total_samples) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Which dataset are you going to use?" + ) + parser.add_argument( + "--epochs", type=int, default=10, help="Number of training epochs." + ) + parser.add_argument( + "--lr", + type=float, + default=0.003, + help="Learning rate for optimization.", + ) + parser.add_argument( + "--batch-size", type=int, default=1024, help="Batch size for training." + ) + parser.add_argument( + "--num-workers", + type=int, + default=0, + help="Number of workers for data loading.", + ) + parser.add_argument( + "--dataset", + type=str, + default="ogbn-products", + choices=[ + "ogbn-arxiv", + "ogbn-products", + "ogbn-papers100M", + "igb-hom-tiny", + "igb-hom-small", + "igb-hom-medium", + "igb-hom-large", + "igb-hom", + ], + help="The dataset we can use for node classification example. Currently" + " ogbn-products, ogbn-arxiv, ogbn-papers100M and" + " igb-hom-[tiny|small|medium|large] and igb-hom datasets are supported.", + ) + parser.add_argument( + "--fanout", + type=str, + default="10,10,10", + help="Fan-out of neighbor sampling. It is IMPORTANT to keep len(fanout)" + " identical with the number of layers in your model. Default: 10,10,10", + ) + parser.add_argument( + "--mode", + default="pinned-pinned-cuda", + choices=[ + "pinned-pinned-cuda", + "cuda-pinned-cuda", + "cuda-cuda-cuda", + ], + help="Graph storage - feature storage - Train device: 'cpu' for CPU and RAM," + " 'pinned' for pinned memory in RAM, 'cuda' for GPU and GPU memory.", + ) + parser.add_argument( + "--gpu-cache-size", + type=int, + default=0, + help="The capacity of the GPU cache in bytes.", + ) + parser.add_argument( + "--sample-mode", + default="sample_neighbor", + choices=["sample_neighbor", "sample_layer_neighbor"], + help="The sampling function when doing layerwise sampling.", + ) + parser.add_argument( + "--num-gpu-cached-edges", + type=int, + default=0, + help="The number of edges to be cached from the graph on the GPU.", + ) + parser.add_argument( + "--gpu-graph-caching-threshold", + type=int, + default=1, + help="The number of accesses after which a vertex neighborhood will be cached.", + ) + parser.add_argument("--precision", type=str, default="high") + parser.add_argument( + "--cooperative", + action="store_true", + help="Enables Cooperative Minibatching from arXiv:2310.12403.", + ) + return parser.parse_args() + + +def run(rank, world_size, args, dataset): + # Set up multiprocessing environment. 
+ torch.cuda.set_device(rank) + dist.init_process_group( + init_method="tcp://127.0.0.1:12345", + rank=rank, + world_size=world_size, + ) + + print(f"Training in {args.mode} mode.") + args.graph_device, args.feature_device, args.device = args.mode.split("-") + args.overlap_feature_fetch = args.feature_device == "pinned" + args.overlap_graph_fetch = args.graph_device == "pinned" + + # Move the dataset to the selected storage. + graph = ( + dataset.graph.pin_memory_() + if args.graph_device == "pinned" + else dataset.graph.to(args.graph_device) + ) + features = ( + dataset.feature.pin_memory_() + if args.feature_device == "pinned" + else dataset.feature.to(args.feature_device) + ) + + train_set = dataset.tasks[0].train_set + valid_set = dataset.tasks[0].validation_set + args.fanout = list(map(int, args.fanout.split(","))) + + num_classes = dataset.tasks[0].metadata["num_classes"] + + if args.gpu_cache_size > 0 and args.feature_device != "cuda": + features._features[("node", None, "feat")] = gb.gpu_cached_feature( + features._features[("node", None, "feat")], + args.gpu_cache_size, + ) + + train_dataloader, valid_dataloader = ( + create_dataloader( + args, + graph=graph, + features=features, + itemset=itemset, + batch_size=args.batch_size, + fanout=args.fanout, + device=args.device, + job=job, + ) + for itemset, job in zip([train_set, valid_set], ["train", "evaluate"]) + ) + + in_channels = features.size("node", None, "feat")[0] + hidden_channels = 256 + model = GraphSAGE( + in_channels, + hidden_channels, + num_classes, + len(args.fanout), + args.cooperative, + ).to(args.device) + assert len(args.fanout) == len(model.layers) + model = torch.nn.parallel.DistributedDataParallel(model) + + train(args, rank, train_dataloader, valid_dataloader, model, args.device) + + dist.destroy_process_group() + + +if __name__ == "__main__": + args = parse_args() + if not torch.cuda.is_available(): + print("Multi-GPU training requires GPUs.") + exit(0) + + torch.set_float32_matmul_precision(args.precision) + + # Load and preprocess dataset. + print("Loading data...") + dataset = gb.BuiltinDataset(args.dataset).load() + + world_size = torch.cuda.device_count() + + # Thread limiting to avoid resource competition. 
+ os.environ["OMP_NUM_THREADS"] = str(mp.cpu_count() // 2 // world_size) + + mp.set_sharing_strategy("file_system") + mp.spawn( + run, + args=(world_size, args, dataset), + nprocs=world_size, + join=True, + ) From 3bc8e228fc87bb371d42cf97e8e8eb0159c5f8ae Mon Sep 17 00:00:00 2001 From: Wenxuan Cao <90617523+CfromBU@users.noreply.github.com> Date: Thu, 19 Sep 2024 17:05:11 +0800 Subject: [PATCH 08/10] [DistGB] enable dist partition pipeline to save FusedCSCSamplingGraph partition directly (#7728) Co-authored-by: Ubuntu Co-authored-by: Ubuntu Co-authored-by: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com> --- python/dgl/distributed/partition.py | 52 +- tests/tools/test_dist_partition_graphbolt.py | 1023 ++++++++++++++++++ tools/dispatch_data.py | 28 + tools/distpartitioning/convert_partition.py | 409 ++++++- tools/distpartitioning/data_proc_pipeline.py | 25 +- tools/distpartitioning/data_shuffle.py | 27 +- tools/distpartitioning/utils.py | 44 +- 7 files changed, 1520 insertions(+), 88 deletions(-) create mode 100644 tests/tools/test_dist_partition_graphbolt.py diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index 079ed8806a96..48005ffb4d27 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -1600,8 +1600,6 @@ def _save_graph_gb(part_config, part_id, csc_graph): def cast_various_to_minimum_dtype_gb( - graph, - part_meta, num_parts, indptr, indices, @@ -1610,25 +1608,43 @@ def cast_various_to_minimum_dtype_gb( ntypes, node_attributes, edge_attributes, + part_meta=None, + graph=None, + edge_count=None, + node_count=None, + tot_edge_count=None, + tot_node_count=None, ): """Cast various data to minimum dtype.""" + if graph is not None: + assert part_meta is not None + tot_edge_count = graph.num_edges() + tot_node_count = graph.num_nodes() + node_count = part_meta["num_nodes"] + edge_count = part_meta["num_edges"] + else: + assert tot_edge_count is not None + assert tot_node_count is not None + assert edge_count is not None + assert node_count is not None + # Cast 1: indptr. - indptr = _cast_to_minimum_dtype(graph.num_edges(), indptr) + indptr = _cast_to_minimum_dtype(tot_edge_count, indptr) # Cast 2: indices. - indices = _cast_to_minimum_dtype(graph.num_nodes(), indices) + indices = _cast_to_minimum_dtype(tot_node_count, indices) # Cast 3: type_per_edge. type_per_edge = _cast_to_minimum_dtype( len(etypes), type_per_edge, field=ETYPE ) # Cast 4: node/edge_attributes. 
predicates = { - NID: part_meta["num_nodes"], + NID: node_count, "part_id": num_parts, NTYPE: len(ntypes), - EID: part_meta["num_edges"], + EID: edge_count, ETYPE: len(etypes), - DGL2GB_EID: part_meta["num_edges"], - GB_DST_ID: part_meta["num_nodes"], + DGL2GB_EID: edge_count, + GB_DST_ID: node_count, } for attributes in [node_attributes, edge_attributes]: for key in attributes: @@ -1779,16 +1795,16 @@ def gb_convert_single_dgl_partition( ) indptr, indices, type_per_edge = cast_various_to_minimum_dtype_gb( - graph, - part_meta, - num_parts, - indptr, - indices, - type_per_edge, - etypes, - ntypes, - node_attributes, - edge_attributes, + graph=graph, + part_meta=part_meta, + num_parts=num_parts, + indptr=indptr, + indices=indices, + type_per_edge=type_per_edge, + etypes=etypes, + ntypes=ntypes, + node_attributes=node_attributes, + edge_attributes=edge_attributes, ) csc_graph = gb.fused_csc_sampling_graph( diff --git a/tests/tools/test_dist_partition_graphbolt.py b/tests/tools/test_dist_partition_graphbolt.py new file mode 100644 index 000000000000..81c16f8809c3 --- /dev/null +++ b/tests/tools/test_dist_partition_graphbolt.py @@ -0,0 +1,1023 @@ +import json +import os +import tempfile + +import dgl +import dgl.backend as F +import dgl.graphbolt as gb + +import numpy as np +import pyarrow.parquet as pq +import pytest +import torch +from dgl.data.utils import load_graphs, load_tensors +from dgl.distributed.partition import ( + _etype_str_to_tuple, + _etype_tuple_to_str, + _get_inner_edge_mask, + _get_inner_node_mask, + load_partition, + RESERVED_FIELD_DTYPE, +) + +from distpartitioning import array_readwriter +from distpartitioning.utils import generate_read_list +from pytest_utils import create_chunked_dataset + + +def _verify_metadata_gb(gpb, g, num_parts, part_id, part_sizes): + """ + check list: + make sure the number of nodes and edges is correct. + make sure the number of parts is correct. + make sure the number of nodes and edges in each part is corrcet. + """ + assert gpb._num_nodes() == g.num_nodes() + assert gpb._num_edges() == g.num_edges() + + assert gpb.num_partitions() == num_parts + gpb_meta = gpb.metadata() + assert len(gpb_meta) == num_parts + assert len(gpb.partid2nids(part_id)) == gpb_meta[part_id]["num_nodes"] + assert len(gpb.partid2eids(part_id)) == gpb_meta[part_id]["num_edges"] + part_sizes.append( + (gpb_meta[part_id]["num_nodes"], gpb_meta[part_id]["num_edges"]) + ) + + +def _verify_local_id_gb(part_g, part_id, gpb): + """ + check list: + make sure the type of local id is correct. + make sure local id have a right order. + """ + nid = F.boolean_mask( + part_g.node_attributes[dgl.NID], + part_g.node_attributes["inner_node"], + ) + local_nid = gpb.nid2localnid(nid, part_id) + assert F.dtype(local_nid) in (F.int64, F.int32) + assert np.all(F.asnumpy(local_nid) == np.arange(0, len(local_nid))) + eid = F.boolean_mask( + part_g.edge_attributes[dgl.EID], + part_g.edge_attributes["inner_edge"], + ) + local_eid = gpb.eid2localeid(eid, part_id) + assert F.dtype(local_eid) in (F.int64, F.int32) + assert np.all(np.sort(F.asnumpy(local_eid)) == np.arange(0, len(local_eid))) + return local_nid, local_eid + + +def _verify_map_gb( + part_g, + part_id, + gpb, +): + """ + check list: + make sure the map node and its data type is correct. + """ + # Check the node map. 
+ local_nodes = F.boolean_mask( + part_g.node_attributes[dgl.NID], + part_g.node_attributes["inner_node"], + ) + inner_node_index = F.nonzero_1d(part_g.node_attributes["inner_node"]) + mapping_nodes = gpb.partid2nids(part_id) + assert F.dtype(mapping_nodes) in (F.int32, F.int64) + assert np.all( + np.sort(F.asnumpy(local_nodes)) == np.sort(F.asnumpy(mapping_nodes)) + ) + assert np.all( + F.asnumpy(inner_node_index) == np.arange(len(inner_node_index)) + ) + + # Check the edge map. + + local_edges = F.boolean_mask( + part_g.edge_attributes[dgl.EID], + part_g.edge_attributes["inner_edge"], + ) + inner_edge_index = F.nonzero_1d(part_g.edge_attributes["inner_edge"]) + mapping_edges = gpb.partid2eids(part_id) + assert F.dtype(mapping_edges) in (F.int32, F.int64) + assert np.all( + np.sort(F.asnumpy(local_edges)) == np.sort(F.asnumpy(mapping_edges)) + ) + assert np.all( + F.asnumpy(inner_edge_index) == np.arange(len(inner_edge_index)) + ) + return local_nodes, local_edges + + +def _verify_local_and_map_id_gb( + part_g, + part_id, + gpb, + store_inner_node, + store_inner_edge, + store_eids, +): + """ + check list: + make sure local id are correct. + make sure mapping id are correct. + """ + if store_inner_node and store_inner_edge and store_eids: + _verify_local_id_gb(part_g, part_id, gpb) + _verify_map_gb(part_g, part_id, gpb) + + +def _get_part_IDs(part_g): + # These are partition-local IDs. + num_columns = part_g.csc_indptr.diff() + part_src_ids = part_g.indices + part_dst_ids = torch.arange(part_g.total_num_nodes).repeat_interleave( + num_columns + ) + # These are reshuffled global homogeneous IDs. + part_src_ids = F.gather_row(part_g.node_attributes[dgl.NID], part_src_ids) + part_dst_ids = F.gather_row(part_g.node_attributes[dgl.NID], part_dst_ids) + return part_src_ids, part_dst_ids + + +def _verify_node_type_ID_gb(part_g, gpb): + """ + check list: + make sure ntype id have correct data type + """ + part_src_ids, part_dst_ids = _get_part_IDs(part_g) + # These are reshuffled per-type IDs. + src_ntype_ids, part_src_ids = gpb.map_to_per_ntype(part_src_ids) + dst_ntype_ids, part_dst_ids = gpb.map_to_per_ntype(part_dst_ids) + # `IdMap` is in int64 by default. 
+ assert src_ntype_ids.dtype == F.int64 + assert dst_ntype_ids.dtype == F.int64 + + with pytest.raises(dgl.utils.internal.InconsistentDtypeException): + gpb.map_to_per_ntype(F.tensor([0], F.int32)) + with pytest.raises(dgl.utils.internal.InconsistentDtypeException): + gpb.map_to_per_etype(F.tensor([0], F.int32)) + return ( + part_src_ids, + part_dst_ids, + src_ntype_ids, + part_src_ids, + dst_ntype_ids, + ) + + +def _verify_orig_edge_IDs_gb( + g, + orig_nids, + orig_eids, + part_eids, + part_src_ids, + part_dst_ids, + src_ntype=None, + dst_ntype=None, + etype=None, +): + """ + check list: + make sure orig edge id are correct after + """ + if src_ntype is not None and dst_ntype is not None: + orig_src_nid = orig_nids[src_ntype] + orig_dst_nid = orig_nids[dst_ntype] + else: + orig_src_nid = orig_nids + orig_dst_nid = orig_nids + orig_src_ids = F.gather_row(orig_src_nid, part_src_ids) + orig_dst_ids = F.gather_row(orig_dst_nid, part_dst_ids) + if etype is not None: + orig_eids = orig_eids[etype] + orig_eids1 = F.gather_row(orig_eids, part_eids) + orig_eids2 = g.edge_ids(orig_src_ids, orig_dst_ids, etype=etype) + assert len(orig_eids1) == len(orig_eids2) + assert np.all(F.asnumpy(orig_eids1) == F.asnumpy(orig_eids2)) + + +def _verify_orig_IDs_gb( + part_g, + gpb, + g, + is_homo=False, + part_src_ids=None, + part_dst_ids=None, + src_ntype_ids=None, + dst_ntype_ids=None, + orig_nids=None, + orig_eids=None, +): + """ + check list: + make sure orig edge id are correct. + make sure hetero ntype id are correct. + """ + part_eids = part_g.edge_attributes[dgl.EID] + if is_homo: + _verify_orig_edge_IDs_gb( + g, orig_nids, orig_eids, part_eids, part_src_ids, part_dst_ids + ) + local_orig_nids = orig_nids[part_g.node_attributes[dgl.NID]] + local_orig_eids = orig_eids[part_g.edge_attributes[dgl.EID]] + part_g.node_attributes["feats"] = F.gather_row( + g.ndata["feats"], local_orig_nids + ) + part_g.edge_attributes["feats"] = F.gather_row( + g.edata["feats"], local_orig_eids + ) + else: + etype_ids, part_eids = gpb.map_to_per_etype(part_eids) + # `IdMap` is in int64 by default. + assert etype_ids.dtype == F.int64 + + # These are original per-type IDs. + for etype_id, etype in enumerate(g.canonical_etypes): + part_src_ids1 = F.boolean_mask(part_src_ids, etype_ids == etype_id) + src_ntype_ids1 = F.boolean_mask( + src_ntype_ids, etype_ids == etype_id + ) + part_dst_ids1 = F.boolean_mask(part_dst_ids, etype_ids == etype_id) + dst_ntype_ids1 = F.boolean_mask( + dst_ntype_ids, etype_ids == etype_id + ) + part_eids1 = F.boolean_mask(part_eids, etype_ids == etype_id) + assert np.all(F.asnumpy(src_ntype_ids1 == src_ntype_ids1[0])) + assert np.all(F.asnumpy(dst_ntype_ids1 == dst_ntype_ids1[0])) + src_ntype = g.ntypes[F.as_scalar(src_ntype_ids1[0])] + dst_ntype = g.ntypes[F.as_scalar(dst_ntype_ids1[0])] + + _verify_orig_edge_IDs_gb( + g, + orig_nids, + orig_eids, + part_eids1, + part_src_ids1, + part_dst_ids1, + src_ntype, + dst_ntype, + etype, + ) + + +def _verify_constructed_id_gb(part_sizes, gpb): + """ + verify the part id of each node by constructed nids. 
+ check list: + make sure each node' part id and its type are corect + """ + node_map = [] + edge_map = [] + for part_i, (num_nodes, num_edges) in enumerate(part_sizes): + node_map.append(np.ones(num_nodes) * part_i) + edge_map.append(np.ones(num_edges) * part_i) + node_map = np.concatenate(node_map) + edge_map = np.concatenate(edge_map) + nid2pid = gpb.nid2partid(F.arange(0, len(node_map))) + assert F.dtype(nid2pid) in (F.int32, F.int64) + assert np.all(F.asnumpy(nid2pid) == node_map) + eid2pid = gpb.eid2partid(F.arange(0, len(edge_map))) + assert F.dtype(eid2pid) in (F.int32, F.int64) + assert np.all(F.asnumpy(eid2pid) == edge_map) + + +def _verify_IDs_gb( + g, + part_g, + part_id, + gpb, + part_sizes, + orig_nids, + orig_eids, + store_inner_node, + store_inner_edge, + store_eids, + is_homo, +): + # verify local id and mapping id + _verify_local_and_map_id_gb( + part_g, + part_id, + gpb, + store_inner_node, + store_inner_edge, + store_eids, + ) + + # Verify the mapping between the reshuffled IDs and the original IDs. + ( + part_src_ids, + part_dst_ids, + src_ntype_ids, + part_src_ids, + dst_ntype_ids, + ) = _verify_node_type_ID_gb(part_g, gpb) + + if store_eids: + _verify_orig_IDs_gb( + part_g, + gpb, + g, + part_src_ids=part_src_ids, + part_dst_ids=part_dst_ids, + src_ntype_ids=src_ntype_ids, + dst_ntype_ids=dst_ntype_ids, + orig_nids=orig_nids, + orig_eids=orig_eids, + is_homo=is_homo, + ) + _verify_constructed_id_gb(part_sizes, gpb) + + +def _collect_data_gb( + parts, + part_g, + gpbs, + gpb, + tot_node_feats, + node_feats, + tot_edge_feats, + edge_feats, + shuffled_labels, + shuffled_edata, + test_ntype, + test_etype, +): + if test_ntype != None: + shuffled_labels.append(node_feats[test_ntype + "/label"]) + shuffled_edata.append( + edge_feats[_etype_tuple_to_str(test_etype) + "/count"] + ) + else: + shuffled_labels.append(node_feats["_N/labels"]) + shuffled_edata.append(edge_feats["_N:_E:_N/feats"]) + parts.append(part_g) + gpbs.append(gpb) + tot_node_feats.append(node_feats) + tot_edge_feats.append(edge_feats) + + +def _verify_node_feats(g, part, gpb, orig_nids, node_feats, is_homo=False): + for ntype in g.ntypes: + ndata = ( + part.node_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.ndata + ) + ntype_id = g.get_ntype_id(ntype) + inner_node_mask = _get_inner_node_mask( + part, + ntype_id, + (gpb if isinstance(part, gb.FusedCSCSamplingGraph) else None), + ) + inner_nids = F.boolean_mask(ndata[dgl.NID], inner_node_mask) + ntype_ids, inner_type_nids = gpb.map_to_per_ntype(inner_nids) + partid = gpb.nid2partid(inner_type_nids, ntype) + if is_homo: + assert np.all(F.asnumpy(ntype_ids) == ntype_id) + assert np.all(F.asnumpy(partid) == gpb.partid) + + if is_homo: + orig_id = orig_nids[inner_type_nids] + else: + orig_id = orig_nids[ntype][inner_type_nids] + local_nids = gpb.nid2localnid(inner_type_nids, gpb.partid, ntype) + + for name in g.nodes[ntype].data: + if name in [dgl.NID, "inner_node"]: + continue + true_feats = F.gather_row(g.nodes[ntype].data[name], orig_id) + ndata = F.gather_row(node_feats[ntype + "/" + name], local_nids) + assert np.all(F.asnumpy(ndata == true_feats)) + + +def _verify_edge_feats(g, part, gpb, orig_eids, edge_feats, is_homo=False): + for etype in g.canonical_etypes: + edata = ( + part.edge_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.edata + ) + etype_id = g.get_etype_id(etype) + inner_edge_mask = _get_inner_edge_mask(part, etype_id) + inner_eids = F.boolean_mask(edata[dgl.EID], inner_edge_mask) + etype_ids, 
inner_type_eids = gpb.map_to_per_etype(inner_eids) + partid = gpb.eid2partid(inner_type_eids, etype) + assert np.all(F.asnumpy(etype_ids) == etype_id) + assert np.all(F.asnumpy(partid) == gpb.partid) + + if is_homo: + orig_id = orig_eids[inner_type_eids] + else: + orig_id = orig_eids[etype][inner_type_eids] + local_eids = gpb.eid2localeid(inner_type_eids, gpb.partid, etype) + + for name in g.edges[etype].data: + if name in [dgl.EID, "inner_edge"]: + continue + true_feats = F.gather_row(g.edges[etype].data[name], orig_id) + edata = F.gather_row( + edge_feats[_etype_tuple_to_str(etype) + "/" + name], + local_eids, + ) + assert np.all(F.asnumpy(edata == true_feats)) + + +def _verify_shuffled_labels_gb( + g, + shuffled_labels, + shuffled_edata, + orig_nids, + orig_eids, + test_ntype=None, + test_etype=None, +): + """ + check list: + make sure node data are correct. + make sure edge data are correct. + """ + shuffled_labels = F.asnumpy(F.cat(shuffled_labels, 0)) + shuffled_edata = F.asnumpy(F.cat(shuffled_edata, 0)) + orig_labels = np.zeros(shuffled_labels.shape, dtype=shuffled_labels.dtype) + orig_edata = np.zeros(shuffled_edata.shape, dtype=shuffled_edata.dtype) + + orig_nid = orig_nids if test_ntype is None else orig_nids[test_ntype] + orig_eid = orig_eids if test_etype is None else orig_eids[test_etype] + nlabel = ( + g.ndata["labels"] + if test_ntype is None + else g.nodes[test_ntype].data["label"] + ) + edata = ( + g.edata["feats"] + if test_etype is None + else g.edges[test_etype].data["count"] + ) + + orig_labels[F.asnumpy(orig_nid)] = shuffled_labels + orig_edata[F.asnumpy(orig_eid)] = shuffled_edata + assert np.all(orig_labels == F.asnumpy(nlabel)) + assert np.all(orig_edata == F.asnumpy(edata)) + + +def verify_graph_feats_gb( + g, + gpbs, + parts, + tot_node_feats, + tot_edge_feats, + orig_nids, + orig_eids, + shuffled_labels, + shuffled_edata, + test_ntype, + test_etype, + store_inner_node=False, + store_inner_edge=False, + store_eids=False, + is_homo=False, +): + """ + check list: + make sure the feats of nodes and edges are correct + """ + for part_id in range(len(parts)): + part = parts[part_id] + gpb = gpbs[part_id] + node_feats = tot_node_feats[part_id] + edge_feats = tot_edge_feats[part_id] + if store_inner_node: + _verify_node_feats( + g, + part, + gpb, + orig_nids, + node_feats, + is_homo=is_homo, + ) + if store_inner_edge and store_eids: + _verify_edge_feats( + g, + part, + gpb, + orig_eids, + edge_feats, + is_homo=is_homo, + ) + + _verify_shuffled_labels_gb( + g, + shuffled_labels, + shuffled_edata, + orig_nids, + orig_eids, + test_ntype, + test_etype, + ) + + +def _verify_graphbolt_attributes( + parts, store_inner_node, store_inner_edge, store_eids +): + """ + check list: + make sure arguments work. 
+ """ + for part in parts: + assert store_inner_edge == ("inner_edge" in part.edge_attributes) + assert store_inner_node == ("inner_node" in part.node_attributes) + assert store_eids == (dgl.EID in part.edge_attributes) + + +def _verify_graphbolt_part( + g, + test_dir, + orig_nids, + orig_eids, + graph_name, + num_parts, + store_inner_node, + store_inner_edge, + store_eids, + part_config=None, + test_ntype=None, + test_etype=None, + is_homo=False, +): + """ + check list: + _verify_metadata_gb: + data type, ID's order and ID's number of edges and nodes + _verify_IDs_gb: + local id, mapping id,node type id, orig edge, hetero ntype id + verify_graph_feats_gb: + nodes and edges' feats + _verify_graphbolt_attributes: + arguments + """ + parts = [] + tot_node_feats = [] + tot_edge_feats = [] + shuffled_labels = [] + shuffled_edata = [] + part_sizes = [] + gpbs = [] + if part_config is None: + part_config = os.path.join(test_dir, f"{graph_name}.json") + # test each part + for part_id in range(num_parts): + part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( + part_config, part_id, load_feats=True, use_graphbolt=True + ) + # verify metadata + _verify_metadata_gb( + gpb, + g, + num_parts, + part_id, + part_sizes, + ) + + # verify eid and nid + _verify_IDs_gb( + g, + part_g, + part_id, + gpb, + part_sizes, + orig_nids, + orig_eids, + store_inner_node, + store_inner_edge, + store_eids, + is_homo, + ) + + # collect shuffled data and parts + _collect_data_gb( + parts, + part_g, + gpbs, + gpb, + tot_node_feats, + node_feats, + tot_edge_feats, + edge_feats, + shuffled_labels, + shuffled_edata, + test_ntype, + test_etype, + ) + + # verify graph feats + verify_graph_feats_gb( + g, + gpbs, + parts, + tot_node_feats, + tot_edge_feats, + orig_nids, + orig_eids, + shuffled_labels=shuffled_labels, + shuffled_edata=shuffled_edata, + test_ntype=test_ntype, + test_etype=test_etype, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, + store_eids=store_eids, + is_homo=is_homo, + ) + + _verify_graphbolt_attributes( + parts, store_inner_node, store_inner_edge, store_eids + ) + + return parts + + +def _verify_hetero_graph_node_edge_num( + g, + parts, + store_inner_edge, + debug_mode, +): + """ + check list: + make sure edge type are correct. + make sure the number of nodes in each node type are correct. + make sure the number of nodes in each node type are correct. + """ + num_nodes = {ntype: 0 for ntype in g.ntypes} + num_edges = {etype: 0 for etype in g.canonical_etypes} + for part in parts: + edata = ( + part.edge_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.edata + ) + if dgl.ETYPE in edata: + assert len(g.canonical_etypes) == len(F.unique(edata[dgl.ETYPE])) + if debug_mode or isinstance(part, dgl.DGLGraph): + for ntype in g.ntypes: + ntype_id = g.get_ntype_id(ntype) + inner_node_mask = _get_inner_node_mask(part, ntype_id) + num_inner_nodes = F.sum(F.astype(inner_node_mask, F.int64), 0) + num_nodes[ntype] += num_inner_nodes + if store_inner_edge or isinstance(part, dgl.DGLGraph): + for etype in g.canonical_etypes: + etype_id = g.get_etype_id(etype) + inner_edge_mask = _get_inner_edge_mask(part, etype_id) + num_inner_edges = F.sum(F.astype(inner_edge_mask, F.int64), 0) + num_edges[etype] += num_inner_edges + + # Verify the number of nodes are correct. 
+ if debug_mode or isinstance(part, dgl.DGLGraph): + for ntype in g.ntypes: + print( + "node {}: {}, {}".format( + ntype, g.num_nodes(ntype), num_nodes[ntype] + ) + ) + assert g.num_nodes(ntype) == num_nodes[ntype] + # Verify the number of edges are correct. + if store_inner_edge or isinstance(part, dgl.DGLGraph): + for etype in g.canonical_etypes: + print( + "edge {}: {}, {}".format( + etype, g.num_edges(etype), num_edges[etype] + ) + ) + assert g.num_edges(etype) == num_edges[etype] + + +def _verify_edge_id_range_hetero( + g, + part, + eids, +): + """ + check list: + make sure inner_eids fall into a range. + make sure all edges are included. + """ + edata = ( + part.edge_attributes + if isinstance(part, gb.FusedCSCSamplingGraph) + else part.edata + ) + etype = ( + part.type_per_edge + if isinstance(part, gb.FusedCSCSamplingGraph) + else edata[dgl.ETYPE] + ) + eid = torch.arange(len(edata[dgl.EID])) + etype_arr = F.gather_row(etype, eid) + eid_arr = F.gather_row(edata[dgl.EID], eid) + for etype in g.canonical_etypes: + etype_id = g.get_etype_id(etype) + eids[etype].append(F.boolean_mask(eid_arr, etype_arr == etype_id)) + # Make sure edge Ids fall into a range. + inner_edge_mask = _get_inner_edge_mask(part, etype_id) + inner_eids = np.sort( + F.asnumpy(F.boolean_mask(edata[dgl.EID], inner_edge_mask)) + ) + assert np.all( + inner_eids == np.arange(inner_eids[0], inner_eids[-1] + 1) + ) + return eids + + +def _verify_node_id_range_hetero(g, part, nids): + """ + check list: + make sure inner nodes have Ids fall into a range. + """ + for ntype in g.ntypes: + ntype_id = g.get_ntype_id(ntype) + # Make sure inner nodes have Ids fall into a range. + inner_node_mask = _get_inner_node_mask(part, ntype_id) + inner_nids = F.boolean_mask( + part.node_attributes[dgl.NID], inner_node_mask + ) + assert np.all( + F.asnumpy( + inner_nids + == F.arange( + F.as_scalar(inner_nids[0]), + F.as_scalar(inner_nids[-1]) + 1, + ) + ) + ) + nids[ntype].append(inner_nids) + return nids + + +def _verify_graph_attributes_hetero( + g, + parts, + store_inner_edge, + store_inner_node, +): + """ + check list: + make sure edge ids fall into a range. + make sure inner nodes have Ids fall into a range. + make sure all nodes is included. + make sure all edges is included. + """ + nids = {ntype: [] for ntype in g.ntypes} + eids = {etype: [] for etype in g.canonical_etypes} + # check edge id. + if store_inner_edge or isinstance(parts[0], dgl.DGLGraph): + for part in parts: + # collect eids + eids = _verify_edge_id_range_hetero(g, part, eids) + for etype in eids: + eids_type = F.cat(eids[etype], 0) + uniq_ids = F.unique(eids_type) + # We should get all nodes. + assert len(uniq_ids) == g.num_edges(etype) + + # check node id. + if store_inner_node or isinstance(parts[0], dgl.DGLGraph): + for part in parts: + nids = _verify_node_id_range_hetero(g, part, nids) + for ntype in nids: + nids_type = F.cat(nids[ntype], 0) + uniq_ids = F.unique(nids_type) + # We should get all nodes. 
+ assert len(uniq_ids) == g.num_nodes(ntype) + + +def _verify_hetero_graph( + g, + parts, + store_eids=False, + store_inner_edge=False, + store_inner_node=False, + debug_mode=False, +): + _verify_hetero_graph_node_edge_num( + g, + parts, + store_inner_edge=store_inner_edge, + debug_mode=debug_mode, + ) + if store_eids: + _verify_graph_attributes_hetero( + g, + parts, + store_inner_edge=store_inner_edge, + store_inner_node=store_inner_node, + ) + + +def _test_pipeline_graphbolt( + num_chunks, + num_parts, + world_size, + graph_formats=None, + data_fmt="numpy", + num_chunks_nodes=None, + num_chunks_edges=None, + num_chunks_node_data=None, + num_chunks_edge_data=None, + use_verify_partitions=False, + store_eids=True, + store_inner_edge=True, + store_inner_node=True, +): + if num_parts % world_size != 0: + # num_parts should be a multiple of world_size + return + + with tempfile.TemporaryDirectory() as root_dir: + g = create_chunked_dataset( + root_dir, + num_chunks, + data_fmt=data_fmt, + num_chunks_nodes=num_chunks_nodes, + num_chunks_edges=num_chunks_edges, + num_chunks_node_data=num_chunks_node_data, + num_chunks_edge_data=num_chunks_edge_data, + ) + graph_name = "test" + test_ntype = "paper" + test_etype = ("paper", "cites", "paper") + + # Step1: graph partition + in_dir = os.path.join(root_dir, "chunked-data") + output_dir = os.path.join(root_dir, "parted_data") + os.system( + "python3 tools/partition_algo/random_partition.py " + "--in_dir {} --out_dir {} --num_partitions {}".format( + in_dir, output_dir, num_parts + ) + ) + for ntype in ["author", "institution", "paper"]: + fname = os.path.join(output_dir, "{}.txt".format(ntype)) + with open(fname, "r") as f: + header = f.readline().rstrip() + assert isinstance(int(header), int) + + # Step2: data dispatch + partition_dir = os.path.join(root_dir, "parted_data") + out_dir = os.path.join(root_dir, "partitioned") + ip_config = os.path.join(root_dir, "ip_config.txt") + with open(ip_config, "w") as f: + for i in range(world_size): + f.write(f"127.0.0.{i + 1}\n") + + cmd = "python3 tools/dispatch_data.py " + cmd += f" --in-dir {in_dir} " + cmd += f" --partitions-dir {partition_dir} " + cmd += f" --out-dir {out_dir} " + cmd += f" --ip-config {ip_config} " + cmd += " --ssh-port 22 " + cmd += " --process-group-timeout 60 " + cmd += " --save-orig-nids " + cmd += " --save-orig-eids " + cmd += " --use-graphbolt " + cmd += f" --graph-formats {graph_formats} " if graph_formats else "" + + if store_eids: + cmd += " --store-eids " + if store_inner_edge: + cmd += " --store-inner-edge " + if store_inner_node: + cmd += " --store-inner-node " + os.system(cmd) + + # check if verify_partitions.py is used for validation. + if use_verify_partitions: + cmd = "python3 tools/verify_partitions.py " + cmd += f" --orig-dataset-dir {in_dir}" + cmd += f" --part-graph {out_dir}" + cmd += f" --partitions-dir {output_dir}" + os.system(cmd) + return + + # read original node/edge IDs + def read_orig_ids(fname): + orig_ids = {} + for i in range(num_parts): + ids_path = os.path.join(out_dir, f"part{i}", fname) + part_ids = load_tensors(ids_path) + for type, data in part_ids.items(): + if type not in orig_ids: + orig_ids[type] = data + else: + orig_ids[type] = torch.cat((orig_ids[type], data)) + return orig_ids + + orig_nids, orig_eids = None, None + orig_nids = read_orig_ids("orig_nids.dgl") + + orig_eids_str = read_orig_ids("orig_eids.dgl") + + orig_eids = {} + # transmit etype from string to tuple. 
+ for etype, eids in orig_eids_str.items(): + orig_eids[_etype_str_to_tuple(etype)] = eids + + # load partitions and verify + part_config = os.path.join(out_dir, "metadata.json") + parts = _verify_graphbolt_part( + g, + root_dir, + orig_nids, + orig_eids, + graph_name, + num_parts, + store_inner_node, + store_inner_edge, + store_eids, + test_ntype=test_ntype, + test_etype=test_etype, + part_config=part_config, + is_homo=False, + ) + _verify_hetero_graph( + g, + parts, + store_eids=store_eids, + store_inner_edge=store_inner_edge, + ) + + +@pytest.mark.parametrize( + "num_chunks, num_parts, world_size", + [[4, 4, 4], [8, 4, 2], [8, 4, 4], [9, 6, 3], [11, 11, 1], [11, 4, 1]], +) +def test_pipeline_basics(num_chunks, num_parts, world_size): + _test_pipeline_graphbolt( + num_chunks, + num_parts, + world_size, + ) + _test_pipeline_graphbolt( + num_chunks, num_parts, world_size, use_verify_partitions=False + ) + + +@pytest.mark.parametrize("store_inner_node", [True, False]) +@pytest.mark.parametrize("store_inner_edge", [True, False]) +@pytest.mark.parametrize("store_eids", [True, False]) +def test_pipeline_attributes(store_inner_node, store_inner_edge, store_eids): + _test_pipeline_graphbolt( + 4, + 4, + 4, + store_inner_node=store_inner_node, + store_inner_edge=store_inner_edge, + store_eids=store_eids, + ) + + +@pytest.mark.parametrize( + "num_chunks, " + "num_parts, " + "world_size, " + "num_chunks_node_data, " + "num_chunks_edge_data", + [ + # Test cases where no. of chunks more than + # no. of partitions + [8, 4, 4, 8, 8], + [8, 4, 2, 8, 8], + [9, 7, 5, 9, 9], + [8, 8, 4, 8, 8], + # Test cases where no. of chunks smaller + # than no. of partitions + [7, 8, 4, 7, 7], + [1, 8, 4, 1, 1], + [1, 4, 4, 1, 1], + [3, 4, 4, 3, 3], + [1, 4, 2, 1, 1], + [3, 4, 2, 3, 3], + [1, 5, 3, 1, 1], + ], +) +def test_pipeline_arbitrary_chunks( + num_chunks, + num_parts, + world_size, + num_chunks_node_data, + num_chunks_edge_data, +): + + _test_pipeline_graphbolt( + num_chunks, + num_parts, + world_size, + num_chunks_node_data=num_chunks_node_data, + num_chunks_edge_data=num_chunks_edge_data, + ) + + +@pytest.mark.parametrize("data_fmt", ["numpy", "parquet"]) +def test_pipeline_feature_format(data_fmt): + _test_pipeline_graphbolt(4, 4, 4, data_fmt=data_fmt) diff --git a/tools/dispatch_data.py b/tools/dispatch_data.py index 3cf1d0fbf224..b2b54e51a6ec 100644 --- a/tools/dispatch_data.py +++ b/tools/dispatch_data.py @@ -75,6 +75,10 @@ def submit_jobs(args) -> str: argslist += "--log-level {} ".format(args.log_level) argslist += "--save-orig-nids " if args.save_orig_nids else "" argslist += "--save-orig-eids " if args.save_orig_eids else "" + argslist += "--use-graphbolt " if args.use_graphbolt else "" + argslist += "--store-eids " if args.store_eids else "" + argslist += "--store-inner-node " if args.store_inner_node else "" + argslist += "--store-inner-edge " if args.store_inner_edge else "" argslist += ( f"--graph-formats {args.graph_formats} " if args.graph_formats else "" ) @@ -159,6 +163,30 @@ def main(): action="store_true", help="Save original edge IDs into files", ) + parser.add_argument( + "--use-graphbolt", + action="store_true", + help="Use GraphBolt for distributed partition.", + ) + parser.add_argument( + "--store-inner-node", + action="store_true", + default=False, + help="Store inner nodes.", + ) + + parser.add_argument( + "--store-inner-edge", + action="store_true", + default=False, + help="Store inner edges.", + ) + parser.add_argument( + "--store-eids", + action="store_true", + default=False, + 
help="Store edge IDs.", + ) parser.add_argument( "--graph-formats", type=str, diff --git a/tools/distpartitioning/convert_partition.py b/tools/distpartitioning/convert_partition.py index a169589a3f97..5013b6d40f20 100644 --- a/tools/distpartitioning/convert_partition.py +++ b/tools/distpartitioning/convert_partition.py @@ -1,24 +1,25 @@ -import argparse +import copy import gc -import json import logging import os -import time import constants - import dgl +import dgl.backend as F +import dgl.graphbolt as gb import numpy as np -import pandas as pd -import pyarrow import torch as th +from dgl import EID, ETYPE, NID, NTYPE + +from dgl.distributed.constants import DGL2GB_EID, GB_DST_ID from dgl.distributed.partition import ( + _cast_to_minimum_dtype, _etype_str_to_tuple, _etype_tuple_to_str, + cast_various_to_minimum_dtype_gb, RESERVED_FIELD_DTYPE, ) -from pyarrow import csv -from utils import get_idranges, memory_snapshot, read_json +from utils import get_idranges, memory_snapshot def _get_unique_invidx(srcids, dstids, nids, low_mem=True): @@ -164,7 +165,202 @@ def _get_unique_invidx(srcids, dstids, nids, low_mem=True): return uniques, idxes, srcids, dstids -def create_dgl_object( +# Utility functions. +def _is_homogeneous(ntypes, etypes): + """Checks if the provided ntypes and etypes form a homogeneous graph.""" + return len(ntypes) == 1 and len(etypes) == 1 + + +def _coo2csc(src_ids, dst_ids): + src_ids, dst_ids = th.tensor(src_ids, dtype=th.int64), th.tensor( + dst_ids, dtype=th.int64 + ) + num_nodes = th.max(th.stack([src_ids, dst_ids], dim=0)).item() + 1 + dst, idx = dst_ids.sort() + indptr = th.searchsorted(dst, th.arange(num_nodes + 1)) + indices = src_ids[idx] + return indptr, indices, idx + + +def _create_edge_data(edgeid_offset, etype_ids, num_edges): + eid = th.arange( + edgeid_offset, + edgeid_offset + num_edges, + dtype=RESERVED_FIELD_DTYPE[dgl.EID], + ) + etype = th.as_tensor(etype_ids, dtype=RESERVED_FIELD_DTYPE[dgl.ETYPE]) + inner_edge = th.ones(num_edges, dtype=RESERVED_FIELD_DTYPE["inner_edge"]) + return eid, etype, inner_edge + + +def _create_node_data(ntype, uniq_ids, reshuffle_nodes, inner_nodes): + node_type = th.as_tensor(ntype, dtype=RESERVED_FIELD_DTYPE[dgl.NTYPE]) + node_id = th.as_tensor(uniq_ids[reshuffle_nodes]) + inner_node = th.as_tensor( + inner_nodes[reshuffle_nodes], + dtype=RESERVED_FIELD_DTYPE["inner_node"], + ) + return node_type, node_id, inner_node + + +def _compute_node_ntype( + global_src_id, global_dst_id, global_homo_nid, idx, reshuffle_nodes, id_map +): + global_ids = np.concatenate([global_src_id, global_dst_id, global_homo_nid]) + part_global_ids = global_ids[idx] + part_global_ids = part_global_ids[reshuffle_nodes] + ntype, per_type_ids = id_map(part_global_ids) + return ntype, per_type_ids + + +def _graph_orig_ids( + return_orig_nids, + return_orig_eids, + ntypes_map, + etypes_map, + node_attr, + edge_attr, + per_type_ids, + type_per_edge, + global_edge_id, +): + orig_nids = None + orig_eids = None + if return_orig_nids: + orig_nids = {} + for ntype, ntype_id in ntypes_map.items(): + mask = th.logical_and( + node_attr[dgl.NTYPE] == ntype_id, + node_attr["inner_node"], + ) + orig_nids[ntype] = th.as_tensor(per_type_ids[mask]) + if return_orig_eids: + orig_eids = {} + for etype, etype_id in etypes_map.items(): + mask = th.logical_and( + type_per_edge == etype_id, + edge_attr["inner_edge"], + ) + orig_eids[_etype_tuple_to_str(etype)] = th.as_tensor( + global_edge_id[mask] + ) + return orig_nids, orig_eids + + +def _create_edge_attr_gb( + 
part_local_dst_id, edgeid_offset, etype_ids, ntypes, etypes, etypes_map +): + edge_attr = {} + # create edge data in graph. + num_edges = len(part_local_dst_id) + ( + edge_attr[dgl.EID], + type_per_edge, + edge_attr["inner_edge"], + ) = _create_edge_data(edgeid_offset, etype_ids, num_edges) + assert "inner_edge" in edge_attr + + is_homo = _is_homogeneous(ntypes, etypes) + + edge_type_to_id = ( + {gb.etype_tuple_to_str(("_N", "_E", "_N")): 0} + if is_homo + else { + gb.etype_tuple_to_str(etype): etid + for etype, etid in etypes_map.items() + } + ) + return edge_attr, type_per_edge, edge_type_to_id + + +def _create_node_attr( + idx, + global_src_id, + global_dst_id, + global_homo_nid, + uniq_ids, + reshuffle_nodes, + id_map, + inner_nodes, +): + # compute per_type_ids and ntype for all the nodes in the graph. + ntype, per_type_ids = _compute_node_ntype( + global_src_id, + global_dst_id, + global_homo_nid, + idx, + reshuffle_nodes, + id_map, + ) + + # create node data in graph. + node_attr = {} + ( + node_attr[dgl.NTYPE], + node_attr[dgl.NID], + node_attr["inner_node"], + ) = _create_node_data(ntype, uniq_ids, reshuffle_nodes, inner_nodes) + return node_attr, per_type_ids + + +def remove_attr_gb( + edge_attr, node_attr, store_inner_node, store_inner_edge, store_eids +): + edata, ndata = copy.deepcopy(edge_attr), copy.deepcopy(node_attr) + if not store_inner_edge: + assert "inner_edge" in edata + edata.pop("inner_edge") + + if not store_eids: + assert dgl.EID in edata + edata.pop(dgl.EID) + + if not store_inner_node: + assert "inner_node" in ndata + ndata.pop("inner_node") + return edata, ndata + + +def _process_partition_gb( + node_attr, + edge_attr, + type_per_edge, + src_ids, + dst_ids, + sort_etypes, +): + """Preprocess partitions before saving: + 1. format data types. + 2. sort csc/csr by tag. + """ + for k, dtype in RESERVED_FIELD_DTYPE.items(): + if k in node_attr: + node_attr[k] = F.astype(node_attr[k], dtype) + if k in edge_attr: + edge_attr[k] = F.astype(edge_attr[k], dtype) + + indptr, indices, edge_ids = _coo2csc(src_ids, dst_ids) + if sort_etypes: + split_size = th.diff(indptr) + split_indices = th.split(type_per_edge, tuple(split_size), dim=0) + sorted_idxs = [] + for split_indice in split_indices: + sorted_idxs.append(split_indice.sort()[1]) + + sorted_idx = th.cat(sorted_idxs, dim=0) + sorted_idx = ( + th.repeat_interleave(indptr[:-1], split_size, dim=0) + sorted_idx + ) + + return indptr, indices[sorted_idx], edge_ids[sorted_idx] + + +def create_graph_object( + tot_node_count, + tot_edge_count, + node_count, + edge_count, + num_parts, schema, part_id, node_data, @@ -174,6 +370,8 @@ def create_dgl_object( edge_typecounts, return_orig_nids=False, return_orig_eids=False, + use_graphbolt=False, + **kwargs, ): """ This function creates dgl objects for a given graph partition, as in function @@ -223,6 +421,18 @@ def create_dgl_object( Parameters: ----------- + tot_node_count : int + the number of all nodes + tot_edge_count : int + the number of all edges + node_count : int + the number of nodes in partition + edge_count : int + the number of edges in partition + graph_formats : str + the format of graph + num_parts : int + the number of parts schame : json object json object created by reading the graph metadata json file part_id : int @@ -449,58 +659,134 @@ def create_dgl_object( nid_map[part_local_dst_id], ) - # create the graph here now. 
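For readers unfamiliar with the layout produced above, here is a small, self-contained sketch (toy tensors, not part of the patch) of what `_coo2csc` and the `sort_etypes` branch of `_process_partition_gb` compute: edges are grouped by destination with a sort, `indptr` comes from `searchsorted`, and the edges of each destination are then reordered by edge type.

```python
import torch as th

# Toy COO graph: 4 nodes, 5 edges (illustrative values only).
src_ids = th.tensor([1, 0, 3, 2, 0])
dst_ids = th.tensor([0, 2, 2, 1, 1])
etype_ids = th.tensor([1, 0, 1, 0, 0])  # edge type of each COO edge

# COO -> CSC, mirroring _coo2csc: sort by destination, then build indptr.
num_nodes = int(th.max(th.stack([src_ids, dst_ids])).item()) + 1
dst_sorted, perm = dst_ids.sort()
indptr = th.searchsorted(dst_sorted, th.arange(num_nodes + 1))
indices = src_ids[perm]              # neighbors grouped by destination
type_per_edge = etype_ids[perm]

# Per-destination sort by edge type, mirroring the sort_etypes branch.
split_size = th.diff(indptr)
sorted_idx = th.cat(
    [chunk.sort()[1] for chunk in th.split(type_per_edge, split_size.tolist())]
)
sorted_idx = th.repeat_interleave(indptr[:-1], split_size) + sorted_idx

print(indptr.tolist())                     # [0, 1, 3, 5, 5]
print(indices[sorted_idx].tolist())        # per-node neighbors, etype-sorted
print(type_per_edge[sorted_idx].tolist())  # non-decreasing within each node
```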
- part_graph = dgl.graph( - data=(part_local_src_id, part_local_dst_id), num_nodes=len(uniq_ids) - ) - part_graph.edata[dgl.EID] = th.arange( - edgeid_offset, - edgeid_offset + part_graph.num_edges(), - dtype=th.int64, - ) - part_graph.edata[dgl.ETYPE] = th.as_tensor( - etype_ids, dtype=RESERVED_FIELD_DTYPE[dgl.ETYPE] - ) - part_graph.edata["inner_edge"] = th.ones( - part_graph.num_edges(), dtype=RESERVED_FIELD_DTYPE["inner_edge"] - ) + """ + Creating attributes for graphbolt and DGLGraph is as follows. - # compute per_type_ids and ntype for all the nodes in the graph. - global_ids = np.concatenate([global_src_id, global_dst_id, global_homo_nid]) - part_global_ids = global_ids[idx] - part_global_ids = part_global_ids[reshuffle_nodes] - ntype, per_type_ids = id_map(part_global_ids) + node attributes: + this part is implemented in _create_node_attr. + compute the ntype and per type ids for each node with global node type id. + create ntype, nid and inner node with orig ntype and inner nodes + this part is shared by graphbolt and DGLGraph. - # continue with the graph creation - part_graph.ndata[dgl.NTYPE] = th.as_tensor( - ntype, dtype=RESERVED_FIELD_DTYPE[dgl.NTYPE] - ) - part_graph.ndata[dgl.NID] = th.as_tensor(uniq_ids[reshuffle_nodes]) - part_graph.ndata["inner_node"] = th.as_tensor( - inner_nodes[reshuffle_nodes], dtype=RESERVED_FIELD_DTYPE["inner_node"] + the attributes created for graphbolt are as follows: + + edge attributes: + this part is implemented in _create_edge_attr_gb. + create eid, type per edge and inner edge with edgeid_offset. + create edge_type_to_id with etypes_map. + + The process to remove extra attribute is implemented in remove_attr_gb. + the unused attributes like inner_node, inner_edge, eids will be removed following the arguments in kwargs. + edge_attr, node_attr are the variable that have removed extra attributes to construct csc_graph. + edata, ndata are the variable that reserve extra attributes to be used to generate orig_nid and orig_eid. + + the src_ids and dst_ids will be transformed into indptr and indices in _coo2csc. + + all variable mentioned above will be casted to minimum data type in cast_various_to_minimum_dtype_gb. + + orig_nids and orig_eids will be generated in _graph_orig_ids with ndata and edata. + """ + # create the graph here now. 
+ ndata, per_type_ids = _create_node_attr( + idx, + global_src_id, + global_dst_id, + global_homo_nid, + uniq_ids, + reshuffle_nodes, + id_map, + inner_nodes, ) + if use_graphbolt: + edata, type_per_edge, edge_type_to_id = _create_edge_attr_gb( + part_local_dst_id, + edgeid_offset, + etype_ids, + ntypes, + etypes, + etypes_map, + ) - orig_nids = None - orig_eids = None - if return_orig_nids: - orig_nids = {} - for ntype, ntype_id in ntypes_map.items(): - mask = th.logical_and( - part_graph.ndata[dgl.NTYPE] == ntype_id, - part_graph.ndata["inner_node"], - ) - orig_nids[ntype] = th.as_tensor(per_type_ids[mask]) - if return_orig_eids: - orig_eids = {} - for etype, etype_id in etypes_map.items(): - mask = th.logical_and( - part_graph.edata[dgl.ETYPE] == etype_id, - part_graph.edata["inner_edge"], - ) - orig_eids[_etype_tuple_to_str(etype)] = th.as_tensor( - global_edge_id[mask] - ) + assert edata is not None + assert ndata is not None + + sort_etypes = len(etypes_map) > 1 + indptr, indices, csc_edge_ids = _process_partition_gb( + ndata, + edata, + type_per_edge, + part_local_src_id, + part_local_dst_id, + sort_etypes, + ) + edge_attr, node_attr = remove_attr_gb( + edge_attr=edata, node_attr=ndata, **kwargs + ) + edge_attr = { + attr: edge_attr[attr][csc_edge_ids] for attr in edge_attr.keys() + } + cast_various_to_minimum_dtype_gb( + node_count=node_count, + edge_count=edge_count, + tot_node_count=tot_node_count, + tot_edge_count=tot_edge_count, + num_parts=num_parts, + indptr=indptr, + indices=indices, + type_per_edge=type_per_edge, + etypes=etypes, + ntypes=ntypes, + node_attributes=node_attr, + edge_attributes=edge_attr, + ) + part_graph = gb.fused_csc_sampling_graph( + csc_indptr=indptr, + indices=indices, + node_type_offset=None, + type_per_edge=type_per_edge[csc_edge_ids], + node_attributes=node_attr, + edge_attributes=edge_attr, + node_type_to_id=ntypes_map, + edge_type_to_id=edge_type_to_id, + ) + else: + num_edges = len(part_local_dst_id) + part_graph = dgl.graph( + data=(part_local_src_id, part_local_dst_id), num_nodes=len(uniq_ids) + ) + # create edge data in graph. + ( + part_graph.edata[dgl.EID], + part_graph.edata[dgl.ETYPE], + part_graph.edata["inner_edge"], + ) = _create_edge_data(edgeid_offset, etype_ids, num_edges) + ndata, per_type_ids = _create_node_attr( + idx, + global_src_id, + global_dst_id, + global_homo_nid, + uniq_ids, + reshuffle_nodes, + id_map, + inner_nodes, + ) + for attr_name, node_attributes in ndata.items(): + part_graph.ndata[attr_name] = node_attributes + type_per_edge = part_graph.edata[dgl.ETYPE] + ndata, edata = part_graph.ndata, part_graph.edata + # get the original node ids and edge ids from original graph. 
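The `use_graphbolt` branch above boils down to handing a CSC structure plus the surviving node/edge attributes to `gb.fused_csc_sampling_graph` and later serializing the result with `torch.save`. A minimal sketch of that call on made-up homogeneous data (attribute names mirror the pipeline; the numbers are illustrative, not derived from any partition) could look like this:

```python
import torch
import dgl
import dgl.graphbolt as gb

# Toy homogeneous partition in CSC form; values are illustrative only.
indptr = torch.tensor([0, 2, 4, 5])        # 3 nodes
indices = torch.tensor([1, 2, 0, 2, 0])    # 5 edges grouped by destination
node_attributes = {
    dgl.NID: torch.tensor([10, 11, 12]),   # global node IDs in this partition
    "inner_node": torch.ones(3, dtype=torch.uint8),
}
edge_attributes = {
    dgl.EID: torch.arange(5),
    "inner_edge": torch.ones(5, dtype=torch.uint8),
}
part_graph = gb.fused_csc_sampling_graph(
    csc_indptr=indptr,
    indices=indices,
    node_attributes=node_attributes,
    edge_attributes=edge_attributes,
)
torch.save(part_graph, "fused_csc_sampling_graph.pt")
```

Attributes that the caller did not ask to keep (`inner_node`, `inner_edge`, `dgl.EID`) are stripped by `remove_attr_gb` before this point, which keeps the serialized partition smaller.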
+ orig_nids, orig_eids = _graph_orig_ids( + return_orig_nids, + return_orig_eids, + ntypes_map, + etypes_map, + ndata, + edata, + per_type_ids, + type_per_edge, + global_edge_id, + ) return ( part_graph, node_map_val, @@ -523,6 +809,7 @@ def create_metadata_json( ntypes_map, etypes_map, output_dir, + use_graphbolt, ): """ Auxiliary function to create json file for the graph partition metadata @@ -549,6 +836,8 @@ def create_metadata_json( map between edge type(string) and edge_type_id(int) output_dir : string directory where the output files are to be stored + use_graphbolt : bool + whether to use graphbolt or not Returns: -------- @@ -572,10 +861,14 @@ def create_metadata_json( part_dir = "part" + str(part_id) node_feat_file = os.path.join(part_dir, "node_feat.dgl") edge_feat_file = os.path.join(part_dir, "edge_feat.dgl") - part_graph_file = os.path.join(part_dir, "graph.dgl") + if use_graphbolt: + part_graph_file = os.path.join(part_dir, "fused_csc_sampling_graph.pt") + else: + part_graph_file = os.path.join(part_dir, "graph.dgl") + part_graph_type = "part_graph_graphbolt" if use_graphbolt else "part_graph" part_metadata["part-{}".format(part_id)] = { "node_feats": node_feat_file, "edge_feats": edge_feat_file, - "part_graph": part_graph_file, + part_graph_type: part_graph_file, } return part_metadata diff --git a/tools/distpartitioning/data_proc_pipeline.py b/tools/distpartitioning/data_proc_pipeline.py index 4c249a34b6b2..e0159f55b9a1 100644 --- a/tools/distpartitioning/data_proc_pipeline.py +++ b/tools/distpartitioning/data_proc_pipeline.py @@ -94,6 +94,30 @@ def log_params(params): action="store_true", help="Save original edge IDs into files", ) + parser.add_argument( + "--use-graphbolt", + action="store_true", + help="Use GraphBolt for distributed partition.", + ) + parser.add_argument( + "--store-inner-node", + action="store_true", + default=False, + help="Store inner nodes.", + ) + + parser.add_argument( + "--store-inner-edge", + action="store_true", + default=False, + help="Store inner edges.", + ) + parser.add_argument( + "--store-eids", + action="store_true", + default=False, + help="Store edge IDs.", + ) parser.add_argument( "--graph-formats", default=None, @@ -101,7 +125,6 @@ def log_params(params): help="Save partitions in specified formats.", ) params = parser.parse_args() - # invoke the pipeline function numeric_level = getattr(logging, params.log_level.upper(), None) logging.basicConfig( diff --git a/tools/distpartitioning/data_shuffle.py b/tools/distpartitioning/data_shuffle.py index 7cba2cbeecda..6800064a2b0b 100644 --- a/tools/distpartitioning/data_shuffle.py +++ b/tools/distpartitioning/data_shuffle.py @@ -13,7 +13,7 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp -from convert_partition import create_dgl_object, create_metadata_json +from convert_partition import create_graph_object, create_metadata_json from dataset_utils import get_dataset from dist_lookup import DistLookupService from globalids import ( @@ -1121,7 +1121,6 @@ def gen_dist_partitions(rank, world_size, params): ) id_map = dgl.distributed.id_map.IdMap(global_nid_ranges) id_lookup.set_idMap(id_map) - # read input graph files and augment these datastructures with # appropriate information (global_nid and owner process) for node and edge data ( @@ -1315,6 +1314,8 @@ def prepare_local_data(src_data, local_part_id): ) local_node_data = prepare_local_data(node_data, local_part_id) local_edge_data = prepare_local_data(edge_data, local_part_id) + tot_node_count = 
sum(schema_map["num_nodes_per_type"]) + tot_edge_count = sum(schema_map["num_edges_per_type"]) ( graph_obj, ntypes_map_val, @@ -1323,7 +1324,12 @@ def prepare_local_data(src_data, local_part_id): etypes_map, orig_nids, orig_eids, - ) = create_dgl_object( + ) = create_graph_object( + tot_node_count, + tot_edge_count, + node_count, + edge_count, + params.num_parts, schema_map, rank + local_part_id * world_size, local_node_data, @@ -1334,8 +1340,12 @@ def prepare_local_data(src_data, local_part_id): schema_map[constants.STR_NUM_NODES_PER_TYPE], ), edge_typecounts, - params.save_orig_nids, - params.save_orig_eids, + return_orig_nids=params.save_orig_nids, + return_orig_eids=params.save_orig_eids, + use_graphbolt=params.use_graphbolt, + store_inner_node=params.store_inner_node, + store_inner_edge=params.store_inner_edge, + store_eids=params.store_eids, ) sort_etypes = len(etypes_map) > 1 local_node_features = prepare_local_data( @@ -1354,8 +1364,12 @@ def prepare_local_data(src_data, local_part_id): orig_eids, graph_formats, sort_etypes, + params.use_graphbolt, ) - memory_snapshot("DiskWriteDGLObjectsComplete: ", rank) + if params.use_graphbolt: + memory_snapshot("DiskWriteGrapgboltObjectsComplete: ", rank) + else: + memory_snapshot("DiskWriteDGLObjectsComplete: ", rank) # get the meta-data json_metadata = create_metadata_json( @@ -1369,6 +1383,7 @@ def prepare_local_data(src_data, local_part_id): ntypes_map, etypes_map, params.output, + params.use_graphbolt, ) output_meta_json[ "local-part-id-" + str(local_part_id * world_size + rank) diff --git a/tools/distpartitioning/utils.py b/tools/distpartitioning/utils.py index cdb984be3796..32292a843bc5 100644 --- a/tools/distpartitioning/utils.py +++ b/tools/distpartitioning/utils.py @@ -504,6 +504,20 @@ def write_edge_features(edge_features, edge_file): dgl.data.utils.save_tensors(edge_file, edge_features) +def write_graph_graghbolt(graph_file, graph_obj): + """ + Utility function to serialize FusedCSCSamplingGraph + + Parameters: + ----------- + graph_obj : FusedCSCSamplingGraph + FusedCSCSamplingGraph, as created in convert_partition.py, which is to be serialized + graph_file : string + File name in which graph object is serialized + """ + torch.save(graph_obj, graph_file) + + def write_graph_dgl(graph_file, graph_obj, formats, sort_etypes): """ Utility function to serialize graph dgl objects @@ -519,9 +533,23 @@ def write_graph_dgl(graph_file, graph_obj, formats, sort_etypes): sort_etypes : bool Whether to sort etypes in csc/csr. """ - dgl.distributed.partition._save_graphs( - graph_file, [graph_obj], formats, sort_etypes + dgl.distributed.partition.process_partitions( + graph_obj, formats, sort_etypes ) + dgl.save_graphs(graph_file, [graph_obj], formats=formats) + + +def _write_graph( + part_dir, graph_obj, formats=None, sort_etypes=None, use_graphbolt=False +): + if use_graphbolt: + write_graph_graghbolt( + os.path.join(part_dir, "fused_csc_sampling_graph.pt"), graph_obj + ) + else: + write_graph_dgl( + os.path.join(part_dir, "graph.dgl"), graph_obj, formats, sort_etypes + ) def write_dgl_objects( @@ -534,6 +562,7 @@ def write_dgl_objects( orig_eids, formats, sort_etypes, + use_graphbolt, ): """ Wrapper function to write graph, node/edge feature, original node/edge IDs. @@ -558,13 +587,18 @@ def write_dgl_objects( Save graph in formats. sort_etypes : bool Whether to sort etypes in csc/csr. + use_graphbolt : bool + Whether to use graphbolt or not. 
""" part_dir = output_dir + "/part" + str(part_id) os.makedirs(part_dir, exist_ok=True) - write_graph_dgl( - os.path.join(part_dir, "graph.dgl"), graph_obj, formats, sort_etypes + _write_graph( + part_dir, + graph_obj, + formats=formats, + sort_etypes=sort_etypes, + use_graphbolt=use_graphbolt, ) - if node_features != None: write_node_features( node_features, os.path.join(part_dir, "node_feat.dgl") From 31ad9b5a8d88f97742d2bb4f43809483fb57a71c Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 19 Sep 2024 14:57:42 -0400 Subject: [PATCH 09/10] [GraphBolt][CUDA] Fix Cooperative Minibatching bugs. (#7804) --- .../cuda/cooperative_minibatching_utils.cu | 12 ++++++-- .../src/cuda/cooperative_minibatching_utils.h | 29 ++++++++++--------- .../cuda/extension/unique_and_compact_map.cu | 8 +++-- python/dgl/graphbolt/impl/cooperative_conv.py | 6 ++-- .../test_cooperative_minibatching_utils.py | 10 +++---- 5 files changed, 37 insertions(+), 28 deletions(-) diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu index fb9858f6d559..e192de458650 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu @@ -19,6 +19,7 @@ * implementations in CUDA. */ #include +#include #include #include @@ -62,8 +63,7 @@ RankSortImpl( auto part_ids2 = part_ids.clone(); auto part_ids2_sorted = torch::empty_like(part_ids2); auto nodes_sorted = torch::empty_like(nodes); - auto index = ops::IndptrEdgeIdsImpl( - offsets_dev, nodes.scalar_type(), torch::nullopt, nodes.numel()); + auto index = torch::arange(nodes.numel(), nodes.options()); auto index_sorted = torch::empty_like(index); return AT_DISPATCH_INDEX_TYPES( nodes.scalar_type(), "RankSortImpl", ([&] { @@ -100,8 +100,14 @@ RankSortImpl( index.data_ptr(), index_sorted.data_ptr(), nodes.numel(), num_batches, offsets_dev_ptr, offsets_dev_ptr + 1, 0, num_bits); + auto values = ops::IndptrEdgeIdsImpl( + offsets_dev, nodes.scalar_type(), torch::nullopt, nodes.numel()); + THRUST_CALL( + scatter, values.data_ptr(), + values.data_ptr() + values.numel(), + index_sorted.data_ptr(), index.data_ptr()); return std::make_tuple( - nodes_sorted, index_sorted, offsets, std::move(offsets_event)); + nodes_sorted, index, offsets, std::move(offsets_event)); })); } diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.h b/graphbolt/src/cuda/cooperative_minibatching_utils.h index 45bd203f1f71..efe2b5b28bf1 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.h +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.h @@ -42,21 +42,21 @@ torch::Tensor RankAssignment( /** * @brief Given node ids, the ranks they belong, the offsets to separate - * different node types and num_bits indicating the world size is <= 2^num_bits, - * returns node ids sorted w.r.t. the ranks that the given ids belong along with - * the original positions. + * different node types and world size, returns node ids sorted w.r.t. the ranks + * that the given ids belong along with their new positions. * * @param nodes Node id tensor to be mapped to a rank in [0, world_size). * @param part_ids Rank tensor the nodes belong to. * @param offsets_dev Offsets to separate different node types. * @param world_size World size, the total number of cooperating GPUs. 
* - * @return (sorted_nodes, original_positions, rank_offsets, rank_offsets_event), - * where the first one includes sorted nodes, the second contains original - * positions of the sorted nodes and the third contains the offsets of the - * sorted_nodes indicating sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] - * contains nodes that belongs to the `i`th rank. Before accessing rank_offsets - * on the CPU, `rank_offsets_event.synchronize()` is required. + * @return (sorted_nodes, new_positions, rank_offsets, rank_offsets_event), + * where the first one includes sorted nodes, the second contains new positions + * of the given nodes, so that sorted_nodes[new_positions] == nodes, and the + * third contains the offsets of the sorted_nodes indicating + * sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] contains nodes that + * belongs to the `i`th rank. Before accessing rank_offsets on the CPU, + * `rank_offsets_event.synchronize()` is required. */ std::tuple RankSortImpl( @@ -72,11 +72,12 @@ RankSortImpl( * @param rank Rank of the current GPU. * @param world_size World size, the total number of cooperating GPUs. * - * @return vector of (sorted_nodes, original_positions, rank_offsets), where the - * first one includes sorted nodes, the second contains original positions of - * the sorted nodes and the third contains the offsets of the sorted_nodes - * indicating sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] contains nodes - * that belongs to the `i`th rank. + * @return vector of (sorted_nodes, new_positions, rank_offsets), where the + * first one includes sorted nodes, the second contains new positions of the + * given nodes, so that sorted_nodes[new_positions] == nodes, and the third + * contains the offsets of the sorted_nodes indicating + * sorted_nodes[rank_offsets[i]: rank_offsets[i + 1]] contains nodes that + * belongs to the `i`th rank. 
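The contract documented above can be mimicked in plain PyTorch. The sketch below is CPU-only and single-node-type, with hypothetical `part_ids` standing in for the output of `RankAssignment`; it is not the CUDA implementation, but it shows the relationship that `sorted_nodes`, `new_positions` and `rank_offsets` satisfy.

```python
import torch

# Hypothetical ranks per node; the real code derives these from RankAssignment.
nodes = torch.tensor([14, 3, 27, 8, 51, 6])
part_ids = torch.tensor([2, 0, 1, 0, 2, 0])
world_size = 3

_, perm = torch.sort(part_ids, stable=True)   # group nodes by owning rank
sorted_nodes = nodes[perm]
rank_offsets = torch.zeros(world_size + 1, dtype=torch.int64)
rank_offsets[1:] = torch.cumsum(
    torch.bincount(part_ids, minlength=world_size), dim=0
)

# new_positions is the inverse permutation, so gathering undoes the sort.
new_positions = torch.empty_like(perm)
new_positions[perm] = torch.arange(perm.numel())
assert torch.equal(sorted_nodes[new_positions], nodes)
# Nodes owned by rank r: sorted_nodes[rank_offsets[r] : rank_offsets[r + 1]].
```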
*/ std::vector> RankSort( const std::vector& nodes_list, int64_t rank, diff --git a/graphbolt/src/cuda/extension/unique_and_compact_map.cu b/graphbolt/src/cuda/extension/unique_and_compact_map.cu index 3db918ee7fdb..56c4693bf5b0 100644 --- a/graphbolt/src/cuda/extension/unique_and_compact_map.cu +++ b/graphbolt/src/cuda/extension/unique_and_compact_map.cu @@ -99,8 +99,12 @@ __global__ void _MapIdsBatched( auto slot = map.find(key); auto new_id = slot->second; - if (index) new_id = index[new_id]; - mapped_ids[i] = new_id - unique_ids_offsets[batch_index]; + if (index) { + new_id = index[new_id]; + } else { + new_id -= unique_ids_offsets[batch_index]; + } + mapped_ids[i] = new_id; } i += stride; diff --git a/python/dgl/graphbolt/impl/cooperative_conv.py b/python/dgl/graphbolt/impl/cooperative_conv.py index 8040dabd286c..22c5ae316c71 100644 --- a/python/dgl/graphbolt/impl/cooperative_conv.py +++ b/python/dgl/graphbolt/impl/cooperative_conv.py @@ -78,10 +78,10 @@ def backward( torch.split(typed_grad_output, counts_sent[ntype]), ) i = out.new_empty(2, out.shape[0], dtype=torch.int64) - i[0] = torch.arange( + i[0] = seed_inverse_ids[ntype] # src + i[1] = torch.arange( out.shape[0], device=typed_grad_output.device - ) # src - i[1] = seed_inverse_ids[ntype] # dst + ) # dst coo = torch.sparse_coo_tensor( i, torch.ones( diff --git a/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py index 1de8669b0e08..f88e011f4385 100644 --- a/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py +++ b/tests/python/pytorch/graphbolt/impl/test_cooperative_minibatching_utils.py @@ -18,7 +18,7 @@ ) @pytest.mark.parametrize("dtype", [torch.int32, torch.int64]) @pytest.mark.parametrize("rank", list(range(WORLD_SIZE))) -def test_gpu_cached_feature_read_async(dtype, rank): +def test_rank_sort_and_unique_and_compact(dtype, rank): torch.manual_seed(7) nodes_list1 = [ torch.randint(0, 2111111111, [777], dtype=dtype, device=F.ctx()) @@ -32,8 +32,8 @@ def test_gpu_cached_feature_read_async(dtype, rank): for i, ((nodes1, idx1, offsets1), (nodes2, idx2, offsets2)) in enumerate( zip(res1, res2) ): - assert_equal(nodes_list1[i], nodes1[idx1.sort()[1]]) - assert_equal(nodes_list2[i], nodes2[idx2.sort()[1]]) + assert_equal(nodes_list1[i], nodes1[idx1]) + assert_equal(nodes_list2[i], nodes2[idx2]) assert_equal(offsets1, offsets2) assert offsets1.is_pinned() and offsets2.is_pinned() @@ -50,14 +50,12 @@ def test_gpu_cached_feature_read_async(dtype, rank): for (nodes1, idx1, offsets1), (nodes4, idx4, offsets4) in zip(res1, res4): off1 = offsets1.tolist() off4 = offsets4.tolist() + assert_equal(nodes1[idx1], nodes4[idx4]) for i in range(WORLD_SIZE): j = (i - rank + WORLD_SIZE) % WORLD_SIZE assert_equal( nodes1[off1[j] : off1[j + 1]], nodes4[off4[i] : off4[i + 1]] ) - assert_equal( - idx1[off1[j] : off1[j + 1]], idx4[off4[i] : off4[i + 1]] - ) unique, compacted, offsets = gb.unique_and_compact( nodes_list1[:1], rank, WORLD_SIZE From 5ae64001f778d6da6d056f3da3e6b5349282ea00 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Thu, 19 Sep 2024 17:00:27 -0400 Subject: [PATCH 10/10] [GraphBolt][CUDA] `rank_sort_async` for Cooperative Minibatching. 
(#7805) --- .../cuda/cooperative_minibatching_utils.cu | 11 ++++++ .../src/cuda/cooperative_minibatching_utils.h | 7 ++++ graphbolt/src/python_binding.cc | 8 +++++ python/dgl/graphbolt/subgraph_sampler.py | 34 +++++++++++++------ .../pytorch/graphbolt/test_dataloader.py | 4 +-- 5 files changed, 52 insertions(+), 12 deletions(-) diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.cu b/graphbolt/src/cuda/cooperative_minibatching_utils.cu index e192de458650..583e58629449 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.cu +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.cu @@ -25,6 +25,7 @@ #include #include +#include "../utils.h" #include "./common.h" #include "./cooperative_minibatching_utils.cuh" #include "./cooperative_minibatching_utils.h" @@ -144,5 +145,15 @@ std::vector> RankSort( return results; } +c10::intrusive_ptr>>> +RankSortAsync( + const std::vector& nodes_list, const int64_t rank, + const int64_t world_size) { + return async( + [=] { return RankSort(nodes_list, rank, world_size); }, + utils::is_on_gpu(nodes_list.at(0))); +} + } // namespace cuda } // namespace graphbolt diff --git a/graphbolt/src/cuda/cooperative_minibatching_utils.h b/graphbolt/src/cuda/cooperative_minibatching_utils.h index efe2b5b28bf1..c506c18d21c3 100644 --- a/graphbolt/src/cuda/cooperative_minibatching_utils.h +++ b/graphbolt/src/cuda/cooperative_minibatching_utils.h @@ -22,6 +22,7 @@ #define GRAPHBOLT_CUDA_COOPERATIVE_MINIBATCHING_UTILS_H_ #include +#include #include namespace graphbolt { @@ -83,6 +84,12 @@ std::vector> RankSort( const std::vector& nodes_list, int64_t rank, int64_t world_size); +c10::intrusive_ptr>>> +RankSortAsync( + const std::vector& nodes_list, const int64_t rank, + const int64_t world_size); + } // namespace cuda } // namespace graphbolt diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index 35ab345c56f9..ed5e6273e7ac 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -59,6 +59,13 @@ TORCH_LIBRARY(graphbolt, m) { &Future>>:: Wait); + m.class_>>>( + "RankSortFuture") + .def( + "wait", + &Future>>::Wait); m.class_>>( "GpuGraphCacheQueryFuture") .def( @@ -198,6 +205,7 @@ TORCH_LIBRARY(graphbolt, m) { #ifdef GRAPHBOLT_USE_CUDA m.def("set_max_uva_threads", &cuda::set_max_uva_threads); m.def("rank_sort", &cuda::RankSort); + m.def("rank_sort_async", &cuda::RankSortAsync); #endif #ifdef HAS_IMPL_ABSTRACT_PYSTUB m.impl_abstract_pystub("dgl.graphbolt.base", "//dgl.graphbolt.base"); diff --git a/python/dgl/graphbolt/subgraph_sampler.py b/python/dgl/graphbolt/subgraph_sampler.py index dd5093ae5f69..88fc9c124de5 100644 --- a/python/dgl/graphbolt/subgraph_sampler.py +++ b/python/dgl/graphbolt/subgraph_sampler.py @@ -140,6 +140,9 @@ def __init__( if cooperative: datapipe = datapipe.transform(self._seeds_cooperative_exchange_1) datapipe = datapipe.buffer() + datapipe = datapipe.transform( + self._seeds_cooperative_exchange_1_wait_future + ).buffer() datapipe = datapipe.transform(self._seeds_cooperative_exchange_2) datapipe = datapipe.buffer() datapipe = datapipe.transform(self._seeds_cooperative_exchange_3) @@ -193,19 +196,32 @@ def _wait_preprocess_future(minibatch, cooperative: bool): return minibatch @staticmethod - def _seeds_cooperative_exchange_1(minibatch, group=None): - rank = thd.get_rank(group) - world_size = thd.get_world_size(group) + def _seeds_cooperative_exchange_1(minibatch): + rank = thd.get_rank() + world_size = thd.get_world_size() seeds = minibatch._seed_nodes is_homogeneous = not 
isinstance(seeds, dict) if is_homogeneous: seeds = {"_N": seeds} if minibatch._seeds_offsets is None: - seeds_list = list(seeds.values()) - result = torch.ops.graphbolt.rank_sort(seeds_list, rank, world_size) assert minibatch.compacted_seeds is None + minibatch._rank_sort_future = torch.ops.graphbolt.rank_sort_async( + list(seeds.values()), rank, world_size + ) + return minibatch + + @staticmethod + def _seeds_cooperative_exchange_1_wait_future(minibatch): + world_size = thd.get_world_size() + seeds = minibatch._seed_nodes + is_homogeneous = not isinstance(seeds, dict) + if is_homogeneous: + seeds = {"_N": seeds} + num_ntypes = len(seeds.keys()) + if minibatch._seeds_offsets is None: + result = minibatch._rank_sort_future.wait() + delattr(minibatch, "_rank_sort_future") sorted_seeds, sorted_compacted, sorted_offsets = {}, {}, {} - num_ntypes = len(seeds.keys()) for i, ( seed_type, (typed_sorted_seeds, typed_index, typed_offsets), @@ -229,7 +245,6 @@ def _seeds_cooperative_exchange_1(minibatch, group=None): minibatch._counts_future = all_to_all( counts_received.split(num_ntypes), counts_sent.split(num_ntypes), - group=group, async_op=True, ) minibatch._counts_sent = counts_sent @@ -237,8 +252,8 @@ def _seeds_cooperative_exchange_1(minibatch, group=None): return minibatch @staticmethod - def _seeds_cooperative_exchange_2(minibatch, group=None): - world_size = thd.get_world_size(group) + def _seeds_cooperative_exchange_2(minibatch): + world_size = thd.get_world_size() seeds = minibatch._seed_nodes minibatch._counts_future.wait() delattr(minibatch, "_counts_future") @@ -256,7 +271,6 @@ def _seeds_cooperative_exchange_2(minibatch, group=None): all_to_all( typed_seeds_received.split(typed_counts_received), typed_seeds.split(typed_counts_sent), - group, ) seeds_received[ntype] = typed_seeds_received counts_sent[ntype] = typed_counts_sent diff --git a/tests/python/pytorch/graphbolt/test_dataloader.py b/tests/python/pytorch/graphbolt/test_dataloader.py index ee8f2b0cb9f5..5843264516fc 100644 --- a/tests/python/pytorch/graphbolt/test_dataloader.py +++ b/tests/python/pytorch/graphbolt/test_dataloader.py @@ -163,8 +163,8 @@ def test_gpu_sampling_DataLoader( if enable_feature_fetch: bufferer_cnt += 1 # feature fetch has 1. if cooperative: - # _preprocess stage and each sampling layer. - bufferer_cnt += 3 + # _preprocess stage. + bufferer_cnt += 4 datapipe_graph = traverse_dps(dataloader) bufferers = find_dps( datapipe_graph,
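Splitting `_seeds_cooperative_exchange_1` into a launch stage and a separate wait stage, with a buffer between them, is what lets the rank sort of one minibatch overlap with the preprocessing of the next. The toy stand-in below (a hypothetical `fake_rank_sort` replaces `torch.ops.graphbolt.rank_sort_async`, and a thread pool plays the role of the asynchronous CUDA work) sketches that launch/buffer/wait pattern outside of the datapipe machinery.

```python
from concurrent.futures import ThreadPoolExecutor

import torch

_pool = ThreadPoolExecutor(max_workers=1)


def fake_rank_sort(seeds, world_size):
    # Hypothetical placeholder for torch.ops.graphbolt.rank_sort_async.
    part_ids = seeds % world_size
    _, perm = torch.sort(part_ids, stable=True)
    return seeds[perm]


def stage_launch(minibatch, world_size=2):
    # Kick off the asynchronous work and attach the future to the minibatch.
    minibatch["_future"] = _pool.submit(
        fake_rank_sort, minibatch["seeds"], world_size
    )
    return minibatch


def stage_wait(minibatch):
    # Consume the future only when the result is actually needed.
    minibatch["sorted_seeds"] = minibatch["_future"].result()
    del minibatch["_future"]
    return minibatch


batches = [{"seeds": torch.randint(0, 100, (8,))} for _ in range(3)]
launched = [stage_launch(mb) for mb in batches]  # the buffer: launch everything
for mb in launched:
    stage_wait(mb)
    print(mb["sorted_seeds"])
```

This is also why the dataloader test above now counts one more bufferer when `cooperative=True`: the extra wait stage in the subgraph sampler comes with its own buffer.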